Skip to content

Commit 8ec2ec8

Browse files
committed
feat(RHOAIENG-26590): Report RayJob status via SDK
Signed-off-by: Pat O'Connor <[email protected]>
1 parent 0ac44ed commit 8ec2ec8

20 files changed

+1689
-665
lines changed

poetry.lock

Lines changed: 701 additions & 661 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,14 @@ cryptography = "43.0.3"
2929
executing = "1.2.0"
3030
pydantic = "< 2"
3131
ipywidgets = "8.1.2"
32+
odh-kuberay-client = {version = "0.0.0.dev40", source = "testpypi"}
33+
34+
[[tool.poetry.source]]
35+
name = "pypi"
36+
37+
[[tool.poetry.source]]
38+
name = "testpypi"
39+
url = "https://test.pypi.org/simple/"
3240

3341
[tool.poetry.group.docs]
3442
optional = true

src/codeflare_sdk/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
AWManager,
1111
AppWrapperStatus,
1212
RayJobClient,
13+
RayJob,
1314
)
1415

1516
from .common.widgets import view_clusters

src/codeflare_sdk/ray/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@
44
RayJobClient,
55
)
66

7+
from .rayjobs import (
8+
RayJob,
9+
RayJobDeploymentStatus,
10+
CodeflareRayJobStatus,
11+
RayJobInfo,
12+
)
13+
714
from .cluster import (
815
Cluster,
916
ClusterConfiguration,

src/codeflare_sdk/ray/cluster/build_ray_cluster.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ def build_ray_cluster(cluster: "codeflare_sdk.ray.cluster.Cluster"):
136136
"enableIngress": False,
137137
"rayStartParams": {
138138
"dashboard-host": "0.0.0.0",
139+
"dashboard-port": "8265",
139140
"block": "true",
140141
"num-gpus": str(head_gpu_count),
141142
"resources": head_resources,
@@ -245,6 +246,7 @@ def get_labels(cluster: "codeflare_sdk.ray.cluster.Cluster"):
245246
"""
246247
labels = {
247248
"controller-tools.k8s.io": "1.0",
249+
"ray.io/cluster": cluster.config.name, # Enforced label always present
248250
}
249251
if cluster.config.labels != {}:
250252
labels.update(cluster.config.labels)

src/codeflare_sdk/ray/cluster/cluster.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,12 @@
2020

2121
from time import sleep
2222
from typing import List, Optional, Tuple, Dict
23+
import copy
2324

24-
from ray.job_submission import JobSubmissionClient
25+
from ray.job_submission import JobSubmissionClient, JobStatus
26+
import time
27+
import uuid
28+
import warnings
2529

2630
from ...common.kubernetes_cluster.auth import (
2731
config_check,
@@ -57,7 +61,6 @@
5761
from kubernetes.client.rest import ApiException
5862

5963
from kubernetes.client.rest import ApiException
60-
import warnings
6164

6265
CF_SDK_FIELD_MANAGER = "codeflare-sdk"
6366

@@ -760,6 +763,7 @@ def get_cluster(
760763
head_extended_resource_requests=head_extended_resources,
761764
worker_extended_resource_requests=worker_extended_resources,
762765
)
766+
763767
# Ignore the warning here for the lack of a ClusterConfiguration
764768
with warnings.catch_warnings():
765769
warnings.filterwarnings(

src/codeflare_sdk/ray/cluster/test_cluster.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -758,5 +758,11 @@ def custom_side_effect(group, version, namespace, plural, **kwargs):
758758

759759
# Make sure to always keep this function last
760760
def test_cleanup():
    """
    Delete the YAML files generated under ``aw_dir``, skipping any that are
    not present so the cleanup does not fail on a missing file.
    """
    generated_files = (
        f"{aw_dir}test-all-params.yaml",
        f"{aw_dir}aw-all-params.yaml",
    )
    for generated_file in generated_files:
        # Remove files only if they exist
        if os.path.exists(generated_file):
            os.remove(generated_file)
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from .rayjob import RayJob
2+
from .status import RayJobDeploymentStatus, CodeflareRayJobStatus, RayJobInfo
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
# Copyright 2025 IBM, Red Hat
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
This sub-module exists primarily to be used internally by the RayJob object
17+
(in the rayjob sub-module) for pretty-printing job status and details.
18+
"""
19+
20+
from rich.console import Console
21+
from rich.table import Table
22+
from rich.panel import Panel
23+
from typing import Tuple, Optional
24+
25+
from .status import RayJobDeploymentStatus, RayJobInfo
26+
27+
28+
def print_job_status(job_info: RayJobInfo):
    """
    Print a RayJob's status summary to the console inside a panel.

    Args:
        job_info: Details of the job to display. The fields read here are
            ``name``, ``job_id``, ``status``, ``cluster_name``,
            ``namespace``, ``start_time``, ``duration`` and
            ``failed_attempts``.
    """
    status_text, header_markup = _get_status_display(job_info.status)

    # Header + name/status rows come from the shared table builder
    info = _create_info_table(header_markup, job_info.name, status_text)

    # Core job details, one row each
    detail_rows = (
        f"[bold]Job ID:[/bold] {job_info.job_id}",
        f"[bold]Status:[/bold] {job_info.status.value}",
        f"[bold]RayCluster:[/bold] {job_info.cluster_name}",
        f"[bold]Namespace:[/bold] {job_info.namespace}",
    )
    for detail in detail_rows:
        info.add_row(detail)

    # Duration is only shown once a start time is set; a blank row
    # separates it from the details above
    if job_info.start_time:
        info.add_row()
        info.add_row(f"[bold]Duration:[/bold] {job_info.duration}")

    # Failed attempts are only reported when there is at least one
    if job_info.failed_attempts > 0:
        info.add_row(f"[bold]Failed Attempts:[/bold] {job_info.failed_attempts}")

    _print_table_in_panel(info)
51+
52+
53+
def print_no_job_found(job_name: str, namespace: str):
    """
    Print a "No RayJob found" panel for the given job name and namespace.

    Args:
        job_name: Name shown in the panel's name column.
        namespace: Namespace shown in the last row of the panel.
    """
    header_markup = "[white on red][bold]Name"
    not_found_markup = "[bold red]No RayJob found"
    notice = _create_info_table(header_markup, job_name, not_found_markup)

    # Each tuple is one row's cells; empty tuples are blank spacer rows
    remaining_rows = (
        (),
        ("Have you run rayjob.submit() yet?",),
        (),
        (f"[bold]Namespace:[/bold] {namespace}",),
    )
    for cells in remaining_rows:
        notice.add_row(*cells)

    _print_table_in_panel(notice)
67+
68+
69+
def _get_status_display(status: RayJobDeploymentStatus) -> Tuple[str, str]:
    """
    Look up the status label and header markup used to render a job status.

    Args:
        status: Deployment status of the RayJob.

    Returns:
        Tuple of (status_display, header_color). Any status other than
        COMPLETE, RUNNING, FAILED or SUSPENDED yields the "Unknown" label
        with the red header.
    """
    known_statuses = {
        RayJobDeploymentStatus.COMPLETE: (
            "Complete :white_heavy_check_mark:",
            "[white on green][bold]Name",
        ),
        RayJobDeploymentStatus.RUNNING: (
            "Running :gear:",
            "[white on blue][bold]Name",
        ),
        RayJobDeploymentStatus.FAILED: (
            "Failed :x:",
            "[white on red][bold]Name",
        ),
        RayJobDeploymentStatus.SUSPENDED: (
            "Suspended :pause_button:",
            "[white on yellow][bold]Name",
        ),
    }

    try:
        return known_statuses[status]
    except KeyError:
        # Unmapped status: fall back to the "Unknown" display
        return ("Unknown :question:", "[white on red][bold]Name")
92+
93+
94+
def _create_info_table(header_color: str, name: str, status_display: str) -> Table:
    """
    Build the borderless, header-less table shared by the status printers.

    Args:
        header_color: Markup string placed alone in the first row.
        name: Name shown bold and underlined in the second row.
        status_display: Status text shown next to the name.

    Returns:
        Table with header row, name/status row, and empty separator row
    """
    initial_rows = (
        (header_color,),
        ("[bold underline]" + name, status_display),
        (),  # Empty separator row
    )

    info = Table(show_header=False, box=None)
    for cells in initial_rows:
        info.add_row(*cells)
    return info
106+
107+
108+
def _print_table_in_panel(table: Table):
    """
    Wrap the given table in a fitted panel under the
    "CodeFlare RayJob Status" title and print it to the console.

    Args:
        table: The table to display inside the panel.
    """
    titled_wrapper = Table(
        title="[bold] :package: CodeFlare RayJob Status :package:",
        box=None,
    )
    boxed_table = Panel.fit(table)
    titled_wrapper.add_row(boxed_table)

    Console().print(titled_wrapper)

0 commit comments

Comments
 (0)