Skip to content
This repository was archived by the owner on Jan 6, 2023. It is now read-only.

Commit 50dc44a

Browse files
aivanou authored and facebook-github-bot committed
Add '--all' parameter to the torchelastic status
Summary: This diff introduces an AppState filter based on bitmasks and changes the 'tsm status' API to filter workers by their AppState. There are two modes: the default and '--all'. The default mode shows workers whose AppState is any value except AppState.CANCELLED. The '--all' parameter shows workers in every AppState. Example of usage: tsm status mast://default/tw_job_id Reviewed By: kiukchung Differential Revision: D26391132 fbshipit-source-id: 0ab70cf5d2ec75db0d2f9dfc6c6a3b475b5fcc4b
1 parent c35567f commit 50dc44a

File tree

2 files changed

+29
-27
lines changed

2 files changed

+29
-27
lines changed

torchelastic/tsm/driver/api.py

Lines changed: 27 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -28,24 +28,22 @@
2828
)
2929

3030
_APP_STATUS_FORMAT_TEMPLATE = """
31-
state: ${state}
32-
num_restarts: ${num_restarts}
33-
msg: ${msg}
31+
State: ${state} ; Num Restarts: ${num_restarts}
32+
Msg: ${msg}
3433
Replicas: ${replicas}
3534
"""
3635

36+
3737
_ROLE_REPLICA_FORMAT_TEMPLATE = """
3838
- Role: [${role}]:
3939
${replicas}
4040
"""
4141

42-
4342
_REPLICA_FORMAT_TEMPLATE = """
4443
- [${role}:${replica_id}]
45-
timestamp: ${timestamp}
46-
exit_code: ${exit_code}
47-
state: ${state}
48-
error_msg: ${error_msg}
44+
Timestamp: ${timestamp}; Exit Code: ${exit_code}
45+
State: ${state}
46+
Error Message: ${error_msg}
4947
"""
5048

5149

@@ -367,7 +365,7 @@ def of(self, *roles: Role) -> "Application":
367365
return self
368366

369367

370-
class AppState(str, Enum):
368+
class AppState(int, Enum):
371369
"""
372370
State of the application. An application starts from an initial
373371
``UNSUBMITTED`` state and moves through ``SUBMITTED``, ``PENDING``,
@@ -390,13 +388,16 @@ class AppState(str, Enum):
390388
7. CANCELLED - app was cancelled before completing
391389
"""
392390

393-
UNSUBMITTED = 0
394-
SUBMITTED = 1
395-
PENDING = 2
396-
RUNNING = 3
397-
SUCCEEDED = 4
398-
FAILED = 5
399-
CANCELLED = 6
391+
UNSUBMITTED = 2 ** 0
392+
SUBMITTED = 2 ** 1
393+
PENDING = 2 ** 2
394+
RUNNING = 2 ** 3
395+
SUCCEEDED = 2 ** 4
396+
FAILED = 2 ** 5
397+
CANCELLED = 2 ** 6
398+
399+
def __str__(self) -> str:
400+
return self.name
400401

401402

402403
_TERMINAL_STATES = [AppState.SUCCEEDED, AppState.FAILED, AppState.CANCELLED]
@@ -479,26 +480,28 @@ def __repr__(self):
479480
return json.dumps(app_status_dict, indent=2)
480481

481482
def _get_role_replicas(
482-
self, state_filter: Optional[AppState] = None
483+
self, state_mask_filter: int = 0xFF
483484
) -> Dict[str, List[RoleReplicaStatus]]:
484-
if not state_filter:
485-
return self.replicas
486485
filterred_replicas = {}
487486
for role, role_replicas in self.replicas.items():
488487
filterred_replicas[role] = [
489-
replica for replica in role_replicas if replica.state == state_filter
488+
replica
489+
for replica in role_replicas
490+
if replica.state.value | state_mask_filter == state_mask_filter
490491
]
491492
return filterred_replicas
492493

493-
def get_formatted_str(self, state_filter: Optional[AppState] = None) -> str:
494+
def get_formatted_str(self, state_mask_filter: int = 0xFF) -> str:
494495
"""
495496
Return a human readable representation of the AppStatus.
496497
"""
497498
role_replicas = ""
498-
filterred_replicas = self._get_role_replicas(state_filter)
499-
for role, role_replics in filterred_replicas.items():
499+
filterred_replicas = self._get_role_replicas(state_mask_filter)
500+
for role, filterred_role_replicas in filterred_replicas.items():
501+
if len(filterred_role_replicas) == 0:
502+
continue
500503
replicas_str = "".join(
501-
replica.get_formatted_str() for replica in role_replics
504+
replica.get_formatted_str() for replica in filterred_role_replicas
502505
)
503506
role_replicas += Template(_ROLE_REPLICA_FORMAT_TEMPLATE).substitute(
504507
role=role, replicas=replicas_str

torchelastic/tsm/driver/test/api_test.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,8 @@ def test_get_formatted_string(self):
9090

9191
formatted_str = status.get_formatted_str()
9292
expected_str = (
93-
"\nstate: AppState.FAILED\nnum_restarts: 0\nmsg: "
94-
"\nReplicas: \n- Role: [worker1]:\n\n- [worker1:id1]\n "
95-
"timestamp: None\n exit_code: 3\n state: AppState.FAILED\n error_msg: None\n\n\n"
93+
"\nState: FAILED ; Num Restarts: 0\nMsg: \nReplicas: \n- Role: [worker1]:\n\n- [worker1:id1]\n "
94+
"Timestamp: None; Exit Code: 3\n State: FAILED\n Error Message: None\n\n\n"
9695
)
9796
self.assertEqual(expected_str, formatted_str)
9897

0 commit comments

Comments
 (0)