Skip to content
This repository was archived by the owner on Jan 6, 2023. It is now read-only.

Commit c35567f

Browse files
aivanou authored and facebook-github-bot committed
Resolve bug with agent_restarts where it was equal to max_restarts
Summary: Resolve bug with agent_restarts where it was equal to max_restarts. Make sure that different schedulers start retry attempts with the same initial value.
Reviewed By: kiukchung
Differential Revision: D26431772
fbshipit-source-id: 51cd4d59a680c505d2bb0863dc0510df17e8a472
1 parent cca6f0e commit c35567f

File tree

2 files changed

+4
-2
lines changed

2 files changed

+4
-2
lines changed

torchelastic/agent/server/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -745,7 +745,7 @@ def _construct_event(
745745
"rdzv_backend": spec.rdzv_handler.get_backend(),
746746
"raw_error": raw_error,
747747
"metadata": md_str,
748-
"agent_restarts": spec.max_restarts,
748+
"agent_restarts": spec.max_restarts - self._remaining_restarts,
749749
}
750750
return Event(
751751
f"torchelastic.worker.status.{state}", source=source, metadata=metadata

torchelastic/agent/server/test/api_test.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -541,8 +541,9 @@ def test_get_agent_status_event(self):
541541
self.assertEqual(spec.role, actual_event.metadata["role"])
542542

543543
def test_get_worker_status_event(self):
544-
spec = self._get_worker_spec(max_restarts=1)
544+
spec = self._get_worker_spec(max_restarts=4)
545545
agent = TestAgent(spec)
546+
agent._remaining_restarts = spec.max_restarts - 2
546547
actual_event = agent._construct_event(
547548
state=WorkerState.SUCCEEDED.value,
548549
source="WORKER",
@@ -552,3 +553,4 @@ def test_get_worker_status_event(self):
552553
self.assertEqual("etcd", actual_event.metadata["rdzv_backend"])
553554
self.assertEqual(WorkerState.SUCCEEDED.value, actual_event.metadata["state"])
554555
self.assertEqual(spec.role, actual_event.metadata["role"])
556+
self.assertEqual(2, actual_event.metadata["agent_restarts"])

0 commit comments

Comments
 (0)