Skip to content

Commit e5e0f86

Browse files
[autorevert] Add event logging for autorevert actions (#6997)
Core changes: * add ClickHouse event logging: log detection, restart outcomes, and secondary confirmations to `misc.autorevert_events` * define schema for `misc.autorevert_events` Misc: * allow `--dry-run` after the `autorevert-checker` subcommand. # Testing locally: ``` python -m pytorch_auto_revert autorevert-checker inductor --hours 164 --do-restart --do-revert --dry-run ``` patterns were detected, events were logged --------- Co-authored-by: Jean Schmidt <[email protected]>
1 parent f9f933a commit e5e0f86

File tree

4 files changed

+151
-1
lines changed

4 files changed

+151
-1
lines changed

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/__main__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ def main(*args, **kwargs) -> None:
179179
"linux-binary-manywheel",
180180
],
181181
do_restart=True,
182-
do_revert=False,
182+
do_revert=True,
183183
hours=2,
184184
verbose=True,
185185
dry_run=opts.dry_run,
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import logging
2+
from dataclasses import dataclass
3+
from typing import Optional
4+
5+
from .clickhouse_client_helper import CHCliFactory
6+
7+
8+
def log_autorevert_event(
    *,
    workflow: str,
    action: str,
    first_failing_sha: str,
    previous_sha: str,
    failure_rule: str,
    job_name_base: str,
    second_failing_sha: Optional[str] = None,
    dry_run: bool = False,
    notes: str = "",
) -> None:
    """Insert a single autorevert event row into ClickHouse.

    Uses the misc.autorevert_events table. Best-effort: logs and continues on
    failure, so telemetry problems never break the calling pipeline.

    Args:
        workflow: Name of the workflow the event refers to.
        action: Event action label (e.g. "detected", "restart_skipped").
        first_failing_sha: SHA of the first failing commit in the pattern.
        previous_sha: SHA of the previously successful commit.
        failure_rule: Name of the failure rule that matched.
        job_name_base: Base job name the failure was observed on.
        second_failing_sha: Optional SHA of a second failing commit that
            confirms the pattern; stored as NULL when absent.
        dry_run: Whether this run is a dry run (stored as 0/1 in ClickHouse).
        notes: Free-form notes stored alongside the event.
    """
    try:
        client = CHCliFactory().client
        columns = [
            "workflow",
            "action",
            "first_failing_sha",
            "previous_sha",
            "second_failing_sha",
            "failure_rule",
            "job_name_base",
            "dry_run",
            "notes",
        ]
        # Row order must match `columns` above.
        data = [
            [
                workflow,
                action,
                first_failing_sha,
                previous_sha,
                second_failing_sha,
                failure_rule,
                job_name_base,
                1 if dry_run else 0,  # schema stores dry_run as UInt8
                notes or "",
            ]
        ]
        # Specify database explicitly since this table lives in 'misc'
        client.insert(
            table="autorevert_events",
            data=data,
            column_names=columns,
            database="misc",
        )
    except Exception as e:
        # Best-effort by design: swallow the error but leave a trace.
        # Lazy %-formatting defers message construction until actually emitted.
        logging.warning("Failed to log autorevert event to ClickHouse: %s", e)
59+
60+
61+
@dataclass
class AutoRevertEvent:
    """Immutable-ish DTO capturing one detected autorevert pattern.

    Holds the identifying fields for a pattern so that multiple events
    (detection, restart outcome, secondary confirmation) can be emitted
    for it without repeating the field plumbing at each call site.
    """

    workflow: str
    first_failing_sha: str
    previous_sha: str
    second_failing_sha: Optional[str]
    failure_rule: str
    job_name_base: str
    dry_run: bool = False
    notes: str = ""

    def send(self, action: str, notes: Optional[str] = None) -> None:
        """Send this event with the given action; optional notes override."""
        effective_notes = self.notes if notes is None else notes
        log_autorevert_event(
            workflow=self.workflow,
            action=action,
            first_failing_sha=self.first_failing_sha,
            previous_sha=self.previous_sha,
            second_failing_sha=self.second_failing_sha,
            failure_rule=self.failure_rule,
            job_name_base=self.job_name_base,
            dry_run=self.dry_run,
            notes=effective_notes,
        )

    def send_restart_outcome(
        self, *, already_count: int, success_count: int, failure_count: int
    ) -> None:
        """Consolidated restart outcome sender with minimal branching."""
        # "Skipped" covers two cases: a dry run, or every target commit
        # having been restarted previously (no new dispatches, no failures).
        nothing_to_do = (
            already_count > 0 and success_count == 0 and failure_count == 0
        )
        if self.dry_run or nothing_to_do:
            skip_note = (
                "dry_run" if self.dry_run else f"already_restarted={already_count}"
            )
            self.send("restart_skipped", notes=skip_note)
        else:
            # At least one restart was attempted; classify by whether any succeeded.
            outcome = "restart_dispatched" if success_count > 0 else "restart_failed"
            summary = f"success={success_count}, failures={failure_count}, already={already_count}"
            self.send(outcome, notes=summary)

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/testers/autorevert.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from collections import defaultdict
33

44
from ..autorevert_checker import AutorevertPatternChecker
5+
from ..event_logger import AutoRevertEvent
56
from ..workflow_checker import WorkflowRestartChecker
67

78

@@ -121,6 +122,19 @@ def autorevert_checker(
121122
previous_commit = pattern[
122123
"older_commit"
123124
] # previously successful commit for the matched job
125+
# Prepare base DTO for logging events for this pattern
126+
evt = AutoRevertEvent(
127+
workflow=workflow_name,
128+
first_failing_sha=first_failing,
129+
previous_sha=previous_commit,
130+
second_failing_sha=(pattern["newer_commits"][0]),
131+
failure_rule=pattern["failure_rule"],
132+
job_name_base=pattern.get("job_name_base", ""),
133+
dry_run=dry_run,
134+
)
135+
136+
# Log the detection event
137+
evt.send("detected")
124138
revert_result = revert_checker.is_commit_reverted(first_failing)
125139

126140
if revert_result:
@@ -140,13 +154,17 @@ def autorevert_checker(
140154
# Try to restart workflow if --do-restart flag is set and not already reverted
141155
if do_restart and restart_checker:
142156
# Restart the first failing (older failing) and the previous (successful) commit
157+
already_restarted = []
158+
successes = []
159+
failures = []
143160
for target_commit in (first_failing, previous_commit):
144161
if restart_checker.has_restarted_workflow(
145162
workflow_name, target_commit
146163
):
147164
print(
148165
f" ⟳ ALREADY RESTARTED: {workflow_name} for {target_commit[:8]}"
149166
)
167+
already_restarted.append(target_commit)
150168
continue
151169
if dry_run:
152170
print(
@@ -162,15 +180,26 @@ def autorevert_checker(
162180
f" ✓ RESTARTED: {workflow_name} for {target_commit[:8]}"
163181
)
164182
restarted_commits.append((workflow_name, target_commit))
183+
successes.append(target_commit)
165184
else:
166185
print(
167186
f" ✗ FAILED TO RESTART: {workflow_name} for {target_commit[:8]}"
168187
)
188+
failures.append(target_commit)
189+
190+
# Log restart outcome once per pattern
191+
evt.send_restart_outcome(
192+
already_count=len(already_restarted),
193+
success_count=len(successes),
194+
failure_count=len(failures),
195+
)
169196

170197
# Secondary verification: compare first failing vs previous on restarted runs.
171198
if do_revert:
172199
try:
173200
if checker.confirm_commit_caused_failure_on_restarted(pattern):
201+
# Log secondary confirmation
202+
evt.send("secondary_confirmed")
174203
if dry_run:
175204
print(
176205
f" ⚠ DRY RUN: Would record REVERT for {first_failing[:8]} ({workflow_name})"
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
-- Event log for the autorevert pipeline: one row per detection, restart
-- outcome, or secondary confirmation emitted by pytorch_auto_revert.
CREATE TABLE misc.autorevert_events
(
    -- Insertion timestamp; also drives the dedup version below.
    `ts` DateTime DEFAULT now(),
    `repo` LowCardinality(String) DEFAULT 'pytorch/pytorch',
    `workflow` LowCardinality(String),
    -- Closed set of event kinds produced by the checker.
    `action` Enum8('detected' = 1, 'restart_dispatched' = 2, 'restart_skipped' = 3, 'restart_failed' = 4, 'secondary_confirmed' = 5),
    -- Full 40-char git SHAs of the commits involved in the pattern.
    `first_failing_sha` FixedString(40),
    `previous_sha` FixedString(40),
    -- Second failing commit confirming the pattern; NULL when not available.
    `second_failing_sha` Nullable(FixedString(40)) DEFAULT NULL,
    `failure_rule` LowCardinality(String),
    `job_name_base` String,
    -- 0/1 flag; part of the ORDER BY so dry runs dedup separately from real runs.
    `dry_run` UInt8 DEFAULT 0,
    `notes` String DEFAULT '',
    -- ReplacingMergeTree keeps the row with the highest version per ORDER BY key;
    -- derived from ts so the latest insert wins.
    `version` UInt64 MATERIALIZED toUInt64(toUnixTimestamp(ts))
)
ENGINE = SharedReplacingMergeTree('/clickhouse/tables/{uuid}/{shard}', '{replica}', version)
ORDER BY (repo, workflow, action, first_failing_sha, dry_run)
SETTINGS index_granularity = 8192

0 commit comments

Comments
 (0)