Commit 3d7244b

fmassa and NicolasHug authored

Add scripts for facilitating generating release notes (#3973)

* Add scripts for facilitating generating release notes
* remove unused lists, and added instructions, applied black
* renamed file
* Added pandas script directly

Co-authored-by: Nicolas Hug <[email protected]>

1 parent 6d459c7 commit 3d7244b

File tree: 2 files changed, +333 -0 lines changed

scripts/release_notes/classify_prs.py

Lines changed: 121 additions & 0 deletions
# Exported notebook cells ("# In[n]:"). This file is meant to be run as a
# notebook to generate the release notes semi-automatically (see the usage
# notes at the end of the script below).

# In[1]:


import pandas as pd


# In[2]:


df = pd.read_json("10.0_to_11.0-rc2.json").T
df.tail()


# In[3]:


all_labels = set(lbl for labels in df["labels"] for lbl in labels)
all_labels


# In[4]:


# Add one column per label
for label in all_labels:
    df[label] = df["labels"].apply(lambda labels_list: label in labels_list)
df.head()


# In[5]:


# Add a clean "module" column. It contains tuples since PRs can have more than one module.
# Maybe we should include "topics" in that column as well?

all_modules = {  # mapping: full name -> clean name
    label: "".join(label.split(" ")[1:]) for label in all_labels if label.startswith("module")
}

# We use an ugly loop, but whatever ¯\_(ツ)_/¯
df["module"] = [[] for _ in range(len(df))]
for i, row in df.iterrows():
    for full_name, clean_name in all_modules.items():
        if full_name in row["labels"]:
            row["module"].append(clean_name)
df["module"] = df.module.apply(tuple)
df.head()


# In[6]:


mod_df = df.set_index("module").sort_index()
mod_df.tail()


# In[7]:


# All improvement PRs
mod_df[mod_df["enhancement"]].head()


# In[8]:


# Improvement PRs for a given module.
# Note: don't filter module names on the index, as the index contains tuples with non-exclusive values.
# Use the boolean column instead.
mod_df[mod_df["enhancement"] & mod_df["module: transforms"]]


# In[9]:


# Formats each PR as "[module1, module2] <PR title>"
def format_prs(mod_df):
    out = []
    for idx, row in mod_df.iterrows():
        modules = idx
        # Put "documentation" and "tests" last so the sorting is decent
        for last_module in ("documentation", "tests"):
            if last_module in modules:
                modules = [m for m in modules if m != last_module] + [last_module]

        module = f"[{', '.join(modules)}]"
        module = module.replace("referencescripts", "reference scripts")
        module = module.replace("code", "reference scripts")
        out.append(f"{module} {row['title']}")

    return "\n".join(out)


# In[10]:


included_prs = pd.DataFrame()

# If labels are accurate, this should generate most of the release notes already.
# We keep track of the included PRs to figure out which ones are missing.
for section_title, module_idx in (
    ("Backward-incompatible changes", "bc-breaking"),
    ("Deprecations", "deprecation"),
    ("New Features", "new feature"),
    ("Improvements", "enhancement"),
    ("Bug Fixes", "bug"),
    ("Code Quality", "code quality"),
):
    print(f"## {section_title}")
    print()
    tmp_df = mod_df[mod_df[module_idx]]
    included_prs = pd.concat([included_prs, tmp_df])
    print(format_prs(tmp_df))
    print()


# In[11]:


# Missing PRs are these ones... classify them manually
missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False)
print(format_prs(missing_prs))
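
For reference, the JSON loaded at the top of this notebook is the cache written by the retrieval script below: a dict keyed by commit hash, whose values carry the `Features` fields (`title`, `body`, `pr_number`, `files_changed`, `labels`). A minimal illustrative sketch of that shape (the hash and title reuse the sample commit from that script's comments; the body, path, and labels are made-up placeholders):

# Illustrative only: shape of the file consumed by pd.read_json(...).T above.
example_cache = {
    "b33e38ec47": {
        "title": "Allow a higher-precision step type for Vec256::arange (#34555)",
        "body": "placeholder PR description",
        "pr_number": "34555",
        "files_changed": ["path/to/changed_file.py"],  # placeholder path
        "labels": ["enhancement", "module: transforms"],  # placeholder labels
    },
}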
Lines changed: 212 additions & 0 deletions
# Fetches the commits between two revisions, resolves each one to its PR number
# and GitHub labels, and caches the results as JSON for use by classify_prs.py.

import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from os.path import expanduser

import requests


Features = namedtuple(
    "Features",
    [
        "title",
        "body",
        "pr_number",
        "files_changed",
        "labels",
    ],
)


def dict_to_features(dct):
    return Features(
        title=dct["title"],
        body=dct["body"],
        pr_number=dct["pr_number"],
        files_changed=dct["files_changed"],
        labels=dct["labels"],
    )


def features_to_dict(features):
    return dict(features._asdict())


def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()


def commit_body(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%b {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%s {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}"
    ret, out, err = run(cmd)
    return out.split("\n") if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
    regex = r"(#[0-9]+)"
    matches = re.findall(regex, title)
    if len(matches) == 0:
        if "revert" not in title.lower() and "updating submodules" not in title.lower():
            print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR")
        return None
    if len(matches) > 1:
        print(f"[{commit_hash}: {title}] Got two PR numbers, using the last one")
        return matches[-1][1:]
    return matches[0][1:]


def get_ghstack_token():
    pattern = "github_oauth = (.*)"
    with open(expanduser("~/.ghstackrc"), "r+") as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]


token = get_ghstack_token()
headers = {"Authorization": f"token {token}"}


def run_query(query):
    request = requests.post("https://api.github.com/graphql", json={"query": query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))


def gh_labels(pr_number):
    query = f"""
    {{
        repository(owner: "pytorch", name: "vision") {{
            pullRequest(number: {pr_number}) {{
                labels(first: 10) {{
                    edges {{
                        node {{
                            name
                        }}
                    }}
                }}
            }}
        }}
    }}
    """
    query = run_query(query)
    edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"]
    return [edge["node"]["name"] for edge in edges]


def get_features(commit_hash, return_dict=False):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash),
    )
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    if pr_number is not None:
        labels = gh_labels(pr_number)
    result = Features(title, body, pr_number, files_changed, labels)
    if return_dict:
        return features_to_dict(result)
    return result


class CommitDataCache:
    def __init__(self, path="results/data.json"):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()

    def get(self, commit):
        if commit not in self.data.keys():
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path, "r") as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, "w") as f:
            json.dump(data, f)


def get_commits_between(base_version, new_version):
    cmd = f"git merge-base {base_version} {new_version}"
    rc, merge_base, _ = run(cmd)
    assert rc == 0

    # Returns a list of something like
    # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
    cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
    rc, commits, _ = run(cmd)
    assert rc == 0

    log_lines = commits.split("\n")
    hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
    return hashes, titles


def convert_to_dataframes(feature_list):
    import pandas as pd

    df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
    return df


def main(base_version, new_version):
    hashes, titles = get_commits_between(base_version, new_version)

    cdc = CommitDataCache("data.json")
    for idx, commit in enumerate(hashes):
        if idx % 10 == 0:
            print(f"{idx} / {len(hashes)}")
        cdc.get(commit)

    return cdc


if __name__ == "__main__":
    # d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
    # print(d)
    # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")

    # Usage: change the tags below according to the current release, then save the json with
    # cdc.write_to_disk().
    # Then you can use classify_prs.py (as a notebook)
    # to open the json and generate the release notes semi-automatically.
    cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
    from IPython import embed

    embed()
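
Putting the two files together, a hedged end-to-end sketch based on the `__main__` notes above (the tag names and target filename are illustrative assumptions, not prescribed by the scripts):

# Hypothetical usage, not part of the commit:
cdc = main("tags/v0.9.0", "tags/v0.10.0")  # illustrative release tags
cdc.path = "10.0_to_11.0-rc2.json"  # assumed filename so classify_prs.py picks it up
cdc.write_to_disk()
# Then open classify_prs.py as a notebook to generate the notes semi-automatically.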
