From c199db0432e2c285f97dbc63e3a45c14b5370f5b Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 5 Jun 2021 14:09:20 +0200 Subject: [PATCH 1/4] Add scripts for facilitating generating release notes --- scripts/release_notes/common.py | 238 ++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 scripts/release_notes/common.py diff --git a/scripts/release_notes/common.py b/scripts/release_notes/common.py new file mode 100644 index 00000000000..0712367856c --- /dev/null +++ b/scripts/release_notes/common.py @@ -0,0 +1,238 @@ +from collections import namedtuple +from os.path import expanduser +import locale +import subprocess +import re +import requests +import os +import json + +categories = [ + 'Uncategorized', + 'distributed', + 'mobile', + 'jit', + 'visualization', + 'onnx', + 'caffe2', + 'quantization', + 'amd', + 'benchmark', + 'profiler', + 'dispatcher', + 'releng', + 'fx', + 'code_coverage', + 'vulkan', + 'skip', + 'cpp_frontend', + 'python_frontend', + 'complex_frontend', + 'vmap_frontend', + 'autograd_frontend', + 'build_frontend', + 'memory_format_frontend', + 'foreach_frontend', +] + +topics = [ + 'bc_breaking', + 'deprecations', + 'new_features', + 'improvements', + 'bug_fixes', + 'performance', + 'docs', + 'devs', + 'Untopiced', +] + + +Features = namedtuple('Features', [ + 'title', + 'body', + 'pr_number', + 'files_changed', + 'labels', +]) + + +def dict_to_features(dct): + return Features( + title=dct['title'], + body=dct['body'], + pr_number=dct['pr_number'], + files_changed=dct['files_changed'], + labels=dct['labels']) + + +def features_to_dict(features): + return dict(features._asdict()) + + +def run(command): + """Returns (return-code, stdout, stderr)""" + p = subprocess.Popen(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, shell=True) + output, err = p.communicate() + rc = p.returncode + enc = locale.getpreferredencoding() + output = output.decode(enc) + err = err.decode(enc) + return rc, output.strip(), err.strip() + + +def commit_body(commit_hash): + cmd = f'git log -n 1 --pretty=format:%b {commit_hash}' + ret, out, err = run(cmd) + return out if ret == 0 else None + + +def commit_title(commit_hash): + cmd = f'git log -n 1 --pretty=format:%s {commit_hash}' + ret, out, err = run(cmd) + return out if ret == 0 else None + + +def commit_files_changed(commit_hash): + cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}' + ret, out, err = run(cmd) + return out.split('\n') if ret == 0 else None + + +def parse_pr_number(body, commit_hash, title): + regex = r'(#[0-9]+)' + matches = re.findall(regex, title) + if len(matches) == 0: + if 'revert' not in title.lower() and 'updating submodules' not in title.lower(): + print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR') + return None + if len(matches) > 1: + print(f'[{commit_hash}: {title}] Got two PR numbers, using the last one') + return matches[-1][1:] + return matches[0][1:] + + +def get_ghstack_token(): + pattern = 'github_oauth = (.*)' + with open(expanduser('~/.ghstackrc'), 'r+') as f: + config = f.read() + matches = re.findall(pattern, config) + if len(matches) == 0: + raise RuntimeError("Can't find a github oauth token") + return matches[0] + +token = get_ghstack_token() +headers = {"Authorization": f"token {token}"} + +def run_query(query): + request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers) + if request.status_code == 200: + return request.json() + else: + raise Exception("Query failed to run by 
returning code of {}. {}".format(request.status_code, query))
+
+
+def gh_labels(pr_number):
+    query = f"""
+    {{
+      repository(owner: "pytorch", name: "vision") {{
+        pullRequest(number: {pr_number}) {{
+          labels(first: 10) {{
+            edges {{
+              node {{
+                name
+              }}
+            }}
+          }}
+        }}
+      }}
+    }}
+    """
+    query = run_query(query)
+    edges = query['data']['repository']['pullRequest']['labels']['edges']
+    return [edge['node']['name'] for edge in edges]
+
+
+def get_features(commit_hash, return_dict=False):
+    title, body, files_changed = (
+        commit_title(commit_hash),
+        commit_body(commit_hash),
+        commit_files_changed(commit_hash))
+    pr_number = parse_pr_number(body, commit_hash, title)
+    labels = []
+    if pr_number is not None:
+        labels = gh_labels(pr_number)
+    result = Features(title, body, pr_number, files_changed, labels)
+    if return_dict:
+        return features_to_dict(result)
+    return result
+
+class CommitDataCache:
+    def __init__(self, path='results/data.json'):
+        self.path = path
+        self.data = {}
+        if os.path.exists(path):
+            self.data = self.read_from_disk()
+
+    def get(self, commit):
+        if commit not in self.data.keys():
+            # Fetch and cache the data
+            self.data[commit] = get_features(commit)
+            self.write_to_disk()
+        return self.data[commit]
+
+    def read_from_disk(self):
+        with open(self.path, 'r') as f:
+            data = json.load(f)
+            data = {commit: dict_to_features(dct)
+                    for commit, dct in data.items()}
+        return data
+
+    def write_to_disk(self):
+        data = {commit: features._asdict() for commit, features in self.data.items()}
+        with open(self.path, 'w') as f:
+            json.dump(data, f)
+
+
+def get_commits_between(base_version, new_version):
+    cmd = f'git merge-base {base_version} {new_version}'
+    rc, merge_base, _ = run(cmd)
+    assert rc == 0
+
+    # Returns a list of something like
+    # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
+    cmd = f'git log --reverse --oneline {merge_base}..{new_version}'
+    rc, commits, _ = run(cmd)
+    assert rc == 0
+
+    log_lines = commits.split('\n')
+    hashes, titles = zip(*[log_line.split(' ', 1) for log_line in log_lines])
+    return hashes, titles
+
+
+def convert_to_dataframes(feature_list):
+    import pandas as pd
+    df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
+    return df
+
+
+def main(base_version, new_version):
+    hashes, titles = get_commits_between(base_version, new_version)
+
+    cdc = CommitDataCache('data.json')
+    for idx, commit in enumerate(hashes):
+        if idx % 10 == 0:
+            print(f"{idx} / {len(hashes)}")
+        cdc.get(commit)
+
+    return cdc
+
+
+if __name__ == "__main__":
+    #d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
+    # d = get_features('f5843099d895e72c27ffa9d29cc91dd8df7f3832')
+    # print(d)
+    # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
+    cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
+    from IPython import embed; embed()

From 237c749729481822796895cd77b5221a91764029 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Thu, 14 Oct 2021 18:12:30 +0100
Subject: [PATCH 2/4] remove unused lists, add instructions, apply black

---
 scripts/release_notes/common.py | 150 +++++++++++++------------------
 1 file changed, 62 insertions(+), 88 deletions(-)

diff --git a/scripts/release_notes/common.py b/scripts/release_notes/common.py
index 0712367856c..603fc7818cb 100644
--- a/scripts/release_notes/common.py
+++ b/scripts/release_notes/common.py
@@ -1,69 +1,34 @@
-from collections import namedtuple
-from os.path import expanduser
+import json
 import locale
-import subprocess +import os import re +import subprocess +from collections import namedtuple +from os.path import expanduser + import requests -import os -import json -categories = [ - 'Uncategorized', - 'distributed', - 'mobile', - 'jit', - 'visualization', - 'onnx', - 'caffe2', - 'quantization', - 'amd', - 'benchmark', - 'profiler', - 'dispatcher', - 'releng', - 'fx', - 'code_coverage', - 'vulkan', - 'skip', - 'cpp_frontend', - 'python_frontend', - 'complex_frontend', - 'vmap_frontend', - 'autograd_frontend', - 'build_frontend', - 'memory_format_frontend', - 'foreach_frontend', -] - -topics = [ - 'bc_breaking', - 'deprecations', - 'new_features', - 'improvements', - 'bug_fixes', - 'performance', - 'docs', - 'devs', - 'Untopiced', -] - - -Features = namedtuple('Features', [ - 'title', - 'body', - 'pr_number', - 'files_changed', - 'labels', -]) + +Features = namedtuple( + "Features", + [ + "title", + "body", + "pr_number", + "files_changed", + "labels", + ], +) def dict_to_features(dct): return Features( - title=dct['title'], - body=dct['body'], - pr_number=dct['pr_number'], - files_changed=dct['files_changed'], - labels=dct['labels']) + title=dct["title"], + body=dct["body"], + pr_number=dct["pr_number"], + files_changed=dct["files_changed"], + labels=dct["labels"], + ) def features_to_dict(features): @@ -72,8 +37,7 @@ def features_to_dict(features): def run(command): """Returns (return-code, stdout, stderr)""" - p = subprocess.Popen(command, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) output, err = p.communicate() rc = p.returncode enc = locale.getpreferredencoding() @@ -83,50 +47,52 @@ def run(command): def commit_body(commit_hash): - cmd = f'git log -n 1 --pretty=format:%b {commit_hash}' + cmd = f"git log -n 1 --pretty=format:%b {commit_hash}" ret, out, err = run(cmd) return out if ret == 0 else None def commit_title(commit_hash): - cmd = f'git log -n 1 --pretty=format:%s {commit_hash}' + cmd = f"git log -n 1 --pretty=format:%s {commit_hash}" ret, out, err = run(cmd) return out if ret == 0 else None def commit_files_changed(commit_hash): - cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}' + cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}" ret, out, err = run(cmd) - return out.split('\n') if ret == 0 else None + return out.split("\n") if ret == 0 else None def parse_pr_number(body, commit_hash, title): - regex = r'(#[0-9]+)' + regex = r"(#[0-9]+)" matches = re.findall(regex, title) if len(matches) == 0: - if 'revert' not in title.lower() and 'updating submodules' not in title.lower(): - print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR') + if "revert" not in title.lower() and "updating submodules" not in title.lower(): + print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR") return None if len(matches) > 1: - print(f'[{commit_hash}: {title}] Got two PR numbers, using the last one') + print(f"[{commit_hash}: {title}] Got two PR numbers, using the last one") return matches[-1][1:] return matches[0][1:] def get_ghstack_token(): - pattern = 'github_oauth = (.*)' - with open(expanduser('~/.ghstackrc'), 'r+') as f: + pattern = "github_oauth = (.*)" + with open(expanduser("~/.ghstackrc"), "r+") as f: config = f.read() matches = re.findall(pattern, config) if len(matches) == 0: raise RuntimeError("Can't find a github oauth token") return matches[0] + token = get_ghstack_token() headers = {"Authorization": 
f"token {token}"}
 
+
 def run_query(query):
-    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
+    request = requests.post("https://api.github.com/graphql", json={"query": query}, headers=headers)
     if request.status_code == 200:
         return request.json()
     else:
@@ -150,15 +116,16 @@ def gh_labels(pr_number):
     }}
     """
     query = run_query(query)
-    edges = query['data']['repository']['pullRequest']['labels']['edges']
-    return [edge['node']['name'] for edge in edges]
+    edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"]
+    return [edge["node"]["name"] for edge in edges]
 
 
 def get_features(commit_hash, return_dict=False):
     title, body, files_changed = (
         commit_title(commit_hash),
         commit_body(commit_hash),
-        commit_files_changed(commit_hash))
+        commit_files_changed(commit_hash),
+    )
     pr_number = parse_pr_number(body, commit_hash, title)
     labels = []
     if pr_number is not None:
@@ -168,8 +135,9 @@ def get_features(commit_hash, return_dict=False):
         return features_to_dict(result)
     return result
 
+
 class CommitDataCache:
-    def __init__(self, path='results/data.json'):
+    def __init__(self, path="results/data.json"):
         self.path = path
         self.data = {}
         if os.path.exists(path):
@@ -183,44 +151,44 @@ def get(self, commit):
         return self.data[commit]
 
     def read_from_disk(self):
-        with open(self.path, 'r') as f:
+        with open(self.path, "r") as f:
             data = json.load(f)
-            data = {commit: dict_to_features(dct)
-                    for commit, dct in data.items()}
+            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
         return data
 
     def write_to_disk(self):
         data = {commit: features._asdict() for commit, features in self.data.items()}
-        with open(self.path, 'w') as f:
+        with open(self.path, "w") as f:
             json.dump(data, f)
 
 
 def get_commits_between(base_version, new_version):
-    cmd = f'git merge-base {base_version} {new_version}'
+    cmd = f"git merge-base {base_version} {new_version}"
     rc, merge_base, _ = run(cmd)
     assert rc == 0
 
     # Returns a list of something like
     # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
-    cmd = f'git log --reverse --oneline {merge_base}..{new_version}'
+    cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
     rc, commits, _ = run(cmd)
     assert rc == 0
 
-    log_lines = commits.split('\n')
-    hashes, titles = zip(*[log_line.split(' ', 1) for log_line in log_lines])
+    log_lines = commits.split("\n")
+    hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
     return hashes, titles
 
 
 def convert_to_dataframes(feature_list):
     import pandas as pd
+
     df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
     return df
 
 
 def main(base_version, new_version):
     hashes, titles = get_commits_between(base_version, new_version)
-
-    cdc = CommitDataCache('data.json')
+
+    cdc = CommitDataCache("data.json")
     for idx, commit in enumerate(hashes):
         if idx % 10 == 0:
             print(f"{idx} / {len(hashes)}")
         cdc.get(commit)
@@ -230,9 +198,15 @@ def main(base_version, new_version):
 
 
 if __name__ == "__main__":
-    #d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
-    # d = get_features('f5843099d895e72c27ffa9d29cc91dd8df7f3832')
+    # d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
     # print(d)
     # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
+
+    # Usage: change the tags below according to the current release, then save the json with
+    # cdc.write_to_disk().
+ # Then you can rely on https://nbviewer.org/gist/NicolasHug/248a72aa086fd874936f16eb56240226 + # to open the json and generate the release notes semi-automatically. cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15") - from IPython import embed; embed() + from IPython import embed + + embed() From afbeb47c44c5679561dc7b17441c1637174778af Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Oct 2021 18:13:31 +0100 Subject: [PATCH 3/4] renamed file --- scripts/release_notes/{common.py => retrieve_prs_data.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/release_notes/{common.py => retrieve_prs_data.py} (100%) diff --git a/scripts/release_notes/common.py b/scripts/release_notes/retrieve_prs_data.py similarity index 100% rename from scripts/release_notes/common.py rename to scripts/release_notes/retrieve_prs_data.py From c4f962df5621e87a970dd00870cbbf9498678d67 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Oct 2021 18:48:01 +0100 Subject: [PATCH 4/4] Added pandas script directly --- scripts/release_notes/classify_prs.py | 121 +++++++++++++++++++++ scripts/release_notes/retrieve_prs_data.py | 2 +- 2 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 scripts/release_notes/classify_prs.py diff --git a/scripts/release_notes/classify_prs.py b/scripts/release_notes/classify_prs.py new file mode 100644 index 00000000000..580e93bfe8b --- /dev/null +++ b/scripts/release_notes/classify_prs.py @@ -0,0 +1,121 @@ +# In[1]: + + +import pandas as pd + + +# In[2]: + + +df = pd.read_json("10.0_to_11.0-rc2.json").T +df.tail() + + +# In[3]: + + +all_labels = set(lbl for labels in df["labels"] for lbl in labels) +all_labels + + +# In[4]: + + +# Add one column per label +for label in all_labels: + df[label] = df["labels"].apply(lambda labels_list: label in labels_list) +df.head() + + +# In[5]: + + +# Add a clean "module" column. It contains tuples since PRs can have more than one module. +# Maybe we should include "topics" in that column as well? 
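+# For instance, a PR carrying the label "module: transforms" ends up with
+# module == ("transforms",) after the loop below.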
+
+all_modules = {  # mapping: full name -> clean name
+    label: "".join(label.split(" ")[1:]) for label in all_labels if label.startswith("module")
+}
+
+# We use an ugly loop, but whatever ¯\_(ツ)_/¯
+df["module"] = [[] for _ in range(len(df))]
+for i, row in df.iterrows():
+    for full_name, clean_name in all_modules.items():
+        if full_name in row["labels"]:
+            row["module"].append(clean_name)
+df["module"] = df.module.apply(tuple)
+df.head()
+
+
+# In[6]:
+
+
+mod_df = df.set_index("module").sort_index()
+mod_df.tail()
+
+
+# In[7]:
+
+
+# All improvement PRs
+mod_df[mod_df["enhancement"]].head()
+
+
+# In[8]:
+
+
+# Improvement PRs for a given module.
+# Note: don't filter on the module name via the index, as the index contains tuples
+# with non-exclusive values. Use the boolean column instead.
+mod_df[mod_df["enhancement"] & mod_df["module: transforms"]]
+
+
+# In[9]:
+
+
+def format_prs(mod_df):
+    out = []
+    for idx, row in mod_df.iterrows():
+        modules = idx
+        # Put "documentation" and "tests" last so the sorting looks decent
+        for last_module in ("documentation", "tests"):
+            if last_module in modules:
+                modules = [m for m in modules if m != last_module] + [last_module]
+
+        module = f"[{', '.join(modules)}]"
+        module = module.replace("referencescripts", "reference scripts")
+        module = module.replace("code", "reference scripts")
+        out.append(f"{module} {row['title']}")
+
+    return "\n".join(out)
+
+
+# In[10]:
+
+
+included_prs = pd.DataFrame()
+
+# If labels are accurate, this should generate most of the release notes already.
+# We keep track of the included PRs to figure out which ones are missing.
+for section_title, module_idx in (
+    ("Backward-incompatible changes", "bc-breaking"),
+    ("Deprecations", "deprecation"),
+    ("New Features", "new feature"),
+    ("Improvements", "enhancement"),
+    ("Bug Fixes", "bug"),
+    ("Code Quality", "code quality"),
+):
+    print(f"## {section_title}")
+    print()
+    tmp_df = mod_df[mod_df[module_idx]]
+    included_prs = pd.concat([included_prs, tmp_df])
+    print(format_prs(tmp_df))
+    print()
+
+
+# In[11]:
+
+
+# These PRs are not in any of the sections above... classify them manually
+missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False)
+print(format_prs(missing_prs))
diff --git a/scripts/release_notes/retrieve_prs_data.py b/scripts/release_notes/retrieve_prs_data.py
index 603fc7818cb..90cb4cda07e 100644
--- a/scripts/release_notes/retrieve_prs_data.py
+++ b/scripts/release_notes/retrieve_prs_data.py
@@ -204,7 +204,7 @@ def main(base_version, new_version):
 
     # Usage: change the tags below according to the current release, then save the json with
     # cdc.write_to_disk().
-    # Then you can rely on https://nbviewer.org/gist/NicolasHug/248a72aa086fd874936f16eb56240226
+    # Then you can use classify_prs.py (as a notebook)
     # to open the json and generate the release notes semi-automatically.
     cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
     from IPython import embed
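
Taken together, the two scripts form a small pipeline: retrieve_prs_data.py
collects each commit's PR metadata (title, body, pr_number, files_changed,
labels) between two refs into a json cache, and classify_prs.py turns that
cache into draft release-notes sections. A minimal sketch of the intended
end-to-end usage, assuming a github_oauth token in ~/.ghstackrc (which
get_ghstack_token requires); the second tag and the json filename below are
placeholders to adapt to the release being prepared:

    # Sketch only: run from scripts/release_notes/ in a torchvision checkout.
    from retrieve_prs_data import main

    # Fetch Features for every commit between the two refs; main() also caches
    # results incrementally in data.json via CommitDataCache.
    cdc = main("tags/v0.9.0", "tags/v0.10.0")
    cdc.write_to_disk()

    # Point classify_prs.py at the resulting json (it hard-codes the filename
    # in pd.read_json, e.g. "10.0_to_11.0-rc2.json"), then run it cell by cell
    # as a notebook to print the "## New Features", "## Bug Fixes", etc.
    # sections, plus the leftover PRs to classify manually.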