Commit 3d7244b

fmassa and NicolasHug authored

Add scripts for facilitating generating release notes (#3973)

* Add scripts for facilitating generating release notes
* remove unused lists, and added instructions, applied black
* renamed file
* Added pandas script directly

Co-authored-by: Nicolas Hug <[email protected]>

1 parent 6d459c7 commit 3d7244b

File tree: 2 files changed, +333 -0 lines changed

scripts/release_notes/classify_prs.py

Lines changed: 121 additions & 0 deletions
# Exported notebook cells ("# In[n]:"). This file is meant to be run as a
# notebook to generate the release notes semi-automatically (see the usage
# notes at the end of the script below).

# In[1]:


import pandas as pd


# In[2]:


df = pd.read_json("10.0_to_11.0-rc2.json").T
df.tail()


# In[3]:


all_labels = set(lbl for labels in df["labels"] for lbl in labels)
all_labels


# In[4]:


# Add one column per label
for label in all_labels:
    df[label] = df["labels"].apply(lambda labels_list: label in labels_list)
df.head()


# In[5]:


# Add a clean "module" column. It contains tuples since PRs can have more than one module.
# Maybe we should include "topics" in that column as well?

all_modules = {  # mapping: full name -> clean name
    label: "".join(label.split(" ")[1:]) for label in all_labels if label.startswith("module")
}

# We use an ugly loop, but whatever ¯\_(ツ)_/¯
df["module"] = [[] for _ in range(len(df))]
for i, row in df.iterrows():
    for full_name, clean_name in all_modules.items():
        if full_name in row["labels"]:
            row["module"].append(clean_name)
df["module"] = df.module.apply(tuple)
df.head()


# In[6]:


mod_df = df.set_index("module").sort_index()
mod_df.tail()


# In[7]:


# All improvement PRs
mod_df[mod_df["enhancement"]].head()


# In[8]:


# Improvement PRs for a given module.
# Note: don't filter module names on the index, as the index contains tuples with non-exclusive values.
# Use the boolean column instead.
mod_df[mod_df["enhancement"] & mod_df["module: transforms"]]


# In[9]:


# Formats each PR as "[module1, module2] <PR title>"
def format_prs(mod_df):
    out = []
    for idx, row in mod_df.iterrows():
        modules = idx
        # Put "documentation" and "tests" last so the sorting is decent
        for last_module in ("documentation", "tests"):
            if last_module in modules:
                modules = [m for m in modules if m != last_module] + [last_module]

        module = f"[{', '.join(modules)}]"
        module = module.replace("referencescripts", "reference scripts")
        module = module.replace("code", "reference scripts")
        out.append(f"{module} {row['title']}")

    return "\n".join(out)


# In[10]:


included_prs = pd.DataFrame()

# If labels are accurate, this should generate most of the release notes already.
# We keep track of the included PRs to figure out which ones are missing.
for section_title, module_idx in (
    ("Backward-incompatible changes", "bc-breaking"),
    ("Deprecations", "deprecation"),
    ("New Features", "new feature"),
    ("Improvements", "enhancement"),
    ("Bug Fixes", "bug"),
    ("Code Quality", "code quality"),
):
    print(f"## {section_title}")
    print()
    tmp_df = mod_df[mod_df[module_idx]]
    included_prs = pd.concat([included_prs, tmp_df])
    print(format_prs(tmp_df))
    print()


# In[11]:


# Missing PRs are these ones... classify them manually
missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False)
print(format_prs(missing_prs))
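
For reference, the JSON loaded at the top of this notebook is the cache written by the retrieval script below: a dict keyed by commit hash, whose values carry the `Features` fields (`title`, `body`, `pr_number`, `files_changed`, `labels`). A minimal illustrative sketch of that shape (the hash and title reuse the sample commit from that script's comments; the body, path, and labels are made-up placeholders):

# Illustrative only: shape of the file consumed by pd.read_json(...).T above.
example_cache = {
    "b33e38ec47": {
        "title": "Allow a higher-precision step type for Vec256::arange (#34555)",
        "body": "placeholder PR description",
        "pr_number": "34555",
        "files_changed": ["path/to/changed_file.py"],  # placeholder path
        "labels": ["enhancement", "module: transforms"],  # placeholder labels
    },
}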
Lines changed: 212 additions & 0 deletions
# Fetches the commits between two revisions, resolves each one to its PR number
# and GitHub labels, and caches the results as JSON for use by classify_prs.py.

import json
import locale
import os
import re
import subprocess
from collections import namedtuple
from os.path import expanduser

import requests


Features = namedtuple(
    "Features",
    [
        "title",
        "body",
        "pr_number",
        "files_changed",
        "labels",
    ],
)


def dict_to_features(dct):
    return Features(
        title=dct["title"],
        body=dct["body"],
        pr_number=dct["pr_number"],
        files_changed=dct["files_changed"],
        labels=dct["labels"],
    )


def features_to_dict(features):
    return dict(features._asdict())


def run(command):
    """Returns (return-code, stdout, stderr)"""
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    output, err = p.communicate()
    rc = p.returncode
    enc = locale.getpreferredencoding()
    output = output.decode(enc)
    err = err.decode(enc)
    return rc, output.strip(), err.strip()


def commit_body(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%b {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_title(commit_hash):
    cmd = f"git log -n 1 --pretty=format:%s {commit_hash}"
    ret, out, err = run(cmd)
    return out if ret == 0 else None


def commit_files_changed(commit_hash):
    cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}"
    ret, out, err = run(cmd)
    return out.split("\n") if ret == 0 else None


def parse_pr_number(body, commit_hash, title):
    regex = r"(#[0-9]+)"
    matches = re.findall(regex, title)
    if len(matches) == 0:
        if "revert" not in title.lower() and "updating submodules" not in title.lower():
            print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR")
        return None
    if len(matches) > 1:
        print(f"[{commit_hash}: {title}] Got two PR numbers, using the last one")
        return matches[-1][1:]
    return matches[0][1:]


def get_ghstack_token():
    pattern = "github_oauth = (.*)"
    with open(expanduser("~/.ghstackrc"), "r+") as f:
        config = f.read()
    matches = re.findall(pattern, config)
    if len(matches) == 0:
        raise RuntimeError("Can't find a github oauth token")
    return matches[0]


token = get_ghstack_token()
headers = {"Authorization": f"token {token}"}


def run_query(query):
    request = requests.post("https://api.github.com/graphql", json={"query": query}, headers=headers)
    if request.status_code == 200:
        return request.json()
    else:
        raise Exception("Query failed to run by returning code of {}. {}".format(request.status_code, query))


def gh_labels(pr_number):
    query = f"""
    {{
        repository(owner: "pytorch", name: "vision") {{
            pullRequest(number: {pr_number}) {{
                labels(first: 10) {{
                    edges {{
                        node {{
                            name
                        }}
                    }}
                }}
            }}
        }}
    }}
    """
    query = run_query(query)
    edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"]
    return [edge["node"]["name"] for edge in edges]


def get_features(commit_hash, return_dict=False):
    title, body, files_changed = (
        commit_title(commit_hash),
        commit_body(commit_hash),
        commit_files_changed(commit_hash),
    )
    pr_number = parse_pr_number(body, commit_hash, title)
    labels = []
    if pr_number is not None:
        labels = gh_labels(pr_number)
    result = Features(title, body, pr_number, files_changed, labels)
    if return_dict:
        return features_to_dict(result)
    return result


class CommitDataCache:
    def __init__(self, path="results/data.json"):
        self.path = path
        self.data = {}
        if os.path.exists(path):
            self.data = self.read_from_disk()

    def get(self, commit):
        if commit not in self.data.keys():
            # Fetch and cache the data
            self.data[commit] = get_features(commit)
            self.write_to_disk()
        return self.data[commit]

    def read_from_disk(self):
        with open(self.path, "r") as f:
            data = json.load(f)
            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
        return data

    def write_to_disk(self):
        data = {commit: features._asdict() for commit, features in self.data.items()}
        with open(self.path, "w") as f:
            json.dump(data, f)


def get_commits_between(base_version, new_version):
    cmd = f"git merge-base {base_version} {new_version}"
    rc, merge_base, _ = run(cmd)
    assert rc == 0

    # Returns a list of something like
    # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
    cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
    rc, commits, _ = run(cmd)
    assert rc == 0

    log_lines = commits.split("\n")
    hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
    return hashes, titles


def convert_to_dataframes(feature_list):
    import pandas as pd

    df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
    return df


def main(base_version, new_version):
    hashes, titles = get_commits_between(base_version, new_version)

    cdc = CommitDataCache("data.json")
    for idx, commit in enumerate(hashes):
        if idx % 10 == 0:
            print(f"{idx} / {len(hashes)}")
        cdc.get(commit)

    return cdc


if __name__ == "__main__":
    # d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
    # print(d)
    # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")

    # Usage: change the tags below according to the current release, then save the json with
    # cdc.write_to_disk().
    # Then you can use classify_prs.py (as a notebook)
    # to open the json and generate the release notes semi-automatically.
    cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
    from IPython import embed

    embed()
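
Putting the two files together, a hedged end-to-end sketch based on the `__main__` notes above (the tag names and target filename are illustrative assumptions, not prescribed by the scripts):

# Hypothetical usage, not part of the commit:
cdc = main("tags/v0.9.0", "tags/v0.10.0")  # illustrative release tags
cdc.path = "10.0_to_11.0-rc2.json"  # assumed filename so classify_prs.py picks it up
cdc.write_to_disk()
# Then open classify_prs.py as a notebook to generate the notes semi-automatically.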
