From c199db0432e2c285f97dbc63e3a45c14b5370f5b Mon Sep 17 00:00:00 2001 From: Francisco Massa Date: Sat, 5 Jun 2021 14:09:20 +0200 Subject: [PATCH 1/4] Add scripts for facilitating generating release notes --- scripts/release_notes/common.py | 238 ++++++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 scripts/release_notes/common.py diff --git a/scripts/release_notes/common.py b/scripts/release_notes/common.py new file mode 100644 index 00000000000..0712367856c --- /dev/null +++ b/scripts/release_notes/common.py @@ -0,0 +1,238 @@ +from collections import namedtuple +from os.path import expanduser +import locale +import subprocess +import re +import requests +import os +import json + +categories = [ + 'Uncategorized', + 'distributed', + 'mobile', + 'jit', + 'visualization', + 'onnx', + 'caffe2', + 'quantization', + 'amd', + 'benchmark', + 'profiler', + 'dispatcher', + 'releng', + 'fx', + 'code_coverage', + 'vulkan', + 'skip', + 'cpp_frontend', + 'python_frontend', + 'complex_frontend', + 'vmap_frontend', + 'autograd_frontend', + 'build_frontend', + 'memory_format_frontend', + 'foreach_frontend', +] + +topics = [ + 'bc_breaking', + 'deprecations', + 'new_features', + 'improvements', + 'bug_fixes', + 'performance', + 'docs', + 'devs', + 'Untopiced', +] + + +Features = namedtuple('Features', [ + 'title', + 'body', + 'pr_number', + 'files_changed', + 'labels', +]) + + +def dict_to_features(dct): + return Features( + title=dct['title'], + body=dct['body'], + pr_number=dct['pr_number'], + files_changed=dct['files_changed'], + labels=dct['labels']) + + +def features_to_dict(features): + return dict(features._asdict()) + + +def run(command): + """Returns (return-code, stdout, stderr)""" + p = subprocess.Popen(command, stdout=subprocess.PIPE, + stderr=subprocess.PIPE, shell=True) + output, err = p.communicate() + rc = p.returncode + enc = locale.getpreferredencoding() + output = output.decode(enc) + err = err.decode(enc) + return rc, output.strip(), err.strip() + + +def commit_body(commit_hash): + cmd = f'git log -n 1 --pretty=format:%b {commit_hash}' + ret, out, err = run(cmd) + return out if ret == 0 else None + + +def commit_title(commit_hash): + cmd = f'git log -n 1 --pretty=format:%s {commit_hash}' + ret, out, err = run(cmd) + return out if ret == 0 else None + + +def commit_files_changed(commit_hash): + cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}' + ret, out, err = run(cmd) + return out.split('\n') if ret == 0 else None + + +def parse_pr_number(body, commit_hash, title): + regex = r'(#[0-9]+)' + matches = re.findall(regex, title) + if len(matches) == 0: + if 'revert' not in title.lower() and 'updating submodules' not in title.lower(): + print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR') + return None + if len(matches) > 1: + print(f'[{commit_hash}: {title}] Got two PR numbers, using the last one') + return matches[-1][1:] + return matches[0][1:] + + +def get_ghstack_token(): + pattern = 'github_oauth = (.*)' + with open(expanduser('~/.ghstackrc'), 'r+') as f: + config = f.read() + matches = re.findall(pattern, config) + if len(matches) == 0: + raise RuntimeError("Can't find a github oauth token") + return matches[0] + +token = get_ghstack_token() +headers = {"Authorization": f"token {token}"} + +def run_query(query): + request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers) + if request.status_code == 200: + return request.json() + else: + raise Exception("Query failed to run by 
returning code of {}. {}".format(request.status_code, query))
+
+
+def gh_labels(pr_number):
+    query = f"""
+    {{
+      repository(owner: "pytorch", name: "vision") {{
+        pullRequest(number: {pr_number}) {{
+          labels(first: 10) {{
+            edges {{
+              node {{
+                name
+              }}
+            }}
+          }}
+        }}
+      }}
+    }}
+    """
+    query = run_query(query)
+    edges = query['data']['repository']['pullRequest']['labels']['edges']
+    return [edge['node']['name'] for edge in edges]
+
+
+def get_features(commit_hash, return_dict=False):
+    title, body, files_changed = (
+        commit_title(commit_hash),
+        commit_body(commit_hash),
+        commit_files_changed(commit_hash))
+    pr_number = parse_pr_number(body, commit_hash, title)
+    labels = []
+    if pr_number is not None:
+        labels = gh_labels(pr_number)
+    result = Features(title, body, pr_number, files_changed, labels)
+    if return_dict:
+        return features_to_dict(result)
+    return result
+
+class CommitDataCache:
+    def __init__(self, path='results/data.json'):
+        self.path = path
+        self.data = {}
+        if os.path.exists(path):
+            self.data = self.read_from_disk()
+
+    def get(self, commit):
+        if commit not in self.data.keys():
+            # Fetch and cache the data
+            self.data[commit] = get_features(commit)
+            self.write_to_disk()
+        return self.data[commit]
+
+    def read_from_disk(self):
+        with open(self.path, 'r') as f:
+            data = json.load(f)
+            data = {commit: dict_to_features(dct)
+                    for commit, dct in data.items()}
+        return data
+
+    def write_to_disk(self):
+        data = {commit: features._asdict() for commit, features in self.data.items()}
+        with open(self.path, 'w') as f:
+            json.dump(data, f)
+
+
+def get_commits_between(base_version, new_version):
+    cmd = f'git merge-base {base_version} {new_version}'
+    rc, merge_base, _ = run(cmd)
+    assert rc == 0
+
+    # Returns a list of something like
+    # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
+    cmd = f'git log --reverse --oneline {merge_base}..{new_version}'
+    rc, commits, _ = run(cmd)
+    assert rc == 0
+
+    log_lines = commits.split('\n')
+    hashes, titles = zip(*[log_line.split(' ', 1) for log_line in log_lines])
+    return hashes, titles
+
+
+def convert_to_dataframes(feature_list):
+    import pandas as pd
+    df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
+    return df
+
+
+def main(base_version, new_version):
+    hashes, titles = get_commits_between(base_version, new_version)
+
+    cdc = CommitDataCache('data.json')
+    for idx, commit in enumerate(hashes):
+        if idx % 10 == 0:
+            print(f"{idx} / {len(hashes)}")
+        cdc.get(commit)
+
+    return cdc
+
+
+if __name__ == "__main__":
+    #d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
+    # d = get_features('f5843099d895e72c27ffa9d29cc91dd8df7f3832')
+    # print(d)
+    # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
+    cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
+    from IPython import embed; embed()

From 237c749729481822796895cd77b5221a91764029 Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Thu, 14 Oct 2021 18:12:30 +0100
Subject: [PATCH 2/4] remove unused lists, add instructions, apply black

---
 scripts/release_notes/common.py | 150 +++++++++++++------------------
 1 file changed, 62 insertions(+), 88 deletions(-)

diff --git a/scripts/release_notes/common.py b/scripts/release_notes/common.py
index 0712367856c..603fc7818cb 100644
--- a/scripts/release_notes/common.py
+++ b/scripts/release_notes/common.py
@@ -1,69 +1,34 @@
-from collections import namedtuple
-from os.path import expanduser
+import json
 import locale
-import subprocess +import os import re +import subprocess +from collections import namedtuple +from os.path import expanduser + import requests -import os -import json -categories = [ - 'Uncategorized', - 'distributed', - 'mobile', - 'jit', - 'visualization', - 'onnx', - 'caffe2', - 'quantization', - 'amd', - 'benchmark', - 'profiler', - 'dispatcher', - 'releng', - 'fx', - 'code_coverage', - 'vulkan', - 'skip', - 'cpp_frontend', - 'python_frontend', - 'complex_frontend', - 'vmap_frontend', - 'autograd_frontend', - 'build_frontend', - 'memory_format_frontend', - 'foreach_frontend', -] - -topics = [ - 'bc_breaking', - 'deprecations', - 'new_features', - 'improvements', - 'bug_fixes', - 'performance', - 'docs', - 'devs', - 'Untopiced', -] - - -Features = namedtuple('Features', [ - 'title', - 'body', - 'pr_number', - 'files_changed', - 'labels', -]) + +Features = namedtuple( + "Features", + [ + "title", + "body", + "pr_number", + "files_changed", + "labels", + ], +) def dict_to_features(dct): return Features( - title=dct['title'], - body=dct['body'], - pr_number=dct['pr_number'], - files_changed=dct['files_changed'], - labels=dct['labels']) + title=dct["title"], + body=dct["body"], + pr_number=dct["pr_number"], + files_changed=dct["files_changed"], + labels=dct["labels"], + ) def features_to_dict(features): @@ -72,8 +37,7 @@ def features_to_dict(features): def run(command): """Returns (return-code, stdout, stderr)""" - p = subprocess.Popen(command, stdout=subprocess.PIPE, - stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) output, err = p.communicate() rc = p.returncode enc = locale.getpreferredencoding() @@ -83,50 +47,52 @@ def run(command): def commit_body(commit_hash): - cmd = f'git log -n 1 --pretty=format:%b {commit_hash}' + cmd = f"git log -n 1 --pretty=format:%b {commit_hash}" ret, out, err = run(cmd) return out if ret == 0 else None def commit_title(commit_hash): - cmd = f'git log -n 1 --pretty=format:%s {commit_hash}' + cmd = f"git log -n 1 --pretty=format:%s {commit_hash}" ret, out, err = run(cmd) return out if ret == 0 else None def commit_files_changed(commit_hash): - cmd = f'git diff-tree --no-commit-id --name-only -r {commit_hash}' + cmd = f"git diff-tree --no-commit-id --name-only -r {commit_hash}" ret, out, err = run(cmd) - return out.split('\n') if ret == 0 else None + return out.split("\n") if ret == 0 else None def parse_pr_number(body, commit_hash, title): - regex = r'(#[0-9]+)' + regex = r"(#[0-9]+)" matches = re.findall(regex, title) if len(matches) == 0: - if 'revert' not in title.lower() and 'updating submodules' not in title.lower(): - print(f'[{commit_hash}: {title}] Could not parse PR number, ignoring PR') + if "revert" not in title.lower() and "updating submodules" not in title.lower(): + print(f"[{commit_hash}: {title}] Could not parse PR number, ignoring PR") return None if len(matches) > 1: - print(f'[{commit_hash}: {title}] Got two PR numbers, using the last one') + print(f"[{commit_hash}: {title}] Got two PR numbers, using the last one") return matches[-1][1:] return matches[0][1:] def get_ghstack_token(): - pattern = 'github_oauth = (.*)' - with open(expanduser('~/.ghstackrc'), 'r+') as f: + pattern = "github_oauth = (.*)" + with open(expanduser("~/.ghstackrc"), "r+") as f: config = f.read() matches = re.findall(pattern, config) if len(matches) == 0: raise RuntimeError("Can't find a github oauth token") return matches[0] + token = get_ghstack_token() headers = {"Authorization": 
f"token {token}"}
 
+
 def run_query(query):
-    request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
+    request = requests.post("https://api.github.com/graphql", json={"query": query}, headers=headers)
     if request.status_code == 200:
         return request.json()
     else:
@@ -150,15 +116,16 @@ def gh_labels(pr_number):
     }}
     """
     query = run_query(query)
-    edges = query['data']['repository']['pullRequest']['labels']['edges']
-    return [edge['node']['name'] for edge in edges]
+    edges = query["data"]["repository"]["pullRequest"]["labels"]["edges"]
+    return [edge["node"]["name"] for edge in edges]
 
 
 def get_features(commit_hash, return_dict=False):
     title, body, files_changed = (
         commit_title(commit_hash),
         commit_body(commit_hash),
-        commit_files_changed(commit_hash))
+        commit_files_changed(commit_hash),
+    )
     pr_number = parse_pr_number(body, commit_hash, title)
     labels = []
     if pr_number is not None:
@@ -168,8 +135,9 @@ def get_features(commit_hash, return_dict=False):
         return features_to_dict(result)
     return result
 
+
 class CommitDataCache:
-    def __init__(self, path='results/data.json'):
+    def __init__(self, path="results/data.json"):
         self.path = path
         self.data = {}
         if os.path.exists(path):
@@ -183,44 +151,44 @@ def get(self, commit):
         return self.data[commit]
 
     def read_from_disk(self):
-        with open(self.path, 'r') as f:
+        with open(self.path, "r") as f:
             data = json.load(f)
-            data = {commit: dict_to_features(dct)
-                    for commit, dct in data.items()}
+            data = {commit: dict_to_features(dct) for commit, dct in data.items()}
         return data
 
     def write_to_disk(self):
         data = {commit: features._asdict() for commit, features in self.data.items()}
-        with open(self.path, 'w') as f:
+        with open(self.path, "w") as f:
             json.dump(data, f)
 
 
 def get_commits_between(base_version, new_version):
-    cmd = f'git merge-base {base_version} {new_version}'
+    cmd = f"git merge-base {base_version} {new_version}"
     rc, merge_base, _ = run(cmd)
     assert rc == 0
 
     # Returns a list of something like
     # b33e38ec47 Allow a higher-precision step type for Vec256::arange (#34555)
-    cmd = f'git log --reverse --oneline {merge_base}..{new_version}'
+    cmd = f"git log --reverse --oneline {merge_base}..{new_version}"
     rc, commits, _ = run(cmd)
     assert rc == 0
 
-    log_lines = commits.split('\n')
-    hashes, titles = zip(*[log_line.split(' ', 1) for log_line in log_lines])
+    log_lines = commits.split("\n")
+    hashes, titles = zip(*[log_line.split(" ", 1) for log_line in log_lines])
     return hashes, titles
 
 
 def convert_to_dataframes(feature_list):
     import pandas as pd
+
     df = pd.DataFrame.from_records(feature_list, columns=Features._fields)
     return df
 
 
 def main(base_version, new_version):
     hashes, titles = get_commits_between(base_version, new_version)
-
-    cdc = CommitDataCache('data.json')
+
+    cdc = CommitDataCache("data.json")
     for idx, commit in enumerate(hashes):
         if idx % 10 == 0:
             print(f"{idx} / {len(hashes)}")
         cdc.get(commit)
@@ -230,9 +198,15 @@ def main(base_version, new_version):
 
 
 if __name__ == "__main__":
-    #d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
-    # d = get_features('f5843099d895e72c27ffa9d29cc91dd8df7f3832')
+    # d = get_features('2ab93592529243862ce8ad5b6acf2628ef8d0dc8')
     # print(d)
     # hashes, titles = get_commits_between("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
+
+    # Usage: change the tags below according to the current release, then save the json with
+    # cdc.write_to_disk().
+ # Then you can rely on https://nbviewer.org/gist/NicolasHug/248a72aa086fd874936f16eb56240226 + # to open the json and generate the release notes semi-automatically. cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15") - from IPython import embed; embed() + from IPython import embed + + embed() From afbeb47c44c5679561dc7b17441c1637174778af Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Oct 2021 18:13:31 +0100 Subject: [PATCH 3/4] renamed file --- scripts/release_notes/{common.py => retrieve_prs_data.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename scripts/release_notes/{common.py => retrieve_prs_data.py} (100%) diff --git a/scripts/release_notes/common.py b/scripts/release_notes/retrieve_prs_data.py similarity index 100% rename from scripts/release_notes/common.py rename to scripts/release_notes/retrieve_prs_data.py From c4f962df5621e87a970dd00870cbbf9498678d67 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 14 Oct 2021 18:48:01 +0100 Subject: [PATCH 4/4] Added pandas script directly --- scripts/release_notes/classify_prs.py | 121 +++++++++++++++++++++ scripts/release_notes/retrieve_prs_data.py | 2 +- 2 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 scripts/release_notes/classify_prs.py diff --git a/scripts/release_notes/classify_prs.py b/scripts/release_notes/classify_prs.py new file mode 100644 index 00000000000..580e93bfe8b --- /dev/null +++ b/scripts/release_notes/classify_prs.py @@ -0,0 +1,121 @@ +# In[1]: + + +import pandas as pd + + +# In[2]: + + +df = pd.read_json("10.0_to_11.0-rc2.json").T +df.tail() + + +# In[3]: + + +all_labels = set(lbl for labels in df["labels"] for lbl in labels) +all_labels + + +# In[4]: + + +# Add one column per label +for label in all_labels: + df[label] = df["labels"].apply(lambda labels_list: label in labels_list) +df.head() + + +# In[5]: + + +# Add a clean "module" column. It contains tuples since PRs can have more than one module. +# Maybe we should include "topics" in that column as well? 
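+# For instance, a PR carrying the label "module: transforms" ends up with
+# module == ("transforms",) after the loop below.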
+
+all_modules = {  # mapping: full name -> clean name
+    label: "".join(label.split(" ")[1:]) for label in all_labels if label.startswith("module")
+}
+
+# We use an ugly loop, but whatever ¯\_(ツ)_/¯
+df["module"] = [[] for _ in range(len(df))]
+for i, row in df.iterrows():
+    for full_name, clean_name in all_modules.items():
+        if full_name in row["labels"]:
+            row["module"].append(clean_name)
+df["module"] = df.module.apply(tuple)
+df.head()
+
+
+# In[6]:
+
+
+mod_df = df.set_index("module").sort_index()
+mod_df.tail()
+
+
+# In[7]:
+
+
+# All improvement PRs
+mod_df[mod_df["enhancement"]].head()
+
+
+# In[8]:
+
+
+# Improvement PRs for a given module.
+# Note: don't filter on the module name via the index, as the index contains tuples
+# with non-exclusive values. Use the boolean column instead.
+mod_df[mod_df["enhancement"] & mod_df["module: transforms"]]
+
+
+# In[9]:
+
+
+def format_prs(mod_df):
+    out = []
+    for idx, row in mod_df.iterrows():
+        modules = idx
+        # Put "documentation" and "tests" last so the sorting looks decent
+        for last_module in ("documentation", "tests"):
+            if last_module in modules:
+                modules = [m for m in modules if m != last_module] + [last_module]
+
+        module = f"[{', '.join(modules)}]"
+        module = module.replace("referencescripts", "reference scripts")
+        module = module.replace("code", "reference scripts")
+        out.append(f"{module} {row['title']}")
+
+    return "\n".join(out)
+
+
+# In[10]:
+
+
+included_prs = pd.DataFrame()
+
+# If labels are accurate, this should generate most of the release notes already.
+# We keep track of the included PRs to figure out which ones are missing.
+for section_title, module_idx in (
+    ("Backward-incompatible changes", "bc-breaking"),
+    ("Deprecations", "deprecation"),
+    ("New Features", "new feature"),
+    ("Improvements", "enhancement"),
+    ("Bug Fixes", "bug"),
+    ("Code Quality", "code quality"),
+):
+    print(f"## {section_title}")
+    print()
+    tmp_df = mod_df[mod_df[module_idx]]
+    included_prs = pd.concat([included_prs, tmp_df])
+    print(format_prs(tmp_df))
+    print()
+
+
+# In[11]:
+
+
+# These PRs are not in any of the sections above... classify them manually
+missing_prs = pd.concat([mod_df, included_prs]).drop_duplicates(subset="pr_number", keep=False)
+print(format_prs(missing_prs))
diff --git a/scripts/release_notes/retrieve_prs_data.py b/scripts/release_notes/retrieve_prs_data.py
index 603fc7818cb..90cb4cda07e 100644
--- a/scripts/release_notes/retrieve_prs_data.py
+++ b/scripts/release_notes/retrieve_prs_data.py
@@ -204,7 +204,7 @@ def main(base_version, new_version):
 
     # Usage: change the tags below according to the current release, then save the json with
     # cdc.write_to_disk().
-    # Then you can rely on https://nbviewer.org/gist/NicolasHug/248a72aa086fd874936f16eb56240226
+    # Then you can use classify_prs.py (as a notebook)
     # to open the json and generate the release notes semi-automatically.
     cdc = main("tags/v0.9.0", "fc852f3b39fe25dd8bf1dedee8f19ea04aa84c15")
     from IPython import embed
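
Taken together, the two scripts form a small pipeline: retrieve_prs_data.py
collects each commit's PR metadata (title, body, pr_number, files_changed,
labels) between two refs into a json cache, and classify_prs.py turns that
cache into draft release-notes sections. A minimal sketch of the intended
end-to-end usage, assuming a github_oauth token in ~/.ghstackrc (which
get_ghstack_token requires); the second tag and the json filename below are
placeholders to adapt to the release being prepared:

    # Sketch only: run from scripts/release_notes/ in a torchvision checkout.
    from retrieve_prs_data import main

    # Fetch Features for every commit between the two refs; main() also caches
    # results incrementally in data.json via CommitDataCache.
    cdc = main("tags/v0.9.0", "tags/v0.10.0")
    cdc.write_to_disk()

    # Point classify_prs.py at the resulting json (it hard-codes the filename
    # in pd.read_json, e.g. "10.0_to_11.0-rc2.json"), then run it cell by cell
    # as a notebook to print the "## New Features", "## Bug Fixes", etc.
    # sections, plus the leftover PRs to classify manually.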