add crd3 dataset (#472)

mariamabarham · web-flow · commit a6430ef8b703 · 2020-08-03T13:22:08.000+02:00
diff --git a/datasets/crd3/crd3.py b/datasets/crd3/crd3.py
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""CRD3  dataset"""
+
+from __future__ import absolute_import, division, print_function
+import logging
+
+import json
+import os
+
+import nlp
+
+
+# BibTeX entry for the CRD3 paper; passed as `citation` to nlp.DatasetInfo in CRD3._info.
+# NOTE(review): the entry has no citation key after "@inproceedings{" -- confirm
+# whether downstream BibTeX consumers accept it.
+_CITATION = """
+@inproceedings{
+title = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},
+author = {Rameshkumar, Revanth  and Bailey, Peter},
+year = {2020},
+publisher = {Association for Computational Linguistics},
+conference = {ACL}
+}
+ """
+
+# Human-readable summary of the dataset; passed as `description` to nlp.DatasetInfo.
+_DESCRIPTION = """
+Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.
+Critical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. 
+The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding 
+abstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player 
+collaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, 
+and semantic ties to the previous dialogues.
+"""
+
+# Archive handed to dl_manager.download_and_extract in CRD3._split_generators;
+# the extracted tree is read under its top-level "CRD3-master" directory.
+_URL = "https://github.com/RevanthRameshkumar/CRD3/archive/master.zip"
+
+def get_train_test_dev_files(files, test_split, train_split, dev_split):
+    """Partition ``files`` into test, train and dev lists by episode id.
+
+    The episode id of a file is the part of its base name that precedes the
+    first underscore. A file goes to the first split (checked in the order
+    test, train, dev) whose id collection contains that episode id; files
+    matching no split are logged at INFO level and dropped.
+
+    Args:
+        files: iterable of file paths to distribute.
+        test_split: collection of episode ids belonging to the test split.
+        train_split: collection of episode ids belonging to the train split.
+        dev_split: collection of episode ids belonging to the dev split.
+
+    Returns:
+        A tuple ``(test_files, train_files, dev_files)`` of lists. Note that
+        the order is test, train, dev, despite the function name. Input order
+        is preserved within each list.
+    """
+    # Three independent lists. A chained assignment (a = b = c = []) would
+    # bind all three names to the same list object, so every split would end
+    # up containing every matched file.
+    test_files, train_files, dev_files = [], [], []
+    for path in files:
+        episode_id = os.path.split(path)[1].split("_")[0]
+        if episode_id in test_split:
+            test_files.append(path)
+        elif episode_id in train_split:
+            train_files.append(path)
+        elif episode_id in dev_split:
+            dev_files.append(path)
+        else:
+            logging.info("skipped file %s", path)
+    return test_files, train_files, dev_files
+    
+
+class CRD3(nlp.GeneratorBasedBuilder):
+    """Dataset builder for CRD3 (Critical Role Dungeons and Dragons Dataset).
+
+    Downloads the archive at ``_URL`` and yields one example per dialogue turn
+    of every aligned chunk found in the ``c=2``, ``c=3`` and ``c=4``
+    directories of the extracted repository.
+    """
+
+    def _info(self):
+        """Return the dataset metadata and the feature schema of an example."""
+        return nlp.DatasetInfo(
+            description=_DESCRIPTION,
+            features=nlp.Features(
+                {
+                    "chunk": nlp.Value("string"),
+                    "chunk_id": nlp.Value("int32"),
+                    "turn_start": nlp.Value("int32"),
+                    "turn_end": nlp.Value("int32"),
+                    "alignment_score": nlp.Value("float32"),
+                    "turn_num": nlp.Value("int32"),
+                    "turns": nlp.features.Sequence(
+                        {
+                            "names": nlp.Value("string"),
+                            "utterances": nlp.Value("string"),
+                        }
+                    ),
+                }
+            ),
+            homepage="https://github.com/RevanthRameshkumar/CRD3",
+            citation=_CITATION,
+        )
+
+    @staticmethod
+    def _read_episode_ids(list_path):
+        """Return the non-empty, whitespace-stripped lines of ``list_path``."""
+        with open(list_path, encoding="utf-8") as f:
+            # Stripping (rather than only removing "\n") also drops a stray
+            # "\r"; blank lines are skipped so they never become an id.
+            return [line.strip() for line in f if line.strip()]
+
+    def _split_generators(self, dl_manager):
+        """Download the archive and build the train/test/validation generators.
+
+        The split membership lists are read from the ``test_files``,
+        ``train_files`` and ``val_files`` files of the extracted repository
+        and used to distribute the aligned-data files among the splits.
+
+        Args:
+            dl_manager: download manager providing ``download_and_extract``.
+
+        Returns:
+            A list of three ``nlp.SplitGenerator`` objects (train, test,
+            validation), each receiving its file list as ``files_path``.
+        """
+        path = dl_manager.download_and_extract(_URL)
+        aligned_dir = os.path.join(path, "CRD3-master", "data", "aligned data")
+
+        test_splits = self._read_episode_ids(os.path.join(aligned_dir, "test_files"))
+        train_splits = self._read_episode_ids(os.path.join(aligned_dir, "train_files"))
+        dev_splits = self._read_episode_ids(os.path.join(aligned_dir, "val_files"))
+
+        # Collect every aligned file; sorting each directory listing keeps the
+        # example order deterministic across platforms.
+        files = []
+        for chunk_dir in ("c=2", "c=3", "c=4"):
+            directory = os.path.join(aligned_dir, chunk_dir)
+            files.extend(os.path.join(directory, name) for name in sorted(os.listdir(directory)))
+
+        test_files, train_files, dev_files = get_train_test_dev_files(files, test_splits, train_splits, dev_splits)
+
+        return [
+            nlp.SplitGenerator(
+                name=nlp.Split.TRAIN,
+                gen_kwargs={"files_path": train_files},
+            ),
+            nlp.SplitGenerator(
+                name=nlp.Split.TEST,
+                gen_kwargs={"files_path": test_files},
+            ),
+            nlp.SplitGenerator(
+                name=nlp.Split.VALIDATION,
+                gen_kwargs={"files_path": dev_files},
+            ),
+        ]
+
+    def _generate_examples(self, files_path):
+        """Yield one ``(key, example)`` pair per turn of every chunk.
+
+        Args:
+            files_path: list of paths to aligned-data JSON files. Each file
+                is loaded as a sequence of rows with "CHUNK", "ALIGNMENT" and
+                "TURNS" entries.
+
+        Yields:
+            Tuples of a string key ``"<file index>_<row index>_<turn index>"``
+            and the example dict matching the features declared in ``_info``.
+        """
+        for file_idx, file in enumerate(files_path):
+            # JSON is read explicitly as UTF-8 instead of the platform default.
+            with open(file, encoding="utf-8") as f:
+                data = json.load(f)
+            for id1, row in enumerate(data):
+                alignment = row["ALIGNMENT"]
+                for id2, turn in enumerate(row["TURNS"]):
+                    # Row and turn indices restart for every file, so the file
+                    # index is part of the key to keep keys unique in a split.
+                    yield "{}_{}_{}".format(file_idx, id1, id2), {
+                        "chunk": row["CHUNK"],
+                        "chunk_id": alignment["CHUNK ID"],
+                        "turn_start": alignment["TURN START"],
+                        "turn_end": alignment["TURN END"],
+                        "alignment_score": alignment["ALIGNMENT SCORE"],
+                        "turn_num": turn["NUMBER"],
+                        "turns": {
+                            "names": turn["NAMES"],
+                            "utterances": turn["UTTERANCES"],
+                        },
+                    }
+                            
+
diff --git a/datasets/crd3/dataset_infos.json b/datasets/crd3/dataset_infos.json
@@ -0,0 +1 @@
+{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. \nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding \nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player \ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, \nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth  and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turn_num": {"dtype": "int32", "id": null, "_type": "Value"}, "turns": {"feature": {"names": {"dtype": "string", "id": null, "_type": "Value"}, "utterances": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "cr_d3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "nlp_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": 
"cr_d3"}, "test": {"name": "test", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}, "validation": {"name": "validation", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}}, "download_checksums": {"https://github.com/RevanthRameshkumar/CRD3/archive/master.zip": {"num_bytes": 293524408, "checksum": "485ee871073c66359320db3a380cc1fa7d8bc05c9c981d87dbf36df91041ff14"}}, "download_size": 293524408, "dataset_size": 4215618687, "size_in_bytes": 4509143095}}
diff --git a/datasets/crd3/dummy/0.0.0/dummy_data.zip b/datasets/crd3/dummy/0.0.0/dummy_data.zip

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. \nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding \nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player \ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, \nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turn_num": {"dtype": "int32", "id": null, "_type": "Value"}, "turns": {"feature": {"names": {"dtype": "string", "id": null, "_type": "Value"}, "utterances": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "cr_d3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "nlp_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1405206229, "num_examples": 2942362, 
"dataset_name": "cr_d3"}, "test": {"name": "test", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}, "validation": {"name": "validation", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}}, "download_checksums": {"https://github.com/RevanthRameshkumar/CRD3/archive/master.zip": {"num_bytes": 293524408, "checksum": "485ee871073c66359320db3a380cc1fa7d8bc05c9c981d87dbf36df91041ff14"}}, "download_size": 293524408, "dataset_size": 4215618687, "size_in_bytes": 4509143095}}