Skip to content

Commit a6430ef

Browse files
add crd3 dataset (#472)
1 parent b85ae23 commit a6430ef

File tree

3 files changed

+151
-0
lines changed

3 files changed

+151
-0
lines changed

datasets/crd3/crd3.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
# coding=utf-8
2+
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Lint as: python3
17+
"""CRD3 dataset"""
18+
19+
from __future__ import absolute_import, division, print_function
20+
import logging
21+
22+
import json
23+
import os
24+
25+
import nlp
26+
27+
28+
# BibTeX entry for the CRD3 paper; handed to nlp.DatasetInfo as `citation`.
_CITATION = """
29+
@inproceedings{
30+
title = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},
31+
author = {Rameshkumar, Revanth and Bailey, Peter},
32+
year = {2020},
33+
publisher = {Association for Computational Linguistics},
34+
conference = {ACL}
35+
}
36+
"""
37+
38+
# Human-readable summary of the dataset; handed to nlp.DatasetInfo as `description`.
_DESCRIPTION = """
39+
Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.
40+
Critical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game.
41+
The dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding
42+
abstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player
43+
collaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail,
44+
and semantic ties to the previous dialogues.
45+
"""
46+
47+
# Zip archive of the CRD3 GitHub repository (master branch); the builder
# passes this to dl_manager.download_and_extract in _split_generators.
_URL = "https://github.com/RevanthRameshkumar/CRD3/archive/master.zip"
48+
49+
def get_train_test_dev_files(files, test_split, train_split, dev_split):
    """Partition file paths into test, train and dev lists.

    The identifier of a file is the part of its base name that precedes the
    first underscore. It is looked up in the split collections in the order
    test, train, dev; the first match wins. A file whose identifier appears
    in none of them is skipped and reported through ``logging.info``.

    Args:
        files: iterable of file paths to distribute.
        test_split: collection of identifiers assigned to the test split.
        train_split: collection of identifiers assigned to the train split.
        dev_split: collection of identifiers assigned to the dev split.

    Returns:
        A tuple ``(test_files, train_files, dev_files)`` of three independent
        lists, each keeping the relative order of ``files``.
    """
    # Build three distinct lists. A chained ``a = b = c = []`` binds every
    # name to one shared list, which would put every matched file into all
    # three splits.
    test_files, train_files, dev_files = [], [], []
    for path in files:
        identifier = os.path.split(path)[1].split("_")[0]
        if identifier in test_split:
            test_files.append(path)
        elif identifier in train_split:
            train_files.append(path)
        elif identifier in dev_split:
            dev_files.append(path)
        else:
            logging.info("skipped file {}".format(path))
    return test_files, train_files, dev_files
62+
63+
64+
class CRD3(nlp.GeneratorBasedBuilder):
    """Dataset builder for CRD3 (Critical Role Dungeons and Dragons Dataset).

    Downloads the CRD3 repository archive, assigns the aligned-data JSON
    files to train/test/validation according to the split lists shipped in
    the archive, and yields one example per dialogue turn together with the
    summary chunk and alignment metadata of the row it belongs to.
    """

    def _info(self):
        """Return the dataset metadata and the feature schema of an example."""
        return nlp.DatasetInfo(
            description=_DESCRIPTION,
            features=nlp.Features(
                {
                    "chunk": nlp.Value("string"),
                    "chunk_id": nlp.Value("int32"),
                    "turn_start": nlp.Value("int32"),
                    "turn_end": nlp.Value("int32"),
                    "alignment_score": nlp.Value("float32"),
                    "turn_num": nlp.Value("int32"),
                    "turns": nlp.features.Sequence(
                        {
                            "names": nlp.Value("string"),
                            "utterances": nlp.Value("string"),
                        }
                    ),
                }
            ),
            homepage="https://github.com/RevanthRameshkumar/CRD3",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the archive and build the train/test/validation generators.

        Args:
            dl_manager: download manager; only ``download_and_extract`` is used.

        Returns:
            A list of three ``nlp.SplitGenerator`` objects (train, test,
            validation), each carrying its file list as ``files_path``.
        """
        path = dl_manager.download_and_extract(_URL)
        aligned_dir = os.path.join(path, "CRD3-master", "data", "aligned data")

        def read_split_list(name):
            # One entry per line; only the newline characters are removed.
            with open(os.path.join(aligned_dir, name), encoding="utf-8") as f:
                return [line.replace("\n", "") for line in f.readlines()]

        test_splits = read_split_list("test_files")
        train_splits = read_split_list("train_files")
        dev_splits = read_split_list("val_files")

        # Collect the files of the c=2, c=3 and c=4 directories, in that
        # order, each directory listing sorted by file name.
        files = []
        for subdir in ("c=2", "c=3", "c=4"):
            directory = os.path.join(aligned_dir, subdir)
            files.extend(os.path.join(directory, name) for name in sorted(os.listdir(directory)))

        test_files, train_files, dev_files = get_train_test_dev_files(files, test_splits, train_splits, dev_splits)

        return [
            nlp.SplitGenerator(
                name=nlp.Split.TRAIN,
                gen_kwargs={"files_path": train_files},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.TEST,
                gen_kwargs={"files_path": test_files},
            ),
            nlp.SplitGenerator(
                name=nlp.Split.VALIDATION,
                gen_kwargs={"files_path": dev_files},
            ),
        ]

    def _generate_examples(self, files_path):
        """Yield ``(key, example)`` pairs, one per turn of every row.

        Args:
            files_path: list of paths to aligned-data JSON files. Each file
                is read as a JSON array of rows with the keys ``CHUNK``,
                ``ALIGNMENT`` and ``TURNS``.

        Yields:
            Tuples of a string key ``"<file>_<row>_<turn>"`` (zero-based
            indices) and a dict matching the features declared in ``_info``.
        """
        for file_idx, file in enumerate(files_path):
            with open(file, encoding="utf-8") as f:
                data = json.load(f)
            for row_idx, row in enumerate(data):
                alignment = row["ALIGNMENT"]
                for turn_idx, turn in enumerate(row["TURNS"]):
                    # The file index is part of the key: row and turn
                    # indices restart for every file, so on their own they
                    # would produce duplicate keys across files.
                    key = "{}_{}_{}".format(file_idx, row_idx, turn_idx)
                    yield key, {
                        "chunk": row["CHUNK"],
                        "chunk_id": alignment["CHUNK ID"],
                        "turn_start": alignment["TURN START"],
                        "turn_end": alignment["TURN END"],
                        "alignment_score": alignment["ALIGNMENT SCORE"],
                        "turn_num": turn["NUMBER"],
                        "turns": {
                            "names": turn["NAMES"],
                            "utterances": turn["UTTERANCES"],
                        },
                    }
149+
150+

datasets/crd3/dataset_infos.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"default": {"description": "\nStorytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset.\nCritical Role is an unscripted, live-streamed show where a fixed group of people play Dungeons and Dragons, an open-ended role-playing game. \nThe dataset is collected from 159 Critical Role episodes transcribed to text dialogues, consisting of 398,682 turns. It also includes corresponding \nabstractive summaries collected from the Fandom wiki. The dataset is linguistically unique in that the narratives are generated entirely through player \ncollaboration and spoken interaction. For each dialogue, there are a large number of turns, multiple abstractive summaries with varying levels of detail, \nand semantic ties to the previous dialogues.\n", "citation": "\n@inproceedings{\ntitle = {Storytelling with Dialogue: A Critical Role Dungeons and Dragons Dataset},\nauthor = {Rameshkumar, Revanth and Bailey, Peter},\nyear = {2020},\npublisher = {Association for Computational Linguistics},\nconference = {ACL}\n}\n ", "homepage": "https://github.com/RevanthRameshkumar/CRD3", "license": "", "features": {"chunk": {"dtype": "string", "id": null, "_type": "Value"}, "chunk_id": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_start": {"dtype": "int32", "id": null, "_type": "Value"}, "turn_end": {"dtype": "int32", "id": null, "_type": "Value"}, "alignment_score": {"dtype": "float32", "id": null, "_type": "Value"}, "turn_num": {"dtype": "int32", "id": null, "_type": "Value"}, "turns": {"feature": {"names": {"dtype": "string", "id": null, "_type": "Value"}, "utterances": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "supervised_keys": null, "builder_name": "cr_d3", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "nlp_version_to_prepare": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": 
"cr_d3"}, "test": {"name": "test", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}, "validation": {"name": "validation", "num_bytes": 1405206229, "num_examples": 2942362, "dataset_name": "cr_d3"}}, "download_checksums": {"https://github.com/RevanthRameshkumar/CRD3/archive/master.zip": {"num_bytes": 293524408, "checksum": "485ee871073c66359320db3a380cc1fa7d8bc05c9c981d87dbf36df91041ff14"}}, "download_size": 293524408, "dataset_size": 4215618687, "size_in_bytes": 4509143095}}
20.8 KB
Binary file not shown.

0 commit comments

Comments
 (0)