You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
@comment{String macro for the most common publisher in this file; use as `publisher = acl` (unquoted). The stray trailing comma inside the value was removed — it would have been typeset literally after the publisher name.}
@string{acl = {Association for Computational Linguistics}}
2
2
3
3
@comment{Reconstructed from a corrupted diff paste that contained both the old (quoted) and new (braced) versions of this entry plus diff line-number artifacts. Kept the braced form, one field per line, aligned. `address` holds the conference location, following ACL Anthology export convention.}
@inproceedings{bauwens-delobelle-2024-bpe,
  title     = {{BPE}-knockout: Pruning Pre-existing {BPE} Tokenisers with Backwards-compatible Morphological Semi-supervision},
  author    = {Bauwens, Thomas and Delobelle, Pieter},
  booktitle = {Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  abstract  = {Byte-pair encoding (BPE) has become the default subword tokeniser in language models (LMs), allowing the representation of an infinite space of text with a finite set of units. Yet, BPE training is unsupervised, receiving no explicit information about a language{'}s morphology. This results in a subword vocabulary wherein many units are a concatenation of partial morphemes, preventing their formation as tokens. This, in turn, causes consistent intra-word patterns to be displayed inconsistently to downstream models, and bloats the vocabulary, hence requiring unnecessary embedding storage. In this paper, we address this issue by identifying blameworthy BPE merges and removing the resulting subwords from the BPE vocabulary, without impeding further use of merges that relied on them. We find that our method, BPE-knockout, is effective at making BPE{'}s segmentation positions adhere better to derivational and compound boundaries in English, Dutch and German, and improves token-based tasks in Dutch RoBERTa models, indicating that a tokeniser{'}s adherence to morphology impacts downstream models. We demonstrate the latter not only by training LMs from scratch, but also by continuing the pre-training of existing LMs. This proves promising, showing that suboptimal tokenisers can be remedied whilst salvaging training cost of downstream LMs.},
}
title = {What Is ``{{Typological Diversity}}'' in {{NLP}}?},
24
+
author = {Ploeger*, Esther and Poelman*, Wessel and {de Lhoneux}, Miryam and Bjerva, Johannes},
25
+
booktitle = {To appear at the 2024 Conference on Empirical Methods in Natural Language Processing},
26
+
year = {2024},
27
+
month = nov,
28
+
publisher = {Association for Computational Linguistics},
29
+
url = {http://arxiv.org/abs/2402.04222},
30
+
abstract = {The NLP research community has devoted increased attention to languages beyond English, resulting in considerable improvements for multilingual NLP. However, these improvements only apply to a small subset of the world's languages. Aiming to extend this, an increasing number of papers aspires to enhance generalizable multilingual performance across languages. To this end, linguistic typology is commonly used to motivate language selection, on the basis that a broad typological sample ought to imply generalization across a broad range of languages. These selections are often described as being 'typologically diverse'. In this work, we systematically investigate NLP research that includes claims regarding 'typological diversity'. We find there are no set definitions or criteria for such claims. We introduce metrics to approximate the diversity of language selection along several axes and find that the results vary considerably across papers. Furthermore, we show that skewed language selection can lead to overestimated multilingual performance. We recommend future work to include an operationalization of 'typological diversity' that empirically justifies the diversity of language samples.},
Copy file name to clipboardExpand all lines: _bibliography/preprints.bib
-19Lines changed: 0 additions & 19 deletions
Original file line number
Diff line number
Diff line change
@@ -1,22 +1,3 @@
1
-
2
-
@comment{Reconstructed from a corrupted diff paste (diff line numbers and `-` markers removed). Fixes: title quotation now opens with backticks (``...'') instead of two apostrophes; over-protective double braces reduced to single-word brace protection so styles can still recase the rest; `archiveprefix` value normalised to the canonical `arXiv`. `abbr` and `bibtex_show` are site-specific fields (ignored by BibTeX itself) and are kept as-is.}
@misc{ploeger2024what,
  title         = {What Is ``{Typological Diversity}'' in {NLP}?},
  author        = {Ploeger*, Esther and Poelman*, Wessel and {de Lhoneux}, Miryam and Bjerva, Johannes},
  year          = {2024},
  month         = feb,
  number        = {arXiv:2402.04222},
  eprint        = {2402.04222},
  primaryclass  = {cs},
  publisher     = {arXiv},
  url           = {http://arxiv.org/abs/2402.04222},
  urldate       = {2024-02-07},
  abstract      = {The NLP research community has devoted increased attention to languages beyond English, resulting in considerable improvements for multilingual NLP. However, these improvements only apply to a small subset of the world's languages. Aiming to extend this, an increasing number of papers aspires to enhance generalizable multilingual performance across languages. To this end, linguistic typology is commonly used to motivate language selection, on the basis that a broad typological sample ought to imply generalization across a broad range of languages. These selections are often described as being 'typologically diverse'. In this work, we systematically investigate NLP research that includes claims regarding 'typological diversity'. We find there are no set definitions or criteria for such claims. We introduce metrics to approximate the diversity of language selection along several axes and find that the results vary considerably across papers. Furthermore, we show that skewed language selection can lead to overestimated multilingual performance. We recommend future work to include an operationalization of 'typological diversity' that empirically justifies the diversity of language samples.},
  archiveprefix = {arXiv},
  keywords      = {Computer Science - Computation and Language},
  abbr          = {arXiv},
  bibtex_show   = true
}
19
-
20
1
@misc{ploeger2024principled,
21
2
title = {A {{Principled Framework}} for {{Evaluating}} on {{Typologically Diverse Languages}}},
22
3
author = {Ploeger, Esther and Poelman, Wessel and {H{\o}eg-Petersen}, Andreas Holck and Schlichtkrull, Anders and {de Lhoneux}, Miryam and Bjerva, Johannes},
0 commit comments