1
1
from torchnlp .word_to_vector .pretrained_word_vectors import _PretrainedWordVectors
2
2
3
-
4
3
# List of all 275 supported languages from http://cosyne.h-its.org/bpemb/data/
# Codes are ISO 639-1/639-2/Wikipedia language codes; used to validate the
# `language` argument of BPEmb and to build the download URL.
SUPPORTED_LANGUAGES = [
    'ab', 'ace', 'ady', 'af', 'ak', 'als', 'am', 'an', 'ang', 'ar', 'arc', 'arz', 'as', 'ast',
    'atj', 'av', 'ay', 'az', 'azb', 'ba', 'bar', 'bcl', 'be', 'bg', 'bi', 'bjn', 'bm', 'bn', 'bo',
    'bpy', 'br', 'bs', 'bug', 'bxr', 'ca', 'cdo', 'ce', 'ceb', 'ch', 'chr', 'chy', 'ckb', 'co',
    'cr', 'crh', 'cs', 'csb', 'cu', 'cv', 'cy', 'da', 'de', 'din', 'diq', 'dsb', 'dty', 'dv', 'dz',
    'ee', 'el', 'en', 'eo', 'es', 'et', 'eu', 'ext', 'fa', 'ff', 'fi', 'fj', 'fo', 'fr', 'frp',
    'frr', 'fur', 'fy', 'ga', 'gag', 'gan', 'gd', 'gl', 'glk', 'gn', 'gom', 'got', 'gu', 'gv', 'ha',
    'hak', 'haw', 'he', 'hi', 'hif', 'hr', 'hsb', 'ht', 'hu', 'hy', 'ia', 'id', 'ie', 'ig', 'ik',
    'ilo', 'io', 'is', 'it', 'iu', 'ja', 'jam', 'jbo', 'jv', 'ka', 'kaa', 'kab', 'kbd', 'kbp', 'kg',
    'ki', 'kk', 'kl', 'km', 'kn', 'ko', 'koi', 'krc', 'ks', 'ksh', 'ku', 'kv', 'kw', 'ky', 'la',
    'lad', 'lb', 'lbe', 'lez', 'lg', 'li', 'lij', 'lmo', 'ln', 'lo', 'lrc', 'lt', 'ltg', 'lv',
    'mai', 'mdf', 'mg', 'mh', 'mhr', 'mi', 'min', 'mk', 'ml', 'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl',
    'my', 'myv', 'mzn', 'na', 'nap', 'nds', 'ne', 'new', 'ng', 'nl', 'nn', 'no', 'nov', 'nrm',
    'nso', 'nv', 'ny', 'oc', 'olo', 'om', 'or', 'os', 'pa', 'pag', 'pam', 'pap', 'pcd', 'pdc',
    'pfl', 'pi', 'pih', 'pl', 'pms', 'pnb', 'pnt', 'ps', 'pt', 'qu', 'rm', 'rmy', 'rn', 'ro', 'ru',
    'rue', 'rw', 'sa', 'sah', 'sc', 'scn', 'sco', 'sd', 'se', 'sg', 'sh', 'si', 'sk', 'sl', 'sm',
    'sn', 'so', 'sq', 'sr', 'srn', 'ss', 'st', 'stq', 'su', 'sv', 'sw', 'szl', 'ta', 'tcy', 'te',
    'tet', 'tg', 'th', 'ti', 'tk', 'tl', 'tn', 'to', 'tpi', 'tr', 'ts', 'tt', 'tum', 'tw', 'ty',
    'tyv', 'udm', 'ug', 'uk', 'ur', 'uz', 've', 'vec', 'vep', 'vi', 'vls', 'vo', 'wa', 'war', 'wo',
    'wuu', 'xal', 'xh', 'xmf', 'yi', 'yo', 'za', 'zea', 'zh', 'zu'
]
31
25
32
26
# All supported vector dimensionalities for which embeddings were trained
@@ -40,6 +34,11 @@ class BPEmb(_PretrainedWordVectors):
40
34
"""
41
35
Byte-Pair Encoding (BPE) embeddings trained on Wikipedia for 275 languages
42
36
37
+ A collection of pre-trained subword unit embeddings in 275 languages, based
38
+ on Byte-Pair Encoding (BPE). In an evaluation using fine-grained entity typing as testbed,
39
+ BPEmb performs competitively, and for some languages better than alternative subword
40
+ approaches, while requiring vastly fewer resources and no tokenization.
41
+
43
42
References:
44
43
* https://arxiv.org/abs/1710.02187
45
44
* https://github.com/bheinzerling/bpemb
@@ -78,24 +77,23 @@ def __init__(self, language='en', dim=300, merge_ops=50000, **kwargs):
78
77
# Check if all parameters are valid
79
78
if language not in SUPPORTED_LANGUAGES :
80
79
raise ValueError (("Language '%s' not supported. Use one of the "
81
- "following options instead:\n %s"
82
- ) % (language , SUPPORTED_LANGUAGES ))
80
+ "following options instead:\n %s" ) % (language , SUPPORTED_LANGUAGES ))
83
81
if dim not in SUPPORTED_DIMS :
84
82
raise ValueError (("Embedding dimensionality of '%d' not supported. "
85
- "Use one of the following options instead:\n %s"
86
- ) % ( dim , SUPPORTED_DIMS ))
83
+ "Use one of the following options instead:\n %s" ) % ( dim ,
84
+ SUPPORTED_DIMS ))
87
85
if merge_ops not in SUPPORTED_MERGE_OPS :
88
86
raise ValueError (("Number of '%d' merge operations not supported. "
89
- "Use one of the following options instead:\n %s"
90
- ) % (merge_ops , SUPPORTED_MERGE_OPS ))
87
+ "Use one of the following options instead:\n %s" ) %
88
+ (merge_ops , SUPPORTED_MERGE_OPS ))
91
89
92
90
format_map = {'language' : language , 'merge_ops' : merge_ops , 'dim' : dim }
93
91
94
92
# Assemble file name to locally store embeddings under
95
93
name = self .file_name .format_map (format_map )
96
94
# Assemble URL to download the embeddings form
97
- url = (self . url_base . format_map ( format_map ) +
98
- self .file_name .format_map (format_map ) +
99
- self .zip_extension )
95
+ url = (
96
+ self . url_base . format_map ( format_map ) + self .file_name .format_map (format_map ) +
97
+ self .zip_extension )
100
98
101
99
super (BPEmb , self ).__init__ (name , url = url , ** kwargs )
0 commit comments