@@ -503,7 +503,9 @@ namespace vl
503
503
public:
504
504
Tokenizer (const Config &config)
505
505
: Tokenizer(config, &_chat_encoder)
506
- {}
506
+ {
507
+ auto_add_bos = false ;
508
+ }
507
509
508
510
Tokenizer (const Config &config, BaseHistoryEncoder *chat_encoder)
509
511
: BaseTokenizer(config, chat_encoder, nullptr , nullptr )
@@ -515,17 +517,19 @@ namespace vl
515
517
{
516
518
tp = new tokenizer::BPEProcessor2 (
517
519
{
518
- " [\\ p{Han}]+" ,
519
520
// FIXME: support &&
521
+ // "[\\p{Han}]+",
520
522
// "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
521
- // "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
522
- " [^\\ r\\ n\\ p{L}\\ p{N}]?[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}]+[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?" ,
523
- " [^\\ r\\ n\\ p{L}\\ p{N}]?[\\ p{Lu}\\ p{Lt}\\ p{Lm}\\ p{Lo}\\ p{M}]*[\\ p{Ll}\\ p{Lm}\\ p{Lo}\\ p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?" ,
523
+ // "[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
524
+
525
+ " (?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])" ,
526
+ " [^\\ r\\ n\\ p{L}\\ p{N}]?\\ p{L}+" ,
527
+
524
528
" \\ p{N}{1,3}" ,
525
529
" ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n]*" ,
526
530
" \\ s*[\\ r\\ n]+" ,
527
531
" \\ s+(?!\\ S)" ,
528
- " \\ +" ,
532
+ // "\\s +",
529
533
}
530
534
);
531
535
size_t size = tp->Load (buffer, n_vocab);
0 commit comments