Skip to content

Commit d615ac4

Browse files
committed
Workaround for the Kimi tokenizer: use something similar to Qwen's tokenizer.
1 parent bed565d commit d615ac4

File tree

1 file changed

+10
-6
lines changed

1 file changed

+10
-6
lines changed

models/kimi.cpp

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -503,7 +503,9 @@ namespace vl
503503
public:
504504
Tokenizer(const Config &config)
505505
: Tokenizer(config, &_chat_encoder)
506-
{}
506+
{
507+
auto_add_bos = false;
508+
}
507509

508510
Tokenizer(const Config &config, BaseHistoryEncoder *chat_encoder)
509511
: BaseTokenizer(config, chat_encoder, nullptr, nullptr)
@@ -515,17 +517,19 @@ namespace vl
515517
{
516518
tp = new tokenizer::BPEProcessor2(
517519
{
518-
"[\\p{Han}]+",
519520
// FIXME: support &&
521+
// "[\\p{Han}]+",
520522
//"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
521-
//"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}&&[^\\p{Han}]]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
522-
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
523-
"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?",
523+
//"[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
524+
525+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])",
526+
"[^\\r\\n\\p{L}\\p{N}]?\\p{L}+",
527+
524528
"\\p{N}{1,3}",
525529
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*",
526530
"\\s*[\\r\\n]+",
527531
"\\s+(?!\\S)",
528-
"\\+",
532+
//"\\s+",
529533
}
530534
);
531535
size_t size = tp->Load(buffer, n_vocab);

0 commit comments

Comments
 (0)