Skip to content

Commit 8cf36f3

Browse files
committed
allow hyphens and single quotes between words
1 parent 15ab548 commit 8cf36f3

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

whisper/tokenizer.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -245,9 +245,7 @@ def non_speech_tokens(self) -> Tuple[int]:
245245
246246
keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
247247
"""
248-
249-
result = set()
250-
symbols = list("'\"#()*+-/:;<=>@[\\]^_`{|}~「」『』")
248+
symbols = list("\"#()*+/:;<=>@[\\]^_`{|}~「」『』")
251249
symbols += "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
252250

253251
# symbols that may be a single token or multiple tokens depending on the tokenizer.
@@ -257,6 +255,8 @@ def non_speech_tokens(self) -> Tuple[int]:
257255
miscellaneous = set("♩♪♫♬♭♮♯")
258256
assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
259257

258+
# allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
259+
result = {self.tokenizer.encode(" -")[0], self.tokenizer.encode(" '")[0]}
260260
for symbol in symbols + list(miscellaneous):
261261
for tokens in [self.tokenizer.encode(symbol), self.tokenizer.encode(" " + symbol)]:
262262
if len(tokens) == 1 or symbol in miscellaneous:

0 commit comments

Comments (0)