17
17
import re
18
18
import sys
19
19
20
-
21
20
if sys.version_info[0 ] > 2 :
22
21
from itertools import zip
23
22
from urllib.parse import quote as urllib_quote
@@ -30,24 +29,29 @@ else:
30
29
31
30
# <http://github.com/mzsanford/twitter-text-java>
32
31
33
- AT_SIGNS = ur ' [@ \uff20 ]'
34
- UTF_CHARS = ur ' a-z0-9_\u00c0 -\u00d6\u00d8 -\u00f6\u00f8 -\u00ff '
35
- SPACES = ur ' [\u0020\u00A0\u1680\u180E\u2002 - \u202F\u205F\u2060\u3000 ]'
32
+ AT_SIGNS = br ' [@ \uff20 ]' .decode( ' raw_unicode_escape ' )
33
+ UTF_CHARS = br ' a-z0-9_\u00c0 -\u00d6\u00d8 -\u00f6\u00f8 -\u00ff ' .decode( ' raw_unicode_escape ' )
34
+ SPACES = br ' [\u0020\u00A0\u1680\u180E\u2002 - \u202F\u205F\u2060\u3000 ]' .decode( ' raw_unicode_escape ' )
36
35
37
- LIST_PRE_CHARS = ur ' ( [^ a-z0-9_ ]| ^ ) '
38
- LIST_END_CHARS = ur ' ( [a-z0-9_ ]{1,20} ) ( /[a-z ][a-z0-9 \x80 - \xFF - ]{0,79} ) ? '
36
+ LIST_PRE_CHARS = br ' ( [^ a-z0-9_ ]| ^ ) ' .decode( ' raw_unicode_escape ' )
37
+ LIST_END_CHARS = br ' ( [a-z0-9_ ]{1,20} ) ( /[a-z ][a-z0-9 \x80 - \xFF - ]{0,79} ) ? ' .decode( ' raw_unicode_escape ' )
39
38
LIST_REGEX = re.compile(LIST_PRE_CHARS + ' (' + AT_SIGNS + ' +)' + LIST_END_CHARS , re.IGNORECASE )
40
39
41
- USERNAME_REGEX = re.compile(ur ' \B ' + AT_SIGNS + LIST_END_CHARS , re.IGNORECASE )
42
- REPLY_REGEX = re.compile(ur ' ^ (?: ' + SPACES + ur ' )* ' + AT_SIGNS + ur ' ( [a-z0-9_ ]{1,20} ) . * ' , re.IGNORECASE )
40
+ USERNAME_REGEX = re.compile(br ' \B ' .decode(' raw_unicode_escape' ) + AT_SIGNS + LIST_END_CHARS , re.IGNORECASE )
41
+ REPLY_REGEX = re.compile(
42
+ br ' ^ (?: ' .decode(' raw_unicode_escape' ) +
43
+ SPACES + br ' )* ' .decode(' raw_unicode_escape' ) +
44
+ AT_SIGNS + br ' ( [a-z0-9_ ]{1,20} ) . * ' .decode(' raw_unicode_escape' ),
45
+ re.IGNORECASE
46
+ )
43
47
44
- HASHTAG_EXP = ur ' ( ^ | [^ 0-9A-Z&/ ]+ ) ( #| \uff03 ) ( [0-9A-Z_ ]* [A-Z_ ]+ [%s ]* ) ' % UTF_CHARS
48
+ HASHTAG_EXP = br ' ( ^ | [^ 0-9A-Z&/ ]+ ) ( #| \uff03 ) ( [0-9A-Z_ ]* [A-Z_ ]+ [%s ]* ) ' .decode( ' raw_unicode_escape ' ) % UTF_CHARS
45
49
HASHTAG_REGEX = re.compile(HASHTAG_EXP , re.IGNORECASE )
46
50
47
- PRE_CHARS = ur ' (?: [^ /" \' :!= ]| ^ | \: ) '
48
- DOMAIN_CHARS = ur ' ( [\. - ]| [^ \s _ \!\.\/ ]) + \. [a-z ]{2,} (?: :[0-9 ]+ ) ? '
49
- PATH_CHARS = ur ' (?: [\. , ]? [%s! \*\'\(\) ;:= \+\$ /%s# \[\]\- _,~@ ]) ' % (UTF_CHARS , ' %' )
50
- QUERY_CHARS = ur ' [a-z0-9! \*\'\(\) ;:&= \+\$ /%# \[\]\- _ \. ,~ ]'
51
+ PRE_CHARS = br ' (?: [^ /" \' :!= ]| ^ | \: ) ' .decode( ' raw_unicode_escape ' )
52
+ DOMAIN_CHARS = br ' ( [\. - ]| [^ \s _ \!\.\/ ]) + \. [a-z ]{2,} (?: :[0-9 ]+ ) ? ' .decode( ' raw_unicode_escape ' )
53
+ PATH_CHARS = br ' (?: [\. , ]? [%s! \*\'\(\) ;:= \+\$ /%s# \[\]\- _,~@ ]) ' .decode( ' raw_unicode_escape ' ) % (UTF_CHARS , ' %' )
54
+ QUERY_CHARS = br ' [a-z0-9! \*\'\(\) ;:&= \+\$ /%# \[\]\- _ \. ,~ ]' .decode( ' raw_unicode_escape ' )
51
55
52
56
PATH_ENDING_CHARS = r ' [%s \) =#/ ]' % UTF_CHARS
53
57
QUERY_ENDING_CHARS = ' [a-z0-9_&=#]'
0 commit comments