Fix #368: Tokenizer handled pattern 'unicode close quote, period' incorrectly.

This commit is contained in:
Matthew Honnibal 2016-11-04 15:16:20 +01:00
parent ab952b4756
commit 737816e86e
1 changed file with 2 additions and 2 deletions

View File

@@ -102,8 +102,8 @@ TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- .... .
TOKENIZER_SUFFIXES = (r''', \" \) \] \} \* \! \? % \$ > : ; ''' 's 'S s S '''
'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]"'%\)])\. '''
'''(?<=[0-9])km''').strip().split()
r'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]"'%\)])\. '''
r'''(?<=[0-9])km''').strip().split()
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''