mirror of https://github.com/explosion/spaCy.git
Fix #368: Tokenizer handled pattern 'unicode close quote, period' incorrectly.
This commit is contained in:
parent
ab952b4756
commit
737816e86e
|
@@ -102,8 +102,8 @@ TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... .
|
|||
|
||||
|
||||
TOKENIZER_SUFFIXES = (r''', \" \) \] \} \* \! \? % \$ > : ; ' ” '' 's 'S ’s ’S ’'''
|
||||
'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]"'%\)])\. '''
|
||||
'''(?<=[0-9])km''').strip().split()
|
||||
r'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]”"'%\)])\. '''
|
||||
r'''(?<=[0-9])km''').strip().split()
|
||||
|
||||
|
||||
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
|
||||
|
|
Loading…
Reference in New Issue