mirror of https://github.com/explosion/spaCy.git
Fix #368: Tokenizer handled pattern 'unicode close quote, period' incorrectly.
This commit is contained in:
parent
ab952b4756
commit
737816e86e
|
@@ -102,8 +102,8 @@ TOKENIZER_PREFIXES = r''', " ( [ { * < $ £ “ ' `` ` # US$ C$ A$ a- ‘ .... .
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_SUFFIXES = (r''', \" \) \] \} \* \! \? % \$ > : ; ' ” '' 's 'S ’s ’S ’'''
|
TOKENIZER_SUFFIXES = (r''', \" \) \] \} \* \! \? % \$ > : ; ' ” '' 's 'S ’s ’S ’'''
|
||||||
'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]"'%\)])\. '''
|
r'''\.\. \.\.\. \.\.\.\. (?<=[a-z0-9)\]”"'%\)])\. '''
|
||||||
'''(?<=[0-9])km''').strip().split()
|
r'''(?<=[0-9])km''').strip().split()
|
||||||
|
|
||||||
|
|
||||||
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
|
TOKENIZER_INFIXES = (r'''\.\.\.+ (?<=[a-z])\.(?=[A-Z]) (?<=[a-zA-Z])-(?=[a-zA-z]) '''
|
||||||
|
|
Loading…
Reference in New Issue