* Remove hyphenation from main tokenizer loop: do it in infix.txt instead. This lets emoticons work

This commit is contained in:
Matthew Honnibal 2015-06-06 05:57:03 +02:00
parent 45ec92243a
commit 98cfd84123
1 changed file with 1 addition and 3 deletions

View File

@@ -76,9 +76,7 @@ cdef class Tokenizer:
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0]) cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
cdef UniStr span cdef UniStr span
for i in range(1, length): for i in range(1, length):
# TODO: Allow control of hyphenation if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if (Py_UNICODE_ISSPACE(chars[i]) or chars[i] == '-') != in_ws:
#if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i: if start < i:
slice_unicode(&span, chars, start, i) slice_unicode(&span, chars, start, i)
cache_hit = self._try_cache(start, span.key, tokens) cache_hit = self._try_cache(start, span.key, tokens)