mirror of https://github.com/explosion/spaCy.git
* Remove hyphenation from the main tokenizer loop: handle it in infix.txt instead. This lets emoticons work.
This commit is contained in:
parent
45ec92243a
commit
98cfd84123
|
@ -76,9 +76,7 @@ cdef class Tokenizer:
|
|||
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
||||
cdef UniStr span
|
||||
for i in range(1, length):
|
||||
# TODO: Allow control of hyphenation
|
||||
if (Py_UNICODE_ISSPACE(chars[i]) or chars[i] == '-') != in_ws:
|
||||
#if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||
if start < i:
|
||||
slice_unicode(&span, chars, start, i)
|
||||
cache_hit = self._try_cache(start, span.key, tokens)
|
||||
|
|
Loading…
Reference in New Issue