mirror of https://github.com/explosion/spaCy.git
* Remove hyphenation from the main tokenizer loop; handle it in infix.txt instead. This lets emoticons work.
This commit is contained in:
parent
45ec92243a
commit
98cfd84123
|
@ -76,9 +76,7 @@ cdef class Tokenizer:
|
||||||
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
|
||||||
cdef UniStr span
|
cdef UniStr span
|
||||||
for i in range(1, length):
|
for i in range(1, length):
|
||||||
# TODO: Allow control of hyphenation
|
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
||||||
if (Py_UNICODE_ISSPACE(chars[i]) or chars[i] == '-') != in_ws:
|
|
||||||
#if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
|
|
||||||
if start < i:
|
if start < i:
|
||||||
slice_unicode(&span, chars, start, i)
|
slice_unicode(&span, chars, start, i)
|
||||||
cache_hit = self._try_cache(start, span.key, tokens)
|
cache_hit = self._try_cache(start, span.key, tokens)
|
||||||
|
|
Loading…
Reference in New Issue