From 98cfd84123b21859e7087c7551df5946fde57126 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 6 Jun 2015 05:57:03 +0200 Subject: [PATCH] * Remove hyphenation from main tokenizer loop: do it in infix.txt instead. This lets emoticons work --- spacy/tokenizer.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 26aa7f0fa..7a1231a07 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -76,9 +76,7 @@ cdef class Tokenizer: cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0]) cdef UniStr span for i in range(1, length): - # TODO: Allow control of hyphenation - if (Py_UNICODE_ISSPACE(chars[i]) or chars[i] == '-') != in_ws: - #if Py_UNICODE_ISSPACE(chars[i]) != in_ws: + if Py_UNICODE_ISSPACE(chars[i]) != in_ws: if start < i: slice_unicode(&span, chars, start, i) cache_hit = self._try_cache(start, span.key, tokens)