mirror of https://github.com/explosion/spaCy.git
* Fixes to tokenization. Now segment sequences of the same punctuation.
parent e98e97d483
commit 72159e7011

spacy/en.pyx | 30
@@ -154,23 +154,23 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,

 cdef size_t _find_split(unicode word, size_t length):
-    cdef int i = 0
-    # Contractions
-    if word == "'s":
-        return 2
-    # Leading punctuation
-    if is_punct(word, 0, length):
-        return 1
-    elif length >= 1 and is_punct(word, length - 1, length):
-        # Split off all trailing punctuation characters
-        i = length - 1
-        while i >= 2 and is_punct(word, i-1, length):
-            i -= 1
+    cdef size_t i = 0
+    if not is_punct(word, 0, length):
+        while i < length and not is_punct(word, i, length):
+            i += 1
     else:
-        # Doesn't start or end with the punct
-        while i < length and not is_punct(word, i, length):
-            i += 1
+        # Split off a punctuation character, or a sequence of the same punctuation character
+        while i < length and is_punct(word, i, length) and (i == 0 or word[i-1] == word[i]):
+            i += 1
     return i


 cdef bint is_punct(unicode word, size_t i, size_t length):
-    return not word[i].isalnum()
+    if word[i] == "'":
+        if i >= (length - 1):
+            return True
+        elif word[i + 1] == 's' and i != 0:
+            return True
+        else:
+            return False
+    else:
+        return not word[i].isalnum()
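For readability, here is a pure-Python sketch of the two functions as they stand after this commit. The bodies mirror the cdef functions in the diff, but the typed Cython declarations are dropped, find_split takes the word alone, and the example calls at the bottom are illustrative additions, not part of the commit.

def is_punct(word, i, length):
    # An apostrophe counts as punctuation when it ends the word, or when
    # it introduces a non-word-initial 's (possessives and contractions).
    if word[i] == "'":
        if i >= (length - 1):
            return True
        elif word[i + 1] == 's' and i != 0:
            return True
        else:
            return False
    else:
        return not word[i].isalnum()

def find_split(word):
    length = len(word)
    i = 0
    if not is_punct(word, 0, length):
        # Consume the leading run of non-punctuation characters.
        while i < length and not is_punct(word, i, length):
            i += 1
    else:
        # Consume one punctuation character, or a run of the same
        # character, so repeated punctuation stays together.
        while i < length and is_punct(word, i, length) and (i == 0 or word[i - 1] == word[i]):
            i += 1
    return i

print(find_split("!!!"))    # 3: a run of identical punctuation is one segment
print(find_split("?!"))     # 1: mixed punctuation still splits one mark at a time
print(find_split("dog's"))  # 3: the apostrophe of 's is now treated as punctuation
print(find_split("don't"))  # 5: a mid-word apostrophe not followed by 's' is not

The (i == 0 or word[i-1] == word[i]) guard is what implements the commit message: after the first character, the loop only continues while each punctuation character repeats the previous one, so "!!!" comes off as a single piece while "?!" still splits mark by mark.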