From e98e97d4839418337f6c48a56acd89d80cbf191a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 6 Jul 2014 18:35:55 +0200
Subject: [PATCH] * Possessive test passing

---
 spacy/en.pyx | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index cafcc7795..00f8cab04 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -155,11 +155,35 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
 
 cdef size_t _find_split(unicode word, size_t length):
+    # Return the index at which `word` should first be split.
+    #
+    # If the word starts with a non-punctuation character (per is_punct), the
+    # token runs up to the first punctuation character.  Otherwise the token
+    # is one punctuation character, or a run of that same character.
     cdef size_t i = 0
+    if length == 0:
+        # Nothing to inspect; return 0 rather than indexing word[0].
+        return 0
-    if word[0].isalnum():
-        while i < length and word[i].isalnum():
+    if not is_punct(word, 0, length):
+        while i < length and not is_punct(word, i, length):
             i += 1
     else:
         # Split off a punctuation character, or a sequence of the same punctuation character
-        while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]):
+        while i < length and is_punct(word, i, length) and (i == 0 or word[i-1] == word[i]):
             i += 1
     return i
+
+cdef bint is_punct(unicode word, size_t i, size_t length):
+    # Return True if word[i] counts as punctuation for splitting.
+    #
+    # An apostrophe is punctuation only when it is the last character, or when
+    # it is followed by 's' and is not the first character.  Any other
+    # character is punctuation when it is not alphanumeric.
+    #
+    # word[i] is indexed unconditionally, so i must be a valid index.
+    if word[i] != "'":
+        return not word[i].isalnum()
+    # Compare with `i + 1 >= length` instead of `i >= length - 1`: both are
+    # size_t, so `length - 1` would wrap around if length were 0.
+    if i + 1 >= length:
+        return True
+    return i != 0 and word[i + 1] == 's'