* Possessive test passing

This commit is contained in:
Matthew Honnibal 2014-07-06 18:35:55 +02:00
parent 556f6a18ca
commit e98e97d483
1 changed file with 14 additions and 3 deletions

View File

@@ -155,11 +155,22 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
# Return the index at which `word` should be split: `i` starts at 0 and is
# advanced either across a leading run of non-punctuation characters, or
# across a leading run of one repeated punctuation character.
# NOTE(review): this hunk is shown as a diff whose +/- markers and indentation
# were lost, so replaced and replacement lines sit next to each other.
# Presumably the `isalnum()` conditions are the removed version and the
# `is_punct()` conditions are the added version -- confirm against the commit.
cdef size_t _find_split(unicode word, size_t length):
cdef size_t i = 0
# Branch on the first character: word-like prefix vs. punctuation prefix.
if word[0].isalnum():
while i < length and word[i].isalnum():
if not is_punct(word, 0, length):
# Consume characters until the first one that is_punct() reports as punctuation.
while i < length and not is_punct(word, i, length):
i += 1
else:
# Split off a punctuation character, or a sequence of the same punctuation character
while i < length and not word[i].isalnum() and (i == 0 or word[i-1] == word[i]):
# The `i == 0 or word[i-1] == word[i]` clause stops the run as soon as the
# punctuation character changes.
while i < length and is_punct(word, i, length) and (i == 0 or word[i-1] == word[i]):
i += 1
return i
cdef bint is_punct(unicode word, size_t i, size_t length):
    """Decide whether the character at position `i` of `word` counts as punctuation.

    Any character other than an apostrophe is punctuation exactly when it is
    not alphanumeric.  An apostrophe is punctuation only when it is the last
    character of the word, or when it is followed by 's' and is not the first
    character; every other apostrophe is treated as part of the word.
    """
    if word[i] != "'":
        return not word[i].isalnum()
    # Apostrophe in final position.
    if i >= (length - 1):
        return True
    # Apostrophe followed by 's', but not at the very start of the word.
    return word[i + 1] == 's' and i != 0