mirror of https://github.com/explosion/spaCy.git
* Possessive test passing
This commit is contained in:
parent
556f6a18ca
commit
e98e97d483
17
spacy/en.pyx
17
spacy/en.pyx
|
@ -155,11 +155,22 @@ cdef Lexeme* _init_lexeme(unicode string, StringHash hashed,
|
|||
|
||||
cdef size_t _find_split(unicode word, size_t length):
    """Return how many leading characters of `word` form the next token.

    `length` is the number of characters of `word` to consider. Characters
    are classified by `is_punct`, so an apostrophe only counts as
    punctuation where `is_punct` says it does (for example before a
    possessive "s").

    * If the first character is not punctuation, the result is the index of
      the first punctuation character (or `length` if there is none),
      e.g. "Mike's" -> 4.
    * Otherwise the result is the length of the leading run of one repeated
      punctuation character, e.g. "!!!abc" -> 3 and "?!" -> 1.

    Returns 0 for an empty input.
    """
    cdef size_t i = 0
    # Guard: nothing to split, and word[0] below must not be indexed.
    if length == 0:
        return 0
    if not is_punct(word, 0, length):
        # Consume the word-like prefix up to the first punctuation character.
        while i < length and not is_punct(word, i, length):
            i += 1
    else:
        # Split off a punctuation character, or a sequence of the same punctuation character
        while i < length and is_punct(word, i, length) and (i == 0 or word[i-1] == word[i]):
            i += 1
    return i
|
||||
|
||||
cdef bint is_punct(unicode word, size_t i, size_t length):
    """Tell whether the character at index `i` of `word` acts as punctuation.

    Any character other than an apostrophe is punctuation exactly when it
    is not alphanumeric. An apostrophe is punctuation only when it is the
    final character (index `length - 1` or beyond), or when it is not the
    first character and is immediately followed by "s"; every other
    apostrophe is treated as part of the word.
    """
    # Common case first: anything that is not an apostrophe.
    if word[i] != "'":
        return not word[i].isalnum()
    # Trailing apostrophe, e.g. the one in "dogs'".
    if i >= (length - 1):
        return True
    # Non-initial apostrophe followed by "s", e.g. the one in "Mike's".
    return word[i + 1] == 's' and i != 0
|
||||
|
|
Loading…
Reference in New Issue