mirror of https://github.com/explosion/spaCy.git
* Refactor tokenization, enable cache, and ensure we look up specials correctly even when there's confusing punctuation surrounding the token.
commit 0152831c89 (parent 143e51ec73)
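
The change replaces the ad hoc punctuation handling that used to live in en.pyx with one loop in Language._tokenize: peel prefix and suffix punctuation off a whitespace-delimited chunk, re-check the special-cases table after every peel, then attach the pieces and cache the result. What follows is a minimal pure-Python sketch of that flow, not the module's API; the dicts specials and cache stand in for the C hash tables, and the affix finders are deliberately simplified.

# Sketch only: plain-Python rendering of the new _tokenize control flow.
PREFIX_CHARS = set('([{<$*') | {'"'}
SUFFIX_CHARS = set(')]}>,.!?%:') | {'"', "'"}

def find_prefix(s):
    return 1 if s and s[0] in PREFIX_CHARS else 0

def find_suffix(s):
    return 1 if s and s[-1] in SUFFIX_CHARS else 0

def tokenize_word(string, specials, cache):
    if string in cache:
        return list(cache[string])
    key = string
    prefixes, suffixes = [], []
    last_size = None
    while string and len(string) != last_size:
        last_size = len(string)
        pre_len = find_prefix(string)
        if pre_len:
            prefix, minus_pre = string[:pre_len], string[pre_len:]
            if minus_pre in specials:      # special-case hiding behind a prefix
                prefixes.append(prefix)
                string = minus_pre
                break
        suf_len = find_suffix(string)
        if suf_len:
            suffix, minus_suf = string[-suf_len:], string[:-suf_len]
            if minus_suf in specials:      # special-case ahead of a suffix
                suffixes.append(suffix)
                string = minus_suf
                break
        if pre_len and suf_len and (pre_len + suf_len) <= len(string):
            prefixes.append(prefix)
            suffixes.append(suffix)
            string = string[pre_len:len(string) - suf_len]
        elif pre_len:
            prefixes.append(prefix)
            string = minus_pre
        elif suf_len:
            suffixes.append(suffix)
            string = minus_suf
        if string in specials:
            break
    tokens = list(prefixes)
    if string:
        tokens.extend(specials.get(string, [string]))
    tokens.extend(reversed(suffixes))      # suffixes were peeled outside-in
    cache[key] = tuple(tokens)
    return tokens

specials = {"can't": ["ca", "n't"]}
cache = {}
assert tokenize_word("(can't)", specials, cache) == ['(', 'ca', "n't", ')']
assert tokenize_word("(can't)", specials, cache) == ['(', 'ca', "n't", ')']  # served from the cache
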
spacy/en.pyx (62 lines changed)

@@ -56,67 +56,7 @@ cdef class English(Language):
     name (unicode): The two letter code used by Wikipedia for the language.
     lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
-    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
-        cdef Py_UNICODE c0 = chars[0]
-        cdef Py_UNICODE c1 = chars[1]
-        if c0 == ",":
-            return 1
-        elif c0 == '"':
-            return 1
-        elif c0 == "(":
-            return 1
-        elif c0 == "[":
-            return 1
-        elif c0 == "{":
-            return 1
-        elif c0 == "*":
-            return 1
-        elif c0 == "<":
-            return 1
-        elif c0 == "$":
-            return 1
-        elif c0 == "£":
-            return 1
-        elif c0 == "€":
-            return 1
-        elif c0 == "\u201c":
-            return 1
-        elif c0 == "'":
-            if c1 == "s":
-                return 2
-            elif c1 == "S":
-                return 2
-            elif c1 == "'":
-                return 2
-            else:
-                return 1
-        elif c0 == "`":
-            if c1 == "`":
-                return 2
-            else:
-                return 1
-        else:
-            return 0
-
-abbreviations = set(['U.S', 'u.s', 'U.N', 'Ms', 'Mr', 'P'])
-cdef bint _check_punct(Py_UNICODE* characters, size_t i, size_t length):
-    cdef unicode char_i = characters[i]
-    cdef unicode char_i1 = characters[i+1]
-    # Don't count appostrophes as punct if the next char is a letter
-    if characters[i] == "'" and i < (length - 1) and char_i1.isalpha():
-        return i == 0
-    if characters[i] == "-":
-        return False
-        #and i < (length - 1) and characters[i+1] == '-':
-        #return False
-    # Don't count commas as punct if the next char is a number
-    if characters[i] == "," and i < (length - 1) and char_i1.isdigit():
-        return False
-    if characters[i] == "." and i < (length - 1):
-        return False
-    if characters[i] == "." and characters[:i] in abbreviations:
-        return False
-    return not char_i.isalnum()
+    pass


 EN = English('en', [], [])

@@ -25,7 +25,7 @@ cdef class Lexicon:
     cpdef readonly size_t size

     cpdef Lexeme lookup(self, unicode string)
-    cdef LexemeC* get(self, String* s)
+    cdef LexemeC* get(self, String* s) except NULL

     cdef PointerHash _dict

spacy/lang.pyx (194 lines changed)

@@ -185,7 +185,11 @@ cdef class Language:
            if Py_UNICODE_ISSPACE(c) == 1:
                if start < i:
                    string_from_slice(&span, chars, start, i)
-                   self._tokenize(tokens.v, &span)
+                   try:
+                       self._tokenize(tokens.v, &span)
+                   except MemoryError:
+                       print chars[start:i]
+                       raise
                start = i + 1
            i += 1
        if start < i:

@@ -194,28 +198,61 @@ cdef class Language:
         return tokens

     cdef int _tokenize(self, vector[LexemeC*] *tokens_v, String* string) except -1:
-        self._check_cache(tokens_v, string)
-        if not string.n:
+        cdef size_t i
+        lexemes = <LexemeC**>self.cache.get(string.key)
+        if lexemes != NULL:
+            i = 0
+            while lexemes[i] != NULL:
+                tokens.push_back(lexemes[i])
+                i += 1
             return 0

         cdef uint64_t orig_key = string.key
         cdef size_t orig_size = tokens_v.size()

         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes

-        cdef String affix
-        cdef int split = self._find_prefix(string.chars, string.n)
-        while string.n and split >= 1:
-            string_slice_prefix(string, &affix, split)
-            prefixes.push_back(self.lexicon.get(&affix))
-            split = self._find_prefix(string.chars, string.n)
-
-        split = self._find_suffix(string.chars, string.n)
-        while string.n and split >= 1:
-            string_slice_suffix(string, &affix, split)
-            suffixes.push_back(self.lexicon.get(&affix))
-            split = self._find_suffix(string.chars, string.n)
+        cdef String prefix
+        cdef String suffix
+        cdef String minus_pre
+        cdef String minus_suf
+        cdef size_t last_size = 0
+        while string.n != 0 and string.n != last_size:
+            last_size = string.n
+            pre_len = self._find_prefix(string.chars, string.n)
+            if pre_len != 0:
+                string_from_slice(&prefix, string.chars, 0, pre_len)
+                string_from_slice(&minus_pre, string.chars, pre_len, string.n)
+                # Check whether we've hit a special-case
+                if minus_pre.n >= 1 and self.specials.get(minus_pre.key) != NULL:
+                    string = &minus_pre
+                    prefixes.push_back(self.lexicon.get(&prefix))
+                    break
+            suf_len = self._find_suffix(string.chars, string.n)
+            if suf_len != 0:
+                string_from_slice(&suffix, string.chars, string.n - suf_len, string.n)
+                string_from_slice(&minus_suf, string.chars, 0, string.n - suf_len)
+                # Check whether we've hit a special-case
+                if minus_suf.n >= 1 and self.specials.get(minus_suf.key) != NULL:
+                    string = &minus_suf
+                    suffixes.push_back(self.lexicon.get(&suffix))
+                    break
+
+            if pre_len and suf_len and (pre_len + suf_len) <= string.n:
+                string_from_slice(string, string.chars, pre_len, string.n - suf_len)
+                prefixes.push_back(self.lexicon.get(&prefix))
+                suffixes.push_back(self.lexicon.get(&suffix))
+            elif pre_len:
+                string = &minus_pre
+                prefixes.push_back(self.lexicon.get(&prefix))
+            elif suf_len:
+                string = &minus_suf
+                suffixes.push_back(self.lexicon.get(&suffix))
+
+            if self.specials.get(string.key):
+                break

         self._attach_tokens(tokens_v, string, &prefixes, &suffixes)
         self._save_cached(tokens_v, orig_key, orig_size)

@@ -230,16 +267,23 @@ cdef class Language:
         string.key = 0
         string.chars = NULL


     cdef int _attach_tokens(self, vector[LexemeC*] *tokens, String* string,
                             vector[LexemeC*] *prefixes,
                             vector[LexemeC*] *suffixes) except -1:
+        cdef size_t i
+        cdef LexemeC** lexemes
         cdef LexemeC* lexeme
-        for lexeme in prefixes[0]:
+        for lexeme in deref(prefixes):
             tokens.push_back(lexeme)
-        self._check_cache(tokens, string)
         if string.n != 0:
-            tokens.push_back(self.lexicon.get(string))
+            lexemes = <LexemeC**>self.specials.get(string.key)
+            if lexemes != NULL:
+                i = 0
+                while lexemes[i] != NULL:
+                    tokens.push_back(lexemes[i])
+                    i += 1
+            else:
+                tokens.push_back(self.lexicon.get(string))
         cdef vector[LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             tokens.push_back(deref(it))

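_attach_tokens now consults the specials table for the remaining core string before falling back to a plain lexicon lookup, so a single surface form can expand into several tokens. A small sketch of that step, with a dict of expansions standing in for the NULL-terminated LexemeC** arrays (names here are illustrative):

# Sketch only: the attach step, under the same stand-in assumptions as above.
def attach_tokens(tokens, core, prefixes, suffixes, specials):
    tokens.extend(prefixes)
    if core:
        tokens.extend(specials.get(core, [core]))   # a special expands to several tokens
    tokens.extend(reversed(suffixes))               # suffixes were peeled outside-in
    return tokens

specials = {"can't": ["ca", "n't"]}
out = attach_tokens([], "can't", ['('], [')', '?'], specials)
assert out == ['(', 'ca', "n't", '?', ')']
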
@@ -247,22 +291,100 @@ cdef class Language:

     cdef int _save_cached(self, vector[LexemeC*] *tokens,
                           uint64_t key, size_t n) except -1:
-        pass
-
-    cdef int _find_prefix(self, Py_UNICODE* characters, size_t length):
-        return 0
-
-    cdef int _find_suffix(self, Py_UNICODE* characters, size_t length):
-        if length < 2:
-            return 0
-        cdef unicode string = characters[:length]
-        print repr(string)
-        if string.endswith("'s") or string.endswith("'S"):
-            return 2
-        elif string.endswith("..."):
-            return 3
-        elif not string[-1].isalnum():
-            return 1
+        assert tokens.size() > n
+        lexemes = <LexemeC**>calloc((tokens.size() - n) + 1, sizeof(LexemeC**))
+        cdef size_t i, j
+        for i, j in enumerate(range(n, tokens.size())):
+            lexemes[i] = tokens.at(j)
+        lexemes[i + 1] = NULL
+        self.cache.set(key, lexemes)
+
+    cdef int _find_prefix(self, Py_UNICODE* chars, size_t length) except -1:
+        cdef Py_UNICODE c0 = chars[0]
+        cdef Py_UNICODE c1 = chars[1]
+        if c0 == ",":
+            return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == "(":
+            return 1
+        elif c0 == "[":
+            return 1
+        elif c0 == "{":
+            return 1
+        elif c0 == "*":
+            return 1
+        elif c0 == "<":
+            return 1
+        elif c0 == "$":
+            return 1
+        elif c0 == "£":
+            return 1
+        elif c0 == "€":
+            return 1
+        elif c0 == "\u201c":
+            return 1
+        elif c0 == "'":
+            return 1
+        elif c0 == "`":
+            if c1 == "`":
+                return 2
+            else:
+                return 1
+        else:
+            return 0
+
+    cdef int _find_suffix(self, Py_UNICODE* chars, size_t length):
+        cdef Py_UNICODE c0 = chars[length - 1]
+        cdef Py_UNICODE c1 = chars[length - 2] if length >= 2 else 0
+        cdef Py_UNICODE c2 = chars[length - 3] if length >= 3 else 0
+
+        if c0 == ",":
+            return 1
+        elif c0 == '"':
+            return 1
+        elif c0 == ')':
+            return 1
+        elif c0 == ']':
+            return 1
+        elif c0 == '}':
+            return 1
+        elif c0 == '*':
+            return 1
+        elif c0 == '!':
+            return 1
+        elif c0 == '?':
+            return 1
+        elif c0 == '%':
+            return 1
+        elif c0 == '$':
+            return 1
+        elif c0 == '>':
+            return 1
+        elif c0 == ':':
+            return 1
+        elif c0 == "'":
+            return 1
+        elif c0 == u'\u201d':
+            return 1
+        elif c0 == "s":
+            if c1 == "'":
+                return 2
+            else:
+                return 0
+        elif c0 == "S":
+            if c1 == "'":
+                return 2
+            else:
+                return 0
+        elif c0 == ".":
+            if c1 == ".":
+                if c2 == ".":
+                    return 3
+                else:
+                    return 2
+            else:
+                return 1
         else:
             return 0

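_save_cached stores the lexemes a chunk just produced as a NULL-terminated array under the chunk's original hash key, and the top of _tokenize replays that entry the next time the same chunk appears. A sketch of that contract, assuming a plain dict keyed by the original string in place of the PointerHash; a Python tuple knows its own length, so the NULL sentinel is dropped here:

# Sketch only: the save/replay contract between _save_cached and _tokenize.
def save_cached(cache, tokens, key, orig_size):
    assert len(tokens) > orig_size
    cache[key] = tuple(tokens[orig_size:])   # only the tokens this chunk produced

def replay_cached(cache, tokens, key):
    entry = cache.get(key)
    if entry is None:
        return False
    tokens.extend(entry)
    return True

cache = {}
vector = ['Hello', '(', 'ca', "n't", ')']    # token vector after tokenizing "(can't)"
save_cached(cache, vector, "(can't)", orig_size=1)

fresh = ['Hello']
assert replay_cached(cache, fresh, "(can't)")
assert fresh == ['Hello', '(', 'ca', "n't", ')']
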
@@ -316,7 +438,7 @@ cdef class Lexicon:
         self._dict.set(string.key, lexeme)
         self.size += 1

-    cdef LexemeC* get(self, String* string):
+    cdef LexemeC* get(self, String* string) except NULL:
         cdef LexemeC* lexeme
         lexeme = <LexemeC*>self._dict.get(string.key)
         if lexeme != NULL:

@@ -372,5 +494,3 @@ cdef inline void string_slice_suffix(String* s, String* suffix, size_t n) nogil:
     string_from_slice(suffix, s.chars, s.n - n, s.n)
     s.n -= n
     s.key = hash64(s.chars, s.n * sizeof(Py_UNICODE), 0)
-
-

@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+
+
+from spacy.orth import is_punct
+
+
+def test_comma():
+    assert is_punct(',', 0, {}, {}) == True
+
+
+def test_space():
+    assert is_punct(' ', 0, {}, {}) == False
+
+
+def test_letter():
+    assert is_punct('a', 0, {}, {}) == False

@@ -0,0 +1,14 @@
+from __future__ import unicode_literals
+import pytest
+
+from spacy.en import EN
+
+def test_only_pre1():
+    assert len(EN.tokenize("(")) == 1
+
+
+def test_only_pre2():
+    assert len(EN.tokenize("((")) == 2
+
+def test_only_suf2():
+    assert len(EN.tokenize("''")) == 2

@@ -0,0 +1,45 @@
+"""Test entries in the tokenization special-case interacting with prefix
+and suffix punctuation."""
+from __future__ import unicode_literals
+import pytest
+
+from spacy.en import EN
+
+def test_no_special():
+    assert len(EN.tokenize("(can)")) == 3
+
+def test_no_punct():
+    assert len(EN.tokenize("can't")) == 2
+
+def test_prefix():
+    assert len(EN.tokenize("(can't")) == 3
+
+
+def test_suffix():
+    assert len(EN.tokenize("can't)")) == 3
+
+
+def test_wrap():
+    assert len(EN.tokenize("(can't)")) == 4
+
+
+def test_uneven_wrap():
+    assert len(EN.tokenize("(can't?)")) == 5
+
+
+def test_prefix_interact():
+    assert len(EN.tokenize("U.S.")) == 1
+    assert len(EN.tokenize("us.")) == 2
+    assert len(EN.tokenize("(U.S.")) == 2
+
+
+def test_suffix_interact():
+    assert len(EN.tokenize("U.S.)")) == 2
+
+
+def test_even_wrap_interact():
+    assert len(EN.tokenize("(U.S.)")) == 3
+
+
+def test_uneven_wrap_interact():
+    assert len(EN.tokenize("(U.S.?)")) == 4
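
Only the token counts above are fixed by the suite; the surface pieces behind each count are not asserted. A hedged reading of the presumed splits, written as a parametrized sketch (the pieces listed are assumptions, the lengths are taken from the tests):

import pytest

from spacy.en import EN

PRESUMED_SPLITS = [
    ("can't", ["ca", "n't"]),
    ("(can't)", ["(", "ca", "n't", ")"]),
    ("(can't?)", ["(", "ca", "n't", "?", ")"]),
    ("(U.S.", ["(", "U.S."]),
    ("(U.S.)", ["(", "U.S.", ")"]),
]

@pytest.mark.parametrize("string,pieces", PRESUMED_SPLITS)
def test_presumed_piece_counts(string, pieces):
    # Only the count is guaranteed by the suite; the pieces are a reading of it.
    assert len(EN.tokenize(string)) == len(pieces)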