mirror of https://github.com/explosion/spaCy.git
* Fix bug with trailing punctuation on contractions. The fix reduces efficiency, and the implementation is slightly hacky.
parent 7eab281194
commit 985bc68327
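The diff below swaps the C-level dense_hash_map/LexList cache for a plain Python dict and caches lexemes per candidate substring rather than only per whole whitespace-delimited chunk, which is what lets a pre-loaded contraction split still be found when trailing punctuation sits in the same chunk. The following is a rough pure-Python sketch of that idea only: the real code hashes raw Py_UNICODE spans with hash64 and stores LexemeC* addresses, and the special-case entries and split_one heuristics shown here are illustrative stand-ins, not the actual language data or English._split_one logic.

    # Rough sketch only: the real cache maps hash64() values of raw Py_UNICODE
    # spans to lists of LexemeC* addresses, and the entries below are made up.
    SPECIAL_CASES = {"can't": ["can", "not"], "isn't": ["is", "not"]}
    cache = dict(SPECIAL_CASES)             # substring -> cached token split

    def split_one(chunk):
        # Illustrative stand-in for English._split_one: peel one piece off the front.
        if chunk in SPECIAL_CASES or len(chunk) == 1:
            return len(chunk)
        if not chunk[0].isalnum():          # leading punctuation, e.g. "(" or "`"
            return 1
        for i, c in enumerate(chunk):
            if not c.isalnum() and c != "'":
                return i                    # stop before trailing punctuation
        return len(chunk)

    def tokenize_chunk(chunk):
        if chunk in cache:                  # whole-chunk hit, e.g. a bare "can't"
            return list(cache[chunk])
        tokens = []
        start = 0
        while start < len(chunk):
            split = split_one(chunk[start:])
            piece = chunk[start:start + split]
            # Each piece is looked up in the same cache, so the special-cased
            # contraction is still found inside "(can't" or "can't!".
            tokens.extend(cache.get(piece, [piece]))
            start += split
        return tokens

    assert tokenize_chunk("can't!") == ["can", "not", "!"]
    assert tokenize_chunk("(can't") == ["(", "can", "not"]

The cost of the simpler scheme is Python dict and list traffic on every uncached substring, which is presumably the reduced efficiency the commit message notes.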
@@ -40,3 +40,5 @@ cdef class EnglishTokens(Tokens):
 
 cdef class English(Language):
+    cdef int _split_one(self, Py_UNICODE* characters, size_t length)
+
 
@@ -236,7 +236,7 @@ cdef class English(Language):
     fl_is_digit = Flag_IsDigit
     v_shape = View_WordShape
     def __cinit__(self, name, user_string_features, user_flag_features):
-        self.cache.set_empty_key(0)
+        self.cache = {}
         lang_data = util.read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = lang.Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -9,6 +9,11 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uint64_t, int64_t
 
 
+cdef extern from "Python.h":
+    cdef bint Py_UNICODE_ISSPACE(Py_UNICODE ch)
+    cdef bint Py_UNICODE_ISALNUM(Py_UNICODE ch)
+
+
 cdef extern from "sparsehash/dense_hash_map" namespace "google":
     cdef cppclass dense_hash_map[K, D]:
         K& key_type
@@ -52,10 +57,6 @@ cdef extern from "sparsehash/dense_hash_map" namespace "google":
         D& operator[](K&) nogil
 
 
-cdef struct LexList:
-    LexemeC* lex
-    LexList* tail
-
 cdef class Lexicon:
     cpdef readonly size_t size
 
@@ -70,13 +71,12 @@ cdef class Lexicon:
 
 cdef class Language:
     cdef unicode name
-    cdef dense_hash_map[uint64_t, size_t] cache
-    cdef size_t cache_size
+    cdef dict cache
     cpdef readonly Lexicon lexicon
     cpdef readonly object tokens_class
 
     cpdef Tokens tokenize(self, unicode text)
     cpdef Lexeme lookup(self, unicode text)
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length)
     cdef int _split_one(self, Py_UNICODE* characters, size_t length)
@@ -40,8 +40,7 @@ cdef class Language:
         if string_features is None:
             string_features = []
         self.name = name
-        self.cache.set_empty_key(0)
-        self.cache_size = 0
+        self.cache = {}
        lang_data = read_lang_data(name)
         rules, words, probs, clusters, case_stats, tag_stats = lang_data
         self.lexicon = Lexicon(words, probs, clusters, case_stats, tag_stats,
@@ -80,7 +79,6 @@ cdef class Language:
         Returns:
             tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
         """
-        print repr(string)
         cdef size_t length = len(string)
         cdef Tokens tokens = self.tokens_class(length)
         if length == 0:
@@ -92,7 +90,7 @@ cdef class Language:
         cdef Py_UNICODE c
         for i in range(length):
             c = characters[i]
-            if c == ' ' or c == '\n' or c == '\t':
+            if Py_UNICODE_ISSPACE(c) == 1:
                 if start < i:
                     self._tokenize(tokens, &characters[start], i - start)
                 start = i + 1
@@ -101,38 +99,30 @@ cdef class Language:
             self._tokenize(tokens, &characters[start], i - start)
         return tokens
 
-    cdef int _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length) except -1:
+    cdef _tokenize(self, Tokens tokens, Py_UNICODE* characters, size_t length):
+        cdef list lexemes
+        cdef size_t lex_addr
         cdef uint64_t hashed = hash64(characters, length * sizeof(Py_UNICODE), 0)
-        cdef LexList* node = <LexList*>self.cache[hashed]
-        if node is not NULL:
-            while node != NULL:
-                tokens.push_back(node.lex)
-                node = node.tail
+        if hashed in self.cache:
+            for lex_addr in self.cache[hashed]:
+                tokens.push_back(<LexemeC*>lex_addr)
             return 0
-
-        node = <LexList*>calloc(1, sizeof(LexList))
-        self.cache[hashed] = <size_t>node
+
+        lexemes = []
         cdef size_t start = 0
         cdef size_t split = 0
         while start < length:
             split = self._split_one(&characters[start], length - start)
-            node.lex = <LexemeC*>self.lexicon.get(&characters[start], split)
-            tokens.push_back(node.lex)
-            start += split
-            if start >= length:
-                break
-            hashed = hash64(&characters[start], (length - start) * sizeof(Py_UNICODE), 0)
-            node.tail = <LexList*>self.cache[hashed]
-            if node.tail == NULL:
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                self.cache[hashed] = <size_t>node.tail
-                node = node.tail
+            hashed = hash64(&characters[start], split * sizeof(Py_UNICODE), 0)
+            if hashed in self.cache:
+                lexemes.extend(self.cache[hashed])
             else:
-                node = node.tail
-                while node != NULL:
-                    tokens.push_back(node.lex)
-                    node = node.tail
-                break
+                lexeme = <LexemeC*>self.lexicon.get(&characters[start], split)
+                lexemes.append(<size_t>lexeme)
+            start += split
+        for lex_addr in lexemes:
+            tokens.push_back(<LexemeC*>lex_addr)
+        #self.cache[hashed] = lexemes
 
     cdef int _split_one(self, Py_UNICODE* characters, size_t length):
         return length
@@ -149,17 +139,14 @@ cdef class Language:
             token_rules (list): A list of (chunk, tokens) pairs, where chunk is
                 a string and tokens is a list of strings.
         '''
-        cdef LexList* node
+        cdef list lexemes
         cdef uint64_t hashed
         for string, substrings in token_rules:
             hashed = hash64(<Py_UNICODE*>string, len(string) * sizeof(Py_UNICODE), 0)
-            node = <LexList*>calloc(1, sizeof(LexList))
-            self.cache[hashed] = <size_t>node
-            for substring in substrings[:-1]:
-                node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substring, len(substring))
-                node.tail = <LexList*>calloc(1, sizeof(LexList))
-                node = node.tail
-            node.lex = <LexemeC*>self.lexicon.get(<Py_UNICODE*>substrings[-1], len(substrings[-1]))
+            lexemes = []
+            for substring in substrings:
+                lexemes.append(self.lexicon.get(<Py_UNICODE*>substring, len(substring)))
+            self.cache[hashed] = lexemes
 
 
 cdef class Lexicon:
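As an aside, the token_rules argument described in the load_tokenization docstring above is just a list of (chunk, tokens) pairs. The entries below are hypothetical examples of that shape, echoing the contractions exercised in the tests; the actual rules come from the per-language data returned by read_lang_data and are not part of this diff.

    # Hypothetical rules in the (chunk, tokens) shape that load_tokenization expects.
    token_rules = [
        ("can't", ["can", "not"]),
        ("isn't", ["is", "not"]),
        ("don't", ["do", "not"]),
    ]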
@@ -26,6 +26,8 @@ def test_punct():
 
 def test_digits():
     lex_ids = EN.tokenize('The year: 1984.')
+    assert lex_ids.string(4) == "."
+    assert lex_ids.string(3) == "1984"
     assert len(lex_ids) == 5
     assert lex_ids[0].string == EN.lexicon.lookup('The').string
     assert lex_ids[3].string == EN.lexicon.lookup('1984').string
@@ -37,5 +39,17 @@ def test_contraction():
     assert len(lex_ids) == 3
     assert lex_ids[1].string == EN.lexicon.lookup("not").string
     lex_ids = EN.tokenize("i said don't!")
-    assert len(lex_ids) == 4
-    assert lex_ids[3].string == EN.lexicon.lookup('!').string
+    assert len(lex_ids) == 5
+    assert lex_ids[4].string == EN.lexicon.lookup('!').string
+
+
+def test_contraction_punct():
+    tokens = EN.tokenize("(can't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize("`ain't")
+    assert len(tokens) == 3
+    tokens = EN.tokenize('''"isn't''')
+    assert len(tokens) == 3
+    tokens = EN.tokenize("can't!")
+    assert len(tokens) == 3
+