mirror of https://github.com/explosion/spaCy.git
* Recognise multiple infixes in a token.
This commit is contained in:
parent 6df3858dbc
commit 04d0209be9
@@ -43,7 +43,7 @@ def test_double_hyphen(en_tokenizer):
     assert tokens[6].text == u'-'
     # TODO: This points to a deeper issue with the tokenizer: it doesn't re-enter
     # on infixes.
-    #assert tokens[7].text == u'bred'
-    #assert tokens[8].text == u'--'
-    #assert tokens[9].text == u'people'
+    assert tokens[7].text == u'bred'
+    assert tokens[8].text == u'--'
+    assert tokens[9].text == u'people'
 
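Note: the assertions above used to be commented out because the old code split at most one infix per token, so everything after the first hyphen stayed fused. A minimal sketch of that old behaviour, assuming a toy infix pattern of single or double hyphens rather than the tokenizer's real infix rules:

import re

toy_infix_re = re.compile(r'--|-')   # hypothetical stand-in for the real infix pattern

def split_first_infix_only(string):
    # Mirrors the old logic: one search(), so at most one split point.
    match = toy_infix_re.search(string)
    if match is None:
        return [string]
    return [string[:match.start()],
            string[match.start():match.end()],
            string[match.end():]]

# The tail after the first hyphen stays glued together, which is why
# tokens[7:] could not be asserted before this commit.
assert split_first_infix_only(u'well-bred--people') == [u'well', u'-', u'bred--people']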
@@ -28,8 +28,9 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules)
-        self._rules = rules
+        self._rules = {}
+        for chunk, substrings in sorted(rules.items()):
+            self.add_special_case(chunk, substrings)
 
     def __reduce__(self):
         args = (self.vocab,
@@ -158,7 +159,8 @@ cdef class Tokenizer:
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
         self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
 
-    cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes,
+    cdef unicode _split_affixes(self, Pool mem, unicode string,
+                                vector[const LexemeC*] *prefixes,
                                 vector[const LexemeC*] *suffixes):
         cdef size_t i
         cdef unicode prefix
@@ -215,20 +217,23 @@ cdef class Tokenizer:
         if string:
             cache_hit = self._try_cache(hash_string(string), tokens)
             if not cache_hit:
-                match = self.find_infix(string)
-                if match is None:
+                matches = self.find_infix(string)
+                if not matches:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
                 else:
-                    split = match.start()
-                    end = match.end()
-                    # Append the beginning, affix, end of the infix span
-                    span = string[:split]
-                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
-
-                    span = string[split:end]
-                    tokens.push_back(self.vocab.get(tokens.mem, span), False)
-
-                    span = string[end:]
+                    # let's say we have dyn-o-mite-dave
+                    # the regex finds the start and end positions of the hyphens
+                    start = 0
+                    for match in matches:
+                        infix_start = match.start()
+                        infix_end = match.end()
+                        span = string[start:infix_start]
+                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+
+                        infix_span = string[infix_start:infix_end]
+                        tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
+
+                        start = infix_end
+                    span = string[start:]
                     tokens.push_back(self.vocab.get(tokens.mem, span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
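A plain-Python sketch of the new splitting loop above, using a toy hyphen pattern in place of the real infix rules and a list in place of the LexemeC vector: each match contributes the span before it plus the infix itself, and the remainder is appended once after the loop.

import re

toy_infix_re = re.compile(r'--|-')   # hypothetical stand-in for the real infix pattern

def split_on_infixes(string):
    matches = list(toy_infix_re.finditer(string))   # like the new find_infix()
    if not matches:
        return [string]
    spans = []
    start = 0
    for match in matches:
        spans.append(string[start:match.start()])        # span before the infix
        spans.append(string[match.start():match.end()])  # the infix itself
        start = match.end()
    spans.append(string[start:])                         # trailing span
    return spans

# "the regex finds the start and end positions of the hyphens"
assert split_on_infixes(u'dyn-o-mite-dave') == [u'dyn', u'-', u'o', u'-', u'mite', u'-', u'dave']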
@@ -251,7 +256,7 @@ cdef class Tokenizer:
         self._cache.set(key, cached)
 
     def find_infix(self, unicode string):
-        return self._infix_re.search(string)
+        return list(self._infix_re.finditer(string))
 
     def find_prefix(self, unicode string):
         match = self._prefix_re.search(string)
@@ -262,21 +267,24 @@ cdef class Tokenizer:
         return (match.end() - match.start()) if match is not None else 0
 
     def _load_special_tokenization(self, special_cases):
-        '''Add a special-case tokenization rule.
+        '''Add special-case tokenization rules.
         '''
-        cdef int i
-        cdef list substrings
-        cdef unicode chunk
-        cdef unicode form
-        cdef unicode lemma
-        cdef dict props
-        cdef LexemeC** lexemes
-        cdef hash_t hashed
         for chunk, substrings in sorted(special_cases.items()):
-            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-            cached.length = len(substrings)
-            cached.is_lex = False
-            cached.data.tokens = self.vocab.make_fused_token(substrings)
-            key = hash_string(chunk)
-            self._specials.set(key, cached)
-            self._cache.set(key, cached)
+            self.add_special_case(chunk, substrings)
+
+    def add_special_case(self, unicode chunk, substrings):
+        '''Add a special-case tokenization rule.
+
+        For instance, "don't" is special-cased to tokenize into
+        ["do", "n't"]. The split tokens can have lemmas and part-of-speech
+        tags.
+        '''
+        substrings = list(substrings)
+        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+        cached.length = len(substrings)
+        cached.is_lex = False
+        cached.data.tokens = self.vocab.make_fused_token(substrings)
+        key = hash_string(chunk)
+        self._specials.set(key, cached)
+        self._cache.set(key, cached)
+        self._rules[chunk] = substrings
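A self-contained, plain-Python sketch of the bookkeeping the new add_special_case performs: precompute the fused analysis, cache it under the hash of the surface string, and keep the raw rule so the rules stay available on the instance. The substring format used below ({'F': form, 'L': lemma}) is an assumption for illustration only; this diff does not specify it.

class ToySpecialCases:
    def __init__(self):
        self._specials = {}   # stands in for the C hash tables _specials / _cache
        self._rules = {}      # raw rules, kept alongside the compiled cache

    def add_special_case(self, chunk, substrings):
        substrings = list(substrings)
        self._specials[hash(chunk)] = substrings   # keyed by hash of the chunk
        self._rules[chunk] = substrings

    def lookup(self, chunk):
        return self._specials.get(hash(chunk))

cases = ToySpecialCases()
cases.add_special_case(u"don't", [{'F': u'do'}, {'F': u"n't", 'L': u'not'}])
assert [s['F'] for s in cases.lookup(u"don't")] == [u'do', u"n't"]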