mirror of https://github.com/explosion/spaCy.git
* Hack out morphology stuff from tokenizer, while morphology being reimplemented.
parent b4faf551f5
commit 119c0f8c3f
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 
 from .typedefs cimport hash_t
-from .structs cimport LexemeC, TokenC, Morphology
+from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, _Cached
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 
-from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
 cimport cython
 
@@ -29,7 +28,7 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, self.vocab.pos_tags)
+        self._load_special_tokenization(rules)
 
     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
@@ -242,7 +241,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules, object tag_map):
+    def _load_special_tokenization(self, special_cases):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
@@ -253,25 +252,15 @@ cdef class Tokenizer:
         cdef dict props
         cdef LexemeC** lexemes
         cdef hash_t hashed
-        for chunk, substrings in sorted(rules.items()):
+        for chunk, substrings in sorted(special_cases.items()):
             tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
             for i, props in enumerate(substrings):
                 form = props['F']
-                lemma = props.get("L", None)
                 tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                if lemma is not None:
-                    tokens[i].lemma = self.vocab.strings[lemma]
-                else:
-                    tokens[i].lemma = 0
-                if 'pos' in props:
-                    tokens[i].tag = self.vocab.strings[props['pos']]
-                    tokens[i].pos = tag_map[props['pos']][0]
-                    # These are defaults, which can be over-ridden by the
-                    # token-specific props.
-                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
-                if tokens[i].lemma == 0:
-                    tokens[i].lemma = tokens[i].lex.orth
-                set_morph_from_dict(&tokens[i].morph, props)
+                lemma = props.get('L', form)
+                tokens[i].lemma = self.vocab.strings[lemma]
+                #TODO
+                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
@@ -279,3 +268,23 @@ cdef class Tokenizer:
             hashed = hash_string(chunk)
             self._specials.set(hashed, cached)
             self._cache.set(hashed, cached)
+
+
+            #if lemma is not None:
+            #    tokens[i].lemma = self.vocab.strings[lemma]
+            #else:
+            #    tokens[i].lemma = 0
+            #if 'pos' in props:
+            #    inflection = self.vocab.morphology.get(props['pos'])
+            #    inflection.assign(&tokens[i])
+            #    # These are defaults, which can be over-ridden by the
+            #    # token-specific props.
+            #    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
+            #    #tokens[i].pos = pos
+            #    ## These are defaults, which can be over-ridden by the
+            #    ## token-specific props.
+            #    #set_morph_from_dict(&tokens[i].morph, morph_features)
+            #    #if tokens[i].lemma == 0:
+            #    #    tokens[i].lemma = tokens[i].lex.orth
+            ##set_morph_from_dict(&tokens[i].morph, props)
+
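
For reference, the special-case rules consumed by the trimmed-down _load_special_tokenization(special_cases) map a chunk string to a list of per-token property dicts, where 'F' is the surface form and 'L' an optional lemma. The pure-Python sketch below mirrors only what the new code path keeps: the lemma now falls back to the form, and tag/morphology assignment is deferred until the morphology component is reimplemented. The function name and the DummyStrings stand-in are illustrative, not spaCy API; only the 'F'/'L' keys and the fallback behaviour come from the diff above.

# Pure-Python sketch of the trimmed special-case handling (illustrative only).
def load_special_cases(special_cases, string_store):
    """Pre-tokenize special-case chunks; the lemma defaults to the form."""
    cache = {}
    for chunk, substrings in sorted(special_cases.items()):
        tokens = []
        for props in substrings:
            form = props['F']                 # surface form, required
            lemma = props.get('L', form)      # lemma now falls back to the form
            tokens.append({
                'orth': string_store[form],
                'lemma': string_store[lemma],
                # No tag/POS/morphology assignment here any more; that is left
                # to the vocab's morphology component once it is reimplemented.
            })
        cache[chunk] = tokens
    return cache

if __name__ == '__main__':
    # Minimal stand-in for a StringStore: interns strings as integer IDs.
    class DummyStrings(dict):
        def __missing__(self, key):
            self[key] = len(self) + 1
            return self[key]

    rules = {"don't": [{'F': 'do'}, {'F': "n't", 'L': 'not'}]}
    print(load_special_cases(rules, DummyStrings()))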