mirror of https://github.com/explosion/spaCy.git
* Hack out morphology stuff from the tokenizer while morphology is being reimplemented.
This commit is contained in:
parent b4faf551f5
commit 119c0f8c3f
@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 
 from .typedefs cimport hash_t
-from .structs cimport LexemeC, TokenC, Morphology
+from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, _Cached
@@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 
-from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
 cimport cython
 
@@ -29,7 +28,7 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, self.vocab.pos_tags)
+        self._load_special_tokenization(rules)
 
     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
@@ -242,7 +241,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules, object tag_map):
+    def _load_special_tokenization(self, special_cases):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
@@ -253,25 +252,15 @@ cdef class Tokenizer:
         cdef dict props
         cdef LexemeC** lexemes
         cdef hash_t hashed
-        for chunk, substrings in sorted(rules.items()):
+        for chunk, substrings in sorted(special_cases.items()):
             tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
             for i, props in enumerate(substrings):
                 form = props['F']
-                lemma = props.get("L", None)
+                lemma = props.get('L', form)
                 tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                if lemma is not None:
-                    tokens[i].lemma = self.vocab.strings[lemma]
-                else:
-                    tokens[i].lemma = 0
-                if 'pos' in props:
-                    tokens[i].tag = self.vocab.strings[props['pos']]
-                    tokens[i].pos = tag_map[props['pos']][0]
-                    # These are defaults, which can be over-ridden by the
-                    # token-specific props.
-                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
-                if tokens[i].lemma == 0:
-                    tokens[i].lemma = tokens[i].lex.orth
-                set_morph_from_dict(&tokens[i].morph, props)
+                tokens[i].lemma = self.vocab.strings[lemma]
+                #TODO
+                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
@@ -279,3 +268,23 @@ cdef class Tokenizer:
         hashed = hash_string(chunk)
         self._specials.set(hashed, cached)
         self._cache.set(hashed, cached)
+
+
+#if lemma is not None:
+#    tokens[i].lemma = self.vocab.strings[lemma]
+#else:
+#    tokens[i].lemma = 0
+#if 'pos' in props:
+#    inflection = self.vocab.morphology.get(props['pos'])
+#    inflection.assign(&tokens[i])
+#    # These are defaults, which can be over-ridden by the
+#    # token-specific props.
+#    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
+#    #tokens[i].pos = pos
+#    ## These are defaults, which can be over-ridden by the
+#    ## token-specific props.
+#    #set_morph_from_dict(&tokens[i].morph, morph_features)
+#    #if tokens[i].lemma == 0:
+#    #    tokens[i].lemma = tokens[i].lex.orth
+##set_morph_from_dict(&tokens[i].morph, props)
+
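The shape of the special-case rules can be read straight off the new loop: each chunk maps to a list of per-token property dicts, where 'F' is the surface form and the optional 'L' is the lemma, which now falls back to the form itself. Below is a minimal pure-Python sketch of that rule format and fallback logic; the resolve_special_case helper is hypothetical, for illustration only, and the real implementation is the Cython loop in the diff above.

# Sketch of the rule format consumed by _load_special_tokenization
# after this commit. The 'F'/'L' keys match the diff; the helper
# below is hypothetical and only mirrors the lemma-fallback logic.
special_cases = {
    "don't": [
        {'F': "do"},               # lemma omitted: falls back to the form
        {'F': "n't", 'L': "not"},  # explicit lemma
    ],
}

def resolve_special_case(special_cases):
    # Mirrors the new code path: lemma = props.get('L', form)
    resolved = {}
    for chunk, substrings in sorted(special_cases.items()):
        resolved[chunk] = [
            (props['F'], props.get('L', props['F']))
            for props in substrings
        ]
    return resolved

print(resolve_special_case(special_cases))
# {"don't": [('do', 'do'), ("n't", 'not')]}

Note that with the morphology hooks cut out, the special-case path now only sets each token's lex and lemma; tag, pos, and morph assignment are deferred to the reimplemented morphology module, as the #TODO in the diff indicates.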