diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index a7f69c5aa..19b8aa026 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport LexemeC, TokenC, Morphology +from .structs cimport LexemeC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc from .vocab cimport Vocab, _Cached diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1e857aefc..38daf1c5a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE from cymem.cymem cimport Pool from preshed.maps cimport PreshMap -from .morphology cimport set_morph_from_dict from .strings cimport hash_string cimport cython @@ -29,7 +28,7 @@ cdef class Tokenizer: self._suffix_re = suffix_re self._infix_re = infix_re self.vocab = vocab - self._load_special_tokenization(rules, self.vocab.pos_tags) + self._load_special_tokenization(rules) @classmethod def from_dir(cls, Vocab vocab, data_dir): @@ -242,7 +241,7 @@ cdef class Tokenizer: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, object rules, object tag_map): + def _load_special_tokenization(self, special_cases): '''Add a special-case tokenization rule. ''' cdef int i @@ -253,25 +252,15 @@ cdef class Tokenizer: cdef dict props cdef LexemeC** lexemes cdef hash_t hashed - for chunk, substrings in sorted(rules.items()): + for chunk, substrings in sorted(special_cases.items()): tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) for i, props in enumerate(substrings): form = props['F'] - lemma = props.get("L", None) tokens[i].lex = self.vocab.get(self.vocab.mem, form) - if lemma is not None: - tokens[i].lemma = self.vocab.strings[lemma] - else: - tokens[i].lemma = 0 - if 'pos' in props: - tokens[i].tag = self.vocab.strings[props['pos']] - tokens[i].pos = tag_map[props['pos']][0] - # These are defaults, which can be over-ridden by the - # token-specific props. - set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1]) - if tokens[i].lemma == 0: - tokens[i].lemma = tokens[i].lex.orth - set_morph_from_dict(&tokens[i].morph, props) + lemma = props.get('L', form) + tokens[i].lemma = self.vocab.strings[lemma] + #TODO + #self.vocab.morphology.assign_from_dict(&tokens[i], props) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) cached.is_lex = False @@ -279,3 +268,23 @@ cdef class Tokenizer: hashed = hash_string(chunk) self._specials.set(hashed, cached) self._cache.set(hashed, cached) + + +#if lemma is not None: +# tokens[i].lemma = self.vocab.strings[lemma] +#else: +# tokens[i].lemma = 0 +#if 'pos' in props: +# inflection = self.vocab.morphology.get(props['pos']) +# inflection.assign(&tokens[i]) +# # These are defaults, which can be over-ridden by the +# # token-specific props. +# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']] +# #tokens[i].pos = pos +# ## These are defaults, which can be over-ridden by the +# ## token-specific props. +# #set_morph_from_dict(&tokens[i].morph, morph_features) +# #if tokens[i].lemma == 0: +# # tokens[i].lemma = tokens[i].lex.orth +##set_morph_from_dict(&tokens[i].morph, props) +