* Hack out morphology stuff from the tokenizer while morphology is being reimplemented.

Matthew Honnibal 2015-08-26 19:20:11 +02:00
parent b4faf551f5
commit 119c0f8c3f
2 changed files with 28 additions and 19 deletions

View File

@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
-from .structs cimport LexemeC, TokenC, Morphology
+from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, _Cached

View File

@@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
-from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
 cimport cython
@@ -29,7 +28,7 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, self.vocab.pos_tags)
+        self._load_special_tokenization(rules)

     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
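
After this change the constructor hands `_load_special_tokenization` only the rules dict; the tag map is no longer consulted. The rules map a surface chunk to a list of per-token property dicts, of which only the 'F' (form) and optional 'L' (lemma) keys are still used. A minimal sketch of that shape, with a hypothetical contraction entry (illustrative, not taken from the shipped rule files):

    special_cases = {
        "don't": [
            {'F': "do"},                # form only: the lemma falls back to the form
            {'F': "n't", 'L': "not"},   # form plus an explicit lemma override
        ],
    }
    tokenizer._load_special_tokenization(special_cases)
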
@@ -242,7 +241,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    def _load_special_tokenization(self, object rules, object tag_map):
+    def _load_special_tokenization(self, special_cases):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
@@ -253,25 +252,15 @@ cdef class Tokenizer:
         cdef dict props
         cdef LexemeC** lexemes
         cdef hash_t hashed
-        for chunk, substrings in sorted(rules.items()):
+        for chunk, substrings in sorted(special_cases.items()):
             tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
             for i, props in enumerate(substrings):
                 form = props['F']
-                lemma = props.get("L", None)
                 tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                if lemma is not None:
-                    tokens[i].lemma = self.vocab.strings[lemma]
-                else:
-                    tokens[i].lemma = 0
-                if 'pos' in props:
-                    tokens[i].tag = self.vocab.strings[props['pos']]
-                    tokens[i].pos = tag_map[props['pos']][0]
-                    # These are defaults, which can be over-ridden by the
-                    # token-specific props.
-                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
-                    if tokens[i].lemma == 0:
-                        tokens[i].lemma = tokens[i].lex.orth
-                set_morph_from_dict(&tokens[i].morph, props)
+                lemma = props.get('L', form)
+                tokens[i].lemma = self.vocab.strings[lemma]
+                #TODO
+                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
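
Read without the Cython plumbing, the simplified loop now only resolves a lexeme and interns a lemma for each substring, with morphology assignment stubbed out until the morphology component is reimplemented. A pure-Python approximation, assuming a dict-like vocab and string store (a sketch of the control flow, not the actual struct-allocating code):

    def load_special_cases(vocab, special_cases):
        # Illustrative stand-in for Tokenizer._load_special_tokenization.
        cache = {}
        for chunk, substrings in sorted(special_cases.items()):
            tokens = []
            for props in substrings:
                form = props['F']
                lemma = props.get('L', form)   # lemma defaults to the surface form
                tokens.append({
                    'lex': vocab[form],              # stands in for vocab.get()
                    'lemma': vocab.strings[lemma],   # intern the lemma string
                    # TODO: morphology features are deferred until the
                    # morphology component returns.
                })
            cache[chunk] = tokens   # stands in for the hashed _specials/_cache entries
        return cache
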
@@ -279,3 +268,23 @@ cdef class Tokenizer:
             hashed = hash_string(chunk)
             self._specials.set(hashed, cached)
             self._cache.set(hashed, cached)
+                #if lemma is not None:
+                #    tokens[i].lemma = self.vocab.strings[lemma]
+                #else:
+                #    tokens[i].lemma = 0
+                #if 'pos' in props:
+                #    inflection = self.vocab.morphology.get(props['pos'])
+                #    inflection.assign(&tokens[i])
+                #    # These are defaults, which can be over-ridden by the
+                #    # token-specific props.
+                #    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
+                #    #tokens[i].pos = pos
+                #    ## These are defaults, which can be over-ridden by the
+                #    ## token-specific props.
+                #    #set_morph_from_dict(&tokens[i].morph, morph_features)
+                #    #if tokens[i].lemma == 0:
+                #    #    tokens[i].lemma = tokens[i].lex.orth
+                ##set_morph_from_dict(&tokens[i].morph, props)
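
The commented-out block appended above parks the old behaviour until the new morphology component exists: POS-keyed defaults would be looked up and assigned to the token, with token-specific props allowed to override them. Expressed as plain Python against the hypothetical API named in those comments (vocab.morphology.get()/.assign() do not exist at this commit):

    if 'pos' in props:
        inflection = vocab.morphology.get(props['pos'])   # default tag/morph features for the POS
        inflection.assign(token)                          # apply defaults; token-specific props
                                                          # may still override them
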