* Hack Morphology class towards usability

2015-08-26 19:17:21 +02:00 · 2015-08-26 19:17:21 +02:00 · 378729f81a
parent 430affc347
commit 378729f81a
1 changed files with 127 additions and 9 deletions
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,11 +1,129 @@
-# cython: embedsignature=True
+from os import path
+
+try:
+    import ujson as json
+except ImportError:
+    import json
+
+from spacy.parts_of_speech import UNIV_POS_NAMES
    

-cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
-    morph.number = props.get('number', 0)
-    morph.tenspect = props.get('tenspect', 0)
-    morph.mood = props.get('mood', 0)
-    morph.gender = props.get('gender', 0)
-    morph.person = props.get('person', 0)
-    morph.case = props.get('case', 0)
-    morph.misc = props.get('misc', 0)
+cdef class Morphology:  # maps tag names to integer ids and copies tag-map properties onto tokens
+    def __init__(self, tag_map, fused_tokens, lemmatizer):  # NOTE: fused_tokens and lemmatizer are accepted but not used in this body
+        self.tag_map = tag_map
+        self.n_tags = len(tag_map)
+        self.tag_names = tuple(sorted(tag_map.keys()))  # sorted order; a tag's id is its index in this tuple
+        self.tag_ids = {}  # reverse lookup: tag name -> id
+        for i, tag_str in enumerate(self.tag_names):
+            self.tag_ids[tag_str] = i
+
+    @classmethod
+    def from_dir(cls, data_dir):  # alternate constructor: reads tag_map.json from data_dir
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))  # NOTE(review): the opened file is never explicitly closed
+        return cls(tag_map, {}, None)  # empty fused_tokens, no lemmatizer
+
+    cdef int assign_tag(self, TokenC* token, int tag) except -1:  # `tag` is used as an index into self.tag_names
+        props = self.tag_map[self.tag_names[tag]]
+        token.pos = UNIV_POS_NAMES[props['pos'].upper()]  # the entry's 'pos' value, upper-cased, is the key into UNIV_POS_NAMES
+        token.tag = tag
+        #token.inflection = # TODO
+
+    cdef int assign_from_dict(self, TokenC* token, props) except -1:
+        pass  # placeholder: no behaviour yet
+
+    def load_morph_exceptions(self, dict exc):
+        pass  # placeholder: presumably the commented-out code below is the draft implementation -- TODO confirm
+        # Map (form, pos) to (lemma, inflection)
+        #cdef unicode pos_str
+        #cdef unicode form_str
+        #cdef unicode lemma_str
+        #cdef dict entries
+        #cdef dict props
+        #cdef int lemma
+        #cdef attr_t orth
+        #cdef int pos
+        #for pos_str, entries in exc.items():
+        #    pos = self.tag_names.index(pos_str)
+        #    for form_str, props in entries.items():
+        #        lemma_str = props.get('L', form_str)
+        #        orth = self.strings[form_str]
+        #        cached = <InflectedLemma*>self.mem.alloc(1, sizeof(InflectedLemma))
+        #        cached.lemma = self.strings[lemma_str]
+        #        set_morph_from_dict(&cached.morph, props)
+        #        self._morph_cache.set(pos, orth, <void*>cached)
+
+
+#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+#    morph.number = props.get('number', 0)
+#    morph.tenspect = props.get('tenspect', 0)
+#    morph.mood = props.get('mood', 0)
+#    morph.gender = props.get('gender', 0)
+#    morph.person = props.get('person', 0)
+#    morph.case = props.get('case', 0)
+#    morph.misc = props.get('misc', 0)
+#
+#
+#cdef class Morphology:
+#    cdef Pool mem
+#    cdef PreshMap table
+#
+#    def __init__(self, tags, exceptions):
+#        pass
+#
+#    def __getitem__(self, hash_t id_):
+#        pass
+#
+#    cdef const InflectionC* get(self, hash_t key) except NULL:
+#        pass
+#
+#    cdef MorphAnalysis analyse(const TokenC* token) except -1:
+#        cdef struct MorphAnalysis morphology
+#        tokens[i].pos = tag.pos
+#        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
+#        if cached is NULL:
+#            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
+#            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
+#            cached.morph = tag.morph
+#            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
+#        tokens[i].lemma = cached.lemma
+#        tokens[i].morph = cached.morph
+#        
+#    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
+#        if self.lemmatizer is None:
+#            return lex.orth
+#        cdef unicode py_string = self.strings[lex.orth]
+#        if pos != NOUN and pos != VERB and pos != ADJ:
+#            return lex.orth
+#        cdef set lemma_strings
+#        cdef unicode lemma_string
+#        lemma_strings = self.lemmatizer(py_string, pos)
+#        lemma_string = sorted(lemma_strings)[0]
+#        lemma = self.strings[lemma_string]
+#        return lemma
+#        
+#
+#cdef class Inflection:
+#    cdef InflectionC* c
+#
+#    def __init__(self, container, id_):
+#        self.c = container[id_]
+#        self.container = container
+#        
+#        for i, feat_id in enumerate(feat_ids):
+#            feature, value = parse_id(feat_id)
+#            self.add_value(feature, value, True)
+#
+#    def has(self, Value_t feat_value_id):
+#        part = feat_value_id % 64
+#        bit = feat_value_id / 64
+#        if self.value_set[part] & bit:
+#            return True
+#        else:
+#            return False
+#
+#    property pos: def __get__(self): return self.c.pos
+#
+#    property id: def __get__(self): return self.c.id
+#
+#    property features:
+#        pass