WIP on supporting morphology features

2018-09-24 23:57:41 +02:00 · 2018-09-24 23:57:41 +02:00 · 6ae645c4ef
parent ac5742223a
commit 6ae645c4ef
2 changed files with 366 additions and 117 deletions
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -1,48 +1,30 @@
 from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMapArray
+from preshed.maps cimport PreshMap
 from libc.stdint cimport uint64_t
+from murmurhash cimport mrmr

 from .structs cimport TokenC
 from .strings cimport StringStore
-from .typedefs cimport attr_t, flags_t
+from .typedefs cimport hash_t, attr_t, flags_t
 from .parts_of_speech cimport univ_pos_t

 from . cimport symbols

-
-cdef struct RichTagC:
-    uint64_t morph
-    int id
-    univ_pos_t pos
-    attr_t name
-
-
-cdef struct MorphAnalysisC:
-    RichTagC tag
-    attr_t lemma
-
-
 cdef class Morphology:
    cdef readonly Pool mem
    cdef readonly StringStore strings
+    cdef PreshMap tags # Keyed by hash, value is pointer to tag
+ 
    cdef public object lemmatizer
    cdef readonly object tag_map
-    cdef public object n_tags
-    cdef public object reverse_index
-    cdef public object tag_names
-    cdef public object exc

-    cdef RichTagC* rich_tags
-    cdef PreshMapArray _cache
+    cdef hash_t insert(self, RichTagC tag) except 0
    
    cdef int assign_untagged(self, TokenC* token) except -1
-
    cdef int assign_tag(self, TokenC* token, tag) except -1
-
    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
-
-    cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
-
+    cdef update_token_morph(self, TokenC* token, features)
+    cdef set_token_morph(self, TokenC* token, pos, features)

 cdef enum univ_morph_t:
    NIL = 0
@ -298,4 +280,47 @@ cdef enum univ_morph_t:
    VerbType_mod # U
    VerbType_light # U

+# One morphological analysis: a coarse part-of-speech plus one slot per
+# feature category. Every slot is a univ_morph_t; set_feature() in
+# morphology.pyx stores either the feature's own enum value or NIL in the
+# matching slot. hash_tag() in morphology.pyx hashes the raw bytes of this
+# struct, so adding, removing or reordering fields changes the keys.
+cdef struct RichTagC:
+    univ_pos_t pos
    
+    univ_morph_t abbr
+    univ_morph_t adp_type
+    univ_morph_t adv_type
+    univ_morph_t animacy
+    univ_morph_t aspect
+    univ_morph_t case
+    univ_morph_t conj_type
+    univ_morph_t connegative
+    univ_morph_t definite
+    univ_morph_t degree
+    univ_morph_t derivation
+    univ_morph_t echo
+    univ_morph_t foreign
+    univ_morph_t gender
+    univ_morph_t hyph
+    univ_morph_t inf_form
+    univ_morph_t mood
+    univ_morph_t negative
+    univ_morph_t number
+    univ_morph_t name_type
+    univ_morph_t num_form
+    univ_morph_t num_type
+    univ_morph_t num_value
+    univ_morph_t part_form
+    univ_morph_t part_type
+    univ_morph_t person
+    univ_morph_t polite
+    univ_morph_t polarity
+    univ_morph_t poss
+    univ_morph_t prefix
+    univ_morph_t prep_case
+    univ_morph_t pron_type
+    univ_morph_t punct_side
+    univ_morph_t punct_type
+    univ_morph_t reflex
+    univ_morph_t style
+    univ_morph_t style_variant
+    univ_morph_t tense
+    univ_morph_t verb_form
+    univ_morph_t voice
+    univ_morph_t verb_type
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -3,6 +3,7 @@
 from __future__ import unicode_literals

 from libc.string cimport memset
+import ujson as json

 from .attrs cimport POS, IS_SPACE
 from .attrs import LEMMA, intify_attrs
@ -12,6 +13,7 @@ from .lexeme cimport Lexeme
 from .errors import Errors


+
 def _normalize_props(props):
    """Transform deprecated string keys to correct names."""
    out = {}
@ -32,9 +34,17 @@ def _normalize_props(props):


 cdef class Morphology:
+    '''Store the possible morphological analyses for a language, and index them
+    by hash.
+    
+    To save space on each token, tokens only know the hash of their morphological
+    analysis, so queries of morphological attributes are delegated
+    to this class.
+    '''
    def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
        self.mem = Pool()
        self.strings = string_store
+        self.tags = PreshMap()
        # Add special space symbol. We prefix with underscore, to make sure it
        # always sorts to the end.
        space_attrs = tag_map.get('SP', {POS: SPACE})
@ -47,21 +57,9 @@ cdef class Morphology:
        self.lemmatizer = lemmatizer
        self.n_tags = len(tag_map)
        self.reverse_index = {}
-
-        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
-            self.strings.add(tag_str)
            self.tag_map[tag_str] = dict(attrs)
-            attrs = _normalize_props(attrs)
-            attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
-            self.rich_tags[i].id = i
-            self.rich_tags[i].name = self.strings.add(tag_str)
-            self.rich_tags[i].morph = 0
-            self.rich_tags[i].pos = attrs[POS]
-            self.reverse_index[self.rich_tags[i].name] = i
-        # Add a 'null' tag, which we can reference when assign morphology to
-        # untagged tokens.
-        self.rich_tags[self.n_tags].id = self.n_tags
+            self.reverse_index[i] = self.strings.add(tag_str)

        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
@ -69,9 +67,35 @@ cdef class Morphology:
            for (tag_str, orth_str), attrs in exc.items():
                self.add_special_case(tag_str, orth_str, attrs)
    
-    def __reduce__(self):
-        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
-                             self.exc), None, None)
+    def add(self, features):
+        """Insert a morphological analysis in the morphology table, if not
+        already present.
+
+        features: Iterable of features; each item is converted to an integer
+            ID via `intify_features` before the tag is built.
+        RETURNS: The hash key under which the analysis is stored.
+        """
+        cdef RichTagC tag
+        cdef hash_t key
+        features = intify_features(self.strings, features)
+        # create_rich_tag() is declared as (pos_, features). The previous call
+        # passed only `features`, which does not match that signature. No
+        # part-of-speech is available here, so pass None explicitly.
+        # NOTE(review): confirm whether add() should accept a POS argument.
+        tag = create_rich_tag(None, features)
+        key = self.insert(tag)
+        return key
+
+    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
+        """Look up the lemma for a word form.
+
+        univ_pos: Coarse part-of-speech, forwarded to the lemmatizer.
+        orth: String-store ID of the word form.
+        morphology: Passed through to the lemmatizer unchanged.
+        RETURNS: `orth` itself when it is not in the string store; otherwise
+            the string-store ID of the lower-cased form (no lemmatizer set) or
+            of the first string the lemmatizer returns.
+        """
+        cdef unicode py_string
+        cdef list lemma_strings
+        cdef unicode lemma_string
+        # Unknown IDs cannot be resolved to text, so hand them back as-is.
+        if orth not in self.strings:
+            return orth
+        py_string = self.strings[orth]
+        # Without a lemmatizer, fall back to simple lower-casing.
+        if self.lemmatizer is None:
+            return self.strings.add(py_string.lower())
+        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
+        lemma_string = lemma_strings[0]
+        return self.strings.add(lemma_string)
+ 
+    cdef hash_t insert(self, RichTagC tag) except 0:
+        """Store `tag` in the table under its hash, unless that hash is
+        already present, and return the hash."""
+        cdef hash_t key = hash_tag(tag)
+        cdef RichTagC* stored
+        # Already known: nothing to allocate.
+        if self.tags.get(key) != NULL:
+            return key
+        # Copy the struct into pool-allocated memory so the table can keep a
+        # pointer to it.
+        stored = <RichTagC*>self.mem.alloc(1, sizeof(RichTagC))
+        stored[0] = tag
+        self.tags.set(key, <void*>stored)
+        return key
    
    cdef int assign_untagged(self, TokenC* token) except -1:
        """Set morphological attributes on a token without a POS tag. Uses
@ -101,84 +125,284 @@ cdef class Morphology:
        # figure out why the statistical model fails. Related to Issue #220
        if Lexeme.c_check_flag(token.lex, IS_SPACE):
            tag_id = self.reverse_index[self.strings.add('_SP')]
-        rich_tag = self.rich_tags[tag_id]
-        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
-        if analysis is NULL:
-            analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            tag_str = self.strings[self.rich_tags[tag_id].name]
-            analysis.tag = rich_tag
-            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth,
-                                            self.tag_map.get(tag_str, {}))
-            self._cache.set(tag_id, token.lex.orth, analysis)
-        token.lemma = analysis.lemma
-        token.pos = analysis.tag.pos
-        token.tag = analysis.tag.name
-        token.morph = analysis.tag.morph
+        lemma = <attr_t>self._cache.get(tag_id, token.lex.orth)
+        if lemma == 0:
+            tag_str = self.tag_names[tag_id]
+            features = dict(self.tag_map.get(tag_str, {}))
+            pos = self.strings.as_int(features.pop('POS'))
+            lemma = self.lemmatize(pos, token.lex.orth, features)
+            self._cache.set(tag_id, token.lex.orth, lemma)
+        token.lemma = lemma
+        token.pos = pos
+        token.tag = self.strings[tag_str]
+        token.morph = self.add(attrs)

-    cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1:
-        cdef flags_t one = 1
-        if value:
-            flags[0] |= one << flag_id
-        else:
-            flags[0] &= ~(one << flag_id)
+    cdef update_morph(self, hash_t morph, features):
+        """Update a morphological analysis with new feature values.
+
+        morph: Hash key of an analysis already stored in the table.
+        features: Mapping of feature -> value; a true value sets the feature,
+            a false value clears it.
+        RETURNS: Hash key of the updated analysis (inserted if new).
+        RAISES: KeyError if `morph` is not in the table.
+        """
+        cdef RichTagC* tag_ptr = <RichTagC*>self.tags.get(morph)
+        cdef RichTagC tag
+        cdef univ_morph_t feature
+        cdef int value
+        # Guard the lookup: dereferencing a NULL result would crash.
+        if tag_ptr == NULL:
+            raise KeyError(morph)
+        # Work on a copy so the stored analysis is left untouched.
+        tag = tag_ptr[0]
+        for feature_, value in features.items():
+            feature = self.strings.as_int(feature_)
+            # Honour the mapped value instead of always passing 1.
+            set_feature(&tag, feature, value)
+        # The method is called insert(); insert_tag() does not exist.
+        morph = self.insert(tag)
+        return morph

-    def add_special_case(self, unicode tag_str, unicode orth_str, attrs,
-                         force=False):
-        """Add a special-case rule to the morphological analyser. Tokens whose
-        tag and orth match the rule will receive the specified properties.
+    def to_bytes(self):
+        """Serialize every stored analysis to a JSON string.
+
+        RETURNS: The result of `json.dumps` over a list holding one
+            `tag_to_json` entry per analysis in the table.
+        """
+        json_tags = []
+        for key in self.tags:
+            tag_ptr = <RichTagC*>self.tags.get(key)
+            if tag_ptr != NULL:
+                json_tags.append(tag_to_json(tag_ptr[0]))
+        # Previously `raise json.dumps(...)`: raising a string is a TypeError,
+        # so the serialized data could never reach the caller.
+        return json.dumps(json_tags)

-        tag (unicode): The part-of-speech tag to key the exception.
-        orth (unicode): The word-form to key the exception.
-        """
-        # TODO: Currently we've assumed that we know the number of tags --
-        # RichTagC is an array, and _cache is a PreshMapArray
-        # This is really bad: it makes the morphology typed to the tagger
-        # classes, which is all wrong.
-        self.exc[(tag_str, orth_str)] = dict(attrs)
-        tag = self.strings.add(tag_str)
-        if tag not in self.reverse_index:
-            return
-        tag_id = self.reverse_index[tag]
-        orth = self.strings[orth_str]
-        cdef RichTagC rich_tag = self.rich_tags[tag_id]
-        attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
-        cached = <MorphAnalysisC*>self._cache.get(tag_id, orth)
-        if cached is NULL:
-            cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-        elif force:
-            memset(cached, 0, sizeof(cached[0]))
-        else:
-            raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str))
+    def from_bytes(self, byte_string):
+        """Not implemented yet: always raises NotImplementedError."""
+        raise NotImplementedError

-        cached.tag = rich_tag
-        # TODO: Refactor this to take arbitrary attributes.
-        for name_id, value_id in attrs.items():
-            if name_id == LEMMA:
-                cached.lemma = value_id
-            else:
-                self.assign_feature(&cached.tag.morph, name_id, value_id)
-        if cached.lemma == 0:
-            cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs)
-        self._cache.set(tag_id, orth, <void*>cached)
+    def to_disk(self, path):
+        """Not implemented yet: always raises NotImplementedError."""
+        raise NotImplementedError
+
+    def from_disk(self, path):
+        """Not implemented yet: always raises NotImplementedError."""
+        raise NotImplementedError
+
+
+cpdef univ_pos_t get_int_tag(pos_):
+    # Placeholder: `pos_` is ignored and the result is always 0 cast to
+    # univ_pos_t.
+    return <univ_pos_t>0
+
+cpdef intify_features(StringStore strings, features):
+    """Return the set of values produced by `strings.as_int` for each item
+    of `features`."""
+    feature_ids = set()
+    for name in features:
+        feature_ids.add(strings.as_int(name))
+    return feature_ids
+
+cdef hash_t hash_tag(RichTagC tag) nogil:
+    # Hash the struct's raw bytes (sizeof(tag) of them) with mrmr.hash64,
+    # passing 0 as the third argument.
+    # NOTE(review): because the bytes are hashed as-is, any uninitialised
+    # field or padding would change the key -- confirm callers zero the
+    # struct before filling it.
+    return mrmr.hash64(&tag, sizeof(tag), 0)
+
+cdef RichTagC create_rich_tag(pos_, features):
+    """Build a RichTagC from a part-of-speech and an iterable of feature IDs.
+
+    pos_: Converted to the struct's `pos` field with `get_int_tag`.
+    features: Iterable of univ_morph_t values; each is switched on with
+        `set_feature`.
+    RETURNS: The populated struct, with all unset feature fields equal to 0.
+    """
+    cdef RichTagC tag
+    cdef univ_morph_t feature
+    # A stack-allocated struct starts with indeterminate contents. Zero it so
+    # that unset fields read as 0 (NIL) and identical analyses produce
+    # identical bytes when hashed.
+    memset(&tag, 0, sizeof(tag))
+    tag.pos = get_int_tag(pos_)
+    for feature in features:
+        set_feature(&tag, feature, 1)
+    return tag
+
+cdef tag_to_json(RichTagC tag):
+    # Placeholder: `tag` is ignored and an empty dict is returned.
+    return {}
+
+cdef RichTagC tag_from_json(json_tag):
+    """Placeholder deserializer: `json_tag` is not read yet.
+
+    RETURNS: A RichTagC with every field set to 0.
+    """
+    cdef RichTagC tag
+    # Zero the struct so callers never receive indeterminate field values.
+    memset(&tag, 0, sizeof(tag))
+    return tag
+ 
+cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) except -1 nogil:
+    """Set or clear one morphological feature on `tag`.
+
+    tag: Struct to modify in place.
+    feature: The feature to set; the matching is_*_feature predicate decides
+        which field of the struct it belongs to.
+    value: Non-zero stores `feature` in that field, zero stores NIL.
+    RETURNS: 0 on success.
+    RAISES: ValueError if no predicate claims `feature`. The `except -1`
+        clause is what lets this exception reach the caller.
+    """
+    cdef univ_morph_t value_
+    if value:
+        value_ = feature
+    else:
+        value_ = NIL
+    # NIL names no field, so there is nothing to write. (Previously this was
+    # a bare `pass` followed by a fresh `if`, so NIL fell through to the
+    # "Unknown feature" error.)
+    if feature == NIL:
+        return 0
+    if is_abbr_feature(feature):
+        tag.abbr = value_
+    elif is_adp_type_feature(feature):
+        tag.adp_type = value_
+    elif is_adv_type_feature(feature):
+        tag.adv_type = value_
+    elif is_animacy_feature(feature):
+        tag.animacy = value_
+    elif is_aspect_feature(feature):
+        tag.aspect = value_
+    elif is_case_feature(feature):
+        tag.case = value_
+    elif is_conj_type_feature(feature):
+        tag.conj_type = value_
+    elif is_connegative_feature(feature):
+        tag.connegative = value_
+    elif is_definite_feature(feature):
+        tag.definite = value_
+    elif is_degree_feature(feature):
+        tag.degree = value_
+    elif is_derivation_feature(feature):
+        tag.derivation = value_
+    elif is_echo_feature(feature):
+        tag.echo = value_
+    elif is_foreign_feature(feature):
+        tag.foreign = value_
+    elif is_gender_feature(feature):
+        tag.gender = value_
+    elif is_hyph_feature(feature):
+        tag.hyph = value_
+    elif is_inf_form_feature(feature):
+        tag.inf_form = value_
+    elif is_mood_feature(feature):
+        tag.mood = value_
+    elif is_negative_feature(feature):
+        tag.negative = value_
+    elif is_number_feature(feature):
+        tag.number = value_
+    elif is_name_type_feature(feature):
+        tag.name_type = value_
+    elif is_num_form_feature(feature):
+        tag.num_form = value_
+    # num_type has a struct field and a predicate but had no branch here.
+    elif is_num_type_feature(feature):
+        tag.num_type = value_
+    elif is_num_value_feature(feature):
+        tag.num_value = value_
+    elif is_part_form_feature(feature):
+        tag.part_form = value_
+    elif is_part_type_feature(feature):
+        tag.part_type = value_
+    elif is_person_feature(feature):
+        tag.person = value_
+    elif is_polite_feature(feature):
+        tag.polite = value_
+    elif is_polarity_feature(feature):
+        tag.polarity = value_
+    elif is_poss_feature(feature):
+        tag.poss = value_
+    elif is_prefix_feature(feature):
+        tag.prefix = value_
+    elif is_prep_case_feature(feature):
+        tag.prep_case = value_
+    elif is_pron_type_feature(feature):
+        tag.pron_type = value_
+    # punct_side previously wrote to tag.punct_type, and punct_type itself
+    # had no branch at all.
+    elif is_punct_side_feature(feature):
+        tag.punct_side = value_
+    elif is_punct_type_feature(feature):
+        tag.punct_type = value_
+    elif is_reflex_feature(feature):
+        tag.reflex = value_
+    elif is_style_feature(feature):
+        tag.style = value_
+    elif is_style_variant_feature(feature):
+        tag.style_variant = value_
+    elif is_tense_feature(feature):
+        tag.tense = value_
+    elif is_verb_form_feature(feature):
+        tag.verb_form = value_
+    elif is_voice_feature(feature):
+        tag.voice = value_
+    elif is_verb_type_feature(feature):
+        tag.verb_type = value_
+    else:
+        with gil:
+            raise ValueError("Unknown feature: %d" % feature)
+    return 0
+
+# Category predicates, one per RichTagC feature field. Each takes a
+# univ_morph_t and is meant to report whether that value belongs to the named
+# category. All of them are placeholders for now: every one returns 0
+# unconditionally.
+cdef int is_abbr_feature(univ_morph_t abbr) nogil:
+    return 0
+
+cdef int is_adp_type_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_adv_type_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_animacy_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_aspect_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_case_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_conj_type_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_connegative_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_definite_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_degree_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_derivation_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_echo_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_foreign_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_gender_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_hyph_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_inf_form_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_mood_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_negative_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_number_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_name_type_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_num_form_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_num_type_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_num_value_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_part_form_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_part_type_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_person_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_polite_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_polarity_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_poss_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_prefix_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_prep_case_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_pron_type_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_punct_side_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_punct_type_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_reflex_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_style_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_style_variant_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_tense_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_verb_form_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_voice_feature(univ_morph_t feature) nogil:
+    return 0
+
+cdef int is_verb_type_feature(univ_morph_t feature) nogil:
+    return 0

-    def load_morph_exceptions(self, dict exc):
-        # Map (form, pos) to (lemma, rich tag)
-        for tag_str, entries in exc.items():
-            for form_str, attrs in entries.items():
-                self.add_special_case(tag_str, form_str, attrs)

-    def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology):
-        if orth not in self.strings:
-            return orth
-        cdef unicode py_string = self.strings[orth]
-        if self.lemmatizer is None:
-            return self.strings.add(py_string.lower())
-        cdef list lemma_strings
-        cdef unicode lemma_string
-        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
-        lemma_string = lemma_strings[0]
-        lemma = self.strings.add(lemma_string)
-        return lemma


 IDS = {