From 6ae645c4ef67449992463979c5539118a3699a5e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Sep 2018 23:57:41 +0200 Subject: [PATCH] WIP on supporting morphology features --- spacy/morphology.pxd | 81 ++++++--- spacy/morphology.pyx | 402 +++++++++++++++++++++++++++++++++---------- 2 files changed, 366 insertions(+), 117 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index d0110b300..2220cfcfc 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,48 +1,30 @@ from cymem.cymem cimport Pool -from preshed.maps cimport PreshMapArray +from preshed.maps cimport PreshMap from libc.stdint cimport uint64_t +from murmurhash cimport mrmr from .structs cimport TokenC from .strings cimport StringStore -from .typedefs cimport attr_t, flags_t +from .typedefs cimport hash_t, attr_t, flags_t from .parts_of_speech cimport univ_pos_t from . cimport symbols - -cdef struct RichTagC: - uint64_t morph - int id - univ_pos_t pos - attr_t name - - -cdef struct MorphAnalysisC: - RichTagC tag - attr_t lemma - - cdef class Morphology: cdef readonly Pool mem cdef readonly StringStore strings + cdef PreshMap tags # Keyed by hash, value is pointer to tag + cdef public object lemmatizer cdef readonly object tag_map - cdef public object n_tags - cdef public object reverse_index - cdef public object tag_names - cdef public object exc - - cdef RichTagC* rich_tags - cdef PreshMapArray _cache + cdef hash_t insert(self, RichTagC tag) except 0 + cdef int assign_untagged(self, TokenC* token) except -1 - cdef int assign_tag(self, TokenC* token, tag) except -1 - cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 - - cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 - + cdef update_token_morph(self, TokenC* token, features) + cdef set_token_morph(self, TokenC* token, pos, features) cdef enum univ_morph_t: NIL = 0 @@ -298,4 +280,47 @@ cdef enum univ_morph_t: VerbType_mod # U VerbType_light # U - +cdef struct RichTagC: + univ_pos_t pos + + univ_morph_t abbr + univ_morph_t adp_type + univ_morph_t adv_type + univ_morph_t animacy + univ_morph_t aspect + univ_morph_t case + univ_morph_t conj_type + univ_morph_t connegative + univ_morph_t definite + univ_morph_t degree + univ_morph_t derivation + univ_morph_t echo + univ_morph_t foreign + univ_morph_t gender + univ_morph_t hyph + univ_morph_t inf_form + univ_morph_t mood + univ_morph_t negative + univ_morph_t number + univ_morph_t name_type + univ_morph_t num_form + univ_morph_t num_type + univ_morph_t num_value + univ_morph_t part_form + univ_morph_t part_type + univ_morph_t person + univ_morph_t polite + univ_morph_t polarity + univ_morph_t poss + univ_morph_t prefix + univ_morph_t prep_case + univ_morph_t pron_type + univ_morph_t punct_side + univ_morph_t punct_type + univ_morph_t reflex + univ_morph_t style + univ_morph_t style_variant + univ_morph_t tense + univ_morph_t verb_form + univ_morph_t voice + univ_morph_t verb_type diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index bd821d76f..3b74ecaae 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -3,6 +3,7 @@ from __future__ import unicode_literals from libc.string cimport memset +import ujson as json from .attrs cimport POS, IS_SPACE from .attrs import LEMMA, intify_attrs @@ -12,6 +13,7 @@ from .lexeme cimport Lexeme from .errors import Errors + def _normalize_props(props): """Transform deprecated string keys to correct names.""" out = {} @@ -32,9 +34,17 @@ def _normalize_props(props): cdef class Morphology: + '''Store the possible morphological analyses for a language, and index them + by hash. + + To save space on each token, tokens only know the hash of their morphological + analysis, so queries of morphological attributes are delegated + to this class. + ''' def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): self.mem = Pool() self.strings = string_store + self.tags = PreshMap() # Add special space symbol. We prefix with underscore, to make sure it # always sorts to the end. space_attrs = tag_map.get('SP', {POS: SPACE}) @@ -47,32 +57,46 @@ cdef class Morphology: self.lemmatizer = lemmatizer self.n_tags = len(tag_map) self.reverse_index = {} - - self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): - self.strings.add(tag_str) self.tag_map[tag_str] = dict(attrs) - attrs = _normalize_props(attrs) - attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - self.rich_tags[i].id = i - self.rich_tags[i].name = self.strings.add(tag_str) - self.rich_tags[i].morph = 0 - self.rich_tags[i].pos = attrs[POS] - self.reverse_index[self.rich_tags[i].name] = i - # Add a 'null' tag, which we can reference when assign morphology to - # untagged tokens. - self.rich_tags[self.n_tags].id = self.n_tags + self.reverse_index[i] = self.strings.add(tag_str) self._cache = PreshMapArray(self.n_tags) self.exc = {} if exc is not None: for (tag_str, orth_str), attrs in exc.items(): self.add_special_case(tag_str, orth_str, attrs) + + def add(self, features): + """Insert a morphological analysis in the morphology table, if not already + present. Returns the hash of the new analysis. + """ + features = intify_features(self.strings, features) + cdef RichTagC tag = create_rich_tag(features) + cdef hash_t key = self.insert(tag) + return key - def __reduce__(self): - return (Morphology, (self.strings, self.tag_map, self.lemmatizer, - self.exc), None, None) - + def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): + if orth not in self.strings: + return orth + cdef unicode py_string = self.strings[orth] + if self.lemmatizer is None: + return self.strings.add(py_string.lower()) + cdef list lemma_strings + cdef unicode lemma_string + lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) + lemma_string = lemma_strings[0] + lemma = self.strings.add(lemma_string) + return lemma + + cdef hash_t insert(self, RichTagC tag) except 0: + cdef hash_t key = hash_tag(tag) + if self.tags.get(key) == NULL: + tag_ptr = self.mem.alloc(1, sizeof(RichTagC)) + tag_ptr[0] = tag + self.tags.set(key, tag_ptr) + return key + cdef int assign_untagged(self, TokenC* token) except -1: """Set morphological attributes on a token without a POS tag. Uses the lemmatizer's lookup() method, which looks up the string in the @@ -101,84 +125,284 @@ cdef class Morphology: # figure out why the statistical model fails. Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): tag_id = self.reverse_index[self.strings.add('_SP')] - rich_tag = self.rich_tags[tag_id] - analysis = self._cache.get(tag_id, token.lex.orth) - if analysis is NULL: - analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) - tag_str = self.strings[self.rich_tags[tag_id].name] - analysis.tag = rich_tag - analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, - self.tag_map.get(tag_str, {})) - self._cache.set(tag_id, token.lex.orth, analysis) - token.lemma = analysis.lemma - token.pos = analysis.tag.pos - token.tag = analysis.tag.name - token.morph = analysis.tag.morph + lemma = self._cache.get(tag_id, token.lex.orth) + if lemma == 0: + tag_str = self.tag_names[tag_id] + features = dict(self.tag_map.get(tag_str, {})) + pos = self.strings.as_int(features.pop('POS')) + lemma = self.lemmatize(pos, token.lex.orth, features) + self._cache.set(tag_id, token.lex.orth, lemma) + token.lemma = lemma + token.pos = pos + token.tag = self.strings[tag_str] + token.morph = self.add(attrs) - cdef int assign_feature(self, uint64_t* flags, univ_morph_t flag_id, bint value) except -1: - cdef flags_t one = 1 - if value: - flags[0] |= one << flag_id - else: - flags[0] &= ~(one << flag_id) + cdef update_morph(self, hash_t morph, features): + """Update a morphological analysis with new feature values.""" + tag = (self.tags.get(morph))[0] + cdef univ_morph_t feature + cdef int value + for feature_, value in features.items(): + feature = self.strings.as_int(feature_) + set_feature(&tag, feature, 1) + morph = self.insert_tag(tag) + return morph - def add_special_case(self, unicode tag_str, unicode orth_str, attrs, - force=False): - """Add a special-case rule to the morphological analyser. Tokens whose - tag and orth match the rule will receive the specified properties. + def to_bytes(self): + json_tags = [] + for key in self.tags: + tag_ptr = self.tags.get(key) + if tag_ptr != NULL: + json_tags.append(tag_to_json(tag_ptr[0])) + raise json.dumps(json_tags) - tag (unicode): The part-of-speech tag to key the exception. - orth (unicode): The word-form to key the exception. - """ - # TODO: Currently we've assumed that we know the number of tags -- - # RichTagC is an array, and _cache is a PreshMapArray - # This is really bad: it makes the morphology typed to the tagger - # classes, which is all wrong. - self.exc[(tag_str, orth_str)] = dict(attrs) - tag = self.strings.add(tag_str) - if tag not in self.reverse_index: - return - tag_id = self.reverse_index[tag] - orth = self.strings[orth_str] - cdef RichTagC rich_tag = self.rich_tags[tag_id] - attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) - cached = self._cache.get(tag_id, orth) - if cached is NULL: - cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) - elif force: - memset(cached, 0, sizeof(cached[0])) - else: - raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str)) + def from_bytes(self, byte_string): + raise NotImplementedError - cached.tag = rich_tag - # TODO: Refactor this to take arbitrary attributes. - for name_id, value_id in attrs.items(): - if name_id == LEMMA: - cached.lemma = value_id - else: - self.assign_feature(&cached.tag.morph, name_id, value_id) - if cached.lemma == 0: - cached.lemma = self.lemmatize(rich_tag.pos, orth, attrs) - self._cache.set(tag_id, orth, cached) + def to_disk(self, path): + raise NotImplementedError + + def from_disk(self, path): + raise NotImplementedError + + +cpdef univ_pos_t get_int_tag(pos_): + return 0 + +cpdef intify_features(StringStore strings, features): + return {strings.as_int(feature) for feature in features} + +cdef hash_t hash_tag(RichTagC tag) nogil: + return mrmr.hash64(&tag, sizeof(tag), 0) + +cdef RichTagC create_rich_tag(pos_, features): + cdef RichTagC tag + cdef univ_morph_t feature + tag.pos = get_int_tag(pos_) + for feature in features: + set_feature(&tag, feature, 1) + return tag + +cdef tag_to_json(RichTagC tag): + return {} + +cdef RichTagC tag_from_json(json_tag): + cdef RichTagC tag + return tag + +cdef int set_feature(RichTagC* tag, univ_morph_t feature, int value) nogil: + if value == True: + value_ = feature + else: + value_ = NIL + if feature == NIL: + pass + if is_abbr_feature(feature): + tag.abbr = value_ + elif is_adp_type_feature(feature): + tag.adp_type = value_ + elif is_adv_type_feature(feature): + tag.adv_type = value_ + elif is_animacy_feature(feature): + tag.animacy = value_ + elif is_aspect_feature(feature): + tag.aspect = value_ + elif is_case_feature(feature): + tag.case = value_ + elif is_conj_type_feature(feature): + tag.conj_type = value_ + elif is_connegative_feature(feature): + tag.connegative = value_ + elif is_definite_feature(feature): + tag.definite = value_ + elif is_degree_feature(feature): + tag.degree = value_ + elif is_derivation_feature(feature): + tag.derivation = value_ + elif is_echo_feature(feature): + tag.echo = value_ + elif is_foreign_feature(feature): + tag.foreign = value_ + elif is_gender_feature(feature): + tag.gender = value_ + elif is_hyph_feature(feature): + tag.hyph = value_ + elif is_inf_form_feature(feature): + tag.inf_form = value_ + elif is_mood_feature(feature): + tag.mood = value_ + elif is_negative_feature(feature): + tag.negative = value_ + elif is_number_feature(feature): + tag.number = value_ + elif is_name_type_feature(feature): + tag.name_type = value_ + elif is_num_form_feature(feature): + tag.num_form = value_ + elif is_num_value_feature(feature): + tag.num_value = value_ + elif is_part_form_feature(feature): + tag.part_form = value_ + elif is_part_type_feature(feature): + tag.part_type = value_ + elif is_person_feature(feature): + tag.person = value_ + elif is_polite_feature(feature): + tag.polite = value_ + elif is_polarity_feature(feature): + tag.polarity = value_ + elif is_poss_feature(feature): + tag.poss = value_ + elif is_prefix_feature(feature): + tag.prefix = value_ + elif is_prep_case_feature(feature): + tag.prep_case = value_ + elif is_pron_type_feature(feature): + tag.pron_type = value_ + elif is_punct_side_feature(feature): + tag.punct_type = value_ + elif is_reflex_feature(feature): + tag.reflex = value_ + elif is_style_feature(feature): + tag.style = value_ + elif is_style_variant_feature(feature): + tag.style_variant = value_ + elif is_tense_feature(feature): + tag.tense = value_ + elif is_verb_form_feature(feature): + tag.verb_form = value_ + elif is_voice_feature(feature): + tag.voice = value_ + elif is_verb_type_feature(feature): + tag.verb_type = value_ + else: + with gil: + raise ValueError("Unknown feature: %d" % feature) + +cdef int is_abbr_feature(univ_morph_t abbr) nogil: + return 0 + +cdef int is_adp_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_adv_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_animacy_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_aspect_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_case_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_conj_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_connegative_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_definite_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_degree_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_derivation_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_echo_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_foreign_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_gender_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_hyph_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_inf_form_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_mood_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_negative_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_number_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_name_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_num_form_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_num_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_num_value_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_part_form_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_part_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_person_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_polite_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_polarity_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_poss_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_prefix_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_prep_case_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_pron_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_punct_side_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_punct_type_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_reflex_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_style_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_style_variant_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_tense_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_verb_form_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_voice_feature(univ_morph_t feature) nogil: + return 0 + +cdef int is_verb_type_feature(univ_morph_t feature) nogil: + return 0 - def load_morph_exceptions(self, dict exc): - # Map (form, pos) to (lemma, rich tag) - for tag_str, entries in exc.items(): - for form_str, attrs in entries.items(): - self.add_special_case(tag_str, form_str, attrs) - def lemmatize(self, const univ_pos_t univ_pos, attr_t orth, morphology): - if orth not in self.strings: - return orth - cdef unicode py_string = self.strings[orth] - if self.lemmatizer is None: - return self.strings.add(py_string.lower()) - cdef list lemma_strings - cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) - lemma_string = lemma_strings[0] - lemma = self.strings.add(lemma_string) - return lemma IDS = {