From 378729f81af5025f6f45e68a95ca7f5eef24a1a2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:17:21 +0200 Subject: [PATCH] * Hack Morphology class towards usability --- spacy/morphology.pyx | 136 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 127 insertions(+), 9 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 96a4ba884..f32009351 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,11 +1,129 @@ -# cython: embedsignature=True +from os import path + +try: + import ujson as json +except ImportError: + import json + +from spacy.parts_of_speech import UNIV_POS_NAMES + + +cdef class Morphology: + def __init__(self, tag_map, fused_tokens, lemmatizer): + self.tag_map = tag_map + self.n_tags = len(tag_map) + self.tag_names = tuple(sorted(tag_map.keys())) + self.tag_ids = {} + for i, tag_str in enumerate(self.tag_names): + self.tag_ids[tag_str] = i + + @classmethod + def from_dir(cls, data_dir): + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + return cls(tag_map, {}, None) + + cdef int assign_tag(self, TokenC* token, int tag) except -1: + props = self.tag_map[self.tag_names[tag]] + token.pos = UNIV_POS_NAMES[props['pos'].upper()] + token.tag = tag + #token.inflection = # TODO + + cdef int assign_from_dict(self, TokenC* token, props) except -1: + pass + + def load_morph_exceptions(self, dict exc): + pass + # Map (form, pos) to (lemma, inflection) + #cdef unicode pos_str + #cdef unicode form_str + #cdef unicode lemma_str + #cdef dict entries + #cdef dict props + #cdef int lemma + #cdef attr_t orth + #cdef int pos + #for pos_str, entries in exc.items(): + # pos = self.tag_names.index(pos_str) + # for form_str, props in entries.items(): + # lemma_str = props.get('L', form_str) + # orth = self.strings[form_str] + # cached = self.mem.alloc(1, sizeof(InflectedLemma)) + # cached.lemma = self.strings[lemma_str] + # set_morph_from_dict(&cached.morph, props) + # self._morph_cache.set(pos, orth, cached) -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: - morph.number = props.get('number', 0) - morph.tenspect = props.get('tenspect', 0) - morph.mood = props.get('mood', 0) - morph.gender = props.get('gender', 0) - morph.person = props.get('person', 0) - morph.case = props.get('case', 0) - morph.misc = props.get('misc', 0) +#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: +# morph.number = props.get('number', 0) +# morph.tenspect = props.get('tenspect', 0) +# morph.mood = props.get('mood', 0) +# morph.gender = props.get('gender', 0) +# morph.person = props.get('person', 0) +# morph.case = props.get('case', 0) +# morph.misc = props.get('misc', 0) +# +# +#cdef class Morphology: +# cdef Pool mem +# cdef PreshMap table +# +# def __init__(self, tags, exceptions): +# pass +# +# def __getitem__(self, hash_t id_): +# pass +# +# cdef const InflectionC* get(self, hash_t key) except NULL: +# pass +# +# cdef MorphAnalysis analyse(const TokenC* token) except -1: +# cdef struct MorphAnalysis morphology +# tokens[i].pos = tag.pos +# cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) +# if cached is NULL: +# cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) +# cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) +# cached.morph = tag.morph +# self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) +# tokens[i].lemma = cached.lemma +# tokens[i].morph = cached.morph +# +# cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: +# if self.lemmatizer is None: +# return lex.orth +# cdef unicode py_string = self.strings[lex.orth] +# if pos != NOUN and pos != VERB and pos != ADJ: +# return lex.orth +# cdef set lemma_strings +# cdef unicode lemma_string +# lemma_strings = self.lemmatizer(py_string, pos) +# lemma_string = sorted(lemma_strings)[0] +# lemma = self.strings[lemma_string] +# return lemma +# +# +#cdef class Inflection: +# cdef InflectionC* c +# +# def __init__(self, container, id_): +# self.c = container[id_] +# self.container = container +# +# for i, feat_id in enumerate(feat_ids): +# feature, value = parse_id(feat_id) +# self.add_value(feature, value, True) +# +# def has(self, Value_t feat_value_id): +# part = feat_value_id % 64 +# bit = feat_value_id / 64 +# if self.value_set[part] & bit: +# return True +# else: +# return False +# +# property pos: def __get__(self): return self.c.pos +# +# property id: def __get__(self): return self.c.id +# +# property features: +# pass