from os import path

from .lemmatizer import Lemmatizer

try:
    import ujson as json
except ImportError:
    import json

from spacy.parts_of_speech import UNIV_POS_NAMES


cdef class Morphology:
    """Assign part-of-speech, tag and lemma attributes to tokens from a tag map.

    The tag map is a dict keyed by tag string; each value is a dict of
    properties, of which this class reads the 'pos' entry.
    """

    @classmethod
    def from_dir(cls, data_dir, lemmatizer=None):
        """Build a Morphology from the `tag_map.json` file found in `data_dir`.

        If no `lemmatizer` is supplied, one is created with
        `Lemmatizer.from_dir(data_dir)`. The instance is constructed with an
        empty `fused_tokens` dict.
        """
        # Use a context manager so the file handle is closed deterministically
        # instead of being left for the garbage collector.
        with open(path.join(data_dir, 'tag_map.json')) as file_:
            tag_map = json.load(file_)
        if lemmatizer is None:
            lemmatizer = Lemmatizer.from_dir(data_dir)
        return cls(tag_map, {}, lemmatizer)

    def __init__(self, tag_map, fused_tokens, lemmatizer):
        """Store the tag map and lemmatizer and index the tags.

        `tag_names` is the sorted tuple of tag strings; a tag's integer ID is
        its position in that tuple, and `tag_ids` is the reverse mapping.
        `fused_tokens` is accepted but not used by this method.
        """
        self.lemmatizer = lemmatizer
        self.tag_map = tag_map
        self.n_tags = len(tag_map)
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.tag_ids = {tag_str: i for i, tag_str in enumerate(self.tag_names)}

    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1:
        # Set token.pos, token.tag and token.lemma for the tag whose index in
        # self.tag_names is `tag`.
        #
        # NOTE(review): `strings` is indexed both with a unicode tag/lemma and
        # with token.lex.orth; presumably it maps strings to IDs and IDs back
        # to strings -- confirm against StringStore.
        # TODO Caching
        tag_str = self.tag_names[tag]
        props = self.tag_map[tag_str]
        # The 'pos' value is upper-cased before the UNIV_POS_NAMES lookup.
        token.pos = UNIV_POS_NAMES[props['pos'].upper()]
        token.tag = strings[tag_str]
        lemma = self.lemmatizer(strings[token.lex.orth], token.pos)
        token.lemma = strings[lemma]
        #token.inflection = # TODO

    cdef int assign_from_dict(self, TokenC* token, props) except -1:
        # Not implemented yet: currently a no-op.
        pass

    def load_morph_exceptions(self, dict exc):
        """Placeholder: currently does nothing with `exc`.

        The disabled draft below sketches the intended behaviour.
        """
        pass
        # Map (form, pos) to (lemma, inflection)
        #cdef unicode pos_str
        #cdef unicode form_str
        #cdef unicode lemma_str
        #cdef dict entries
        #cdef dict props
        #cdef int lemma
        #cdef attr_t orth
        #cdef int pos
        #for pos_str, entries in exc.items():
        #    pos = self.tag_names.index(pos_str)
        #    for form_str, props in entries.items():
        #        lemma_str = props.get('L', form_str)
        #        orth = self.strings[form_str]
        #        cached = self.mem.alloc(1, sizeof(InflectedLemma))
        #        cached.lemma = self.strings[lemma_str]
        #        set_morph_from_dict(&cached.morph, props)
        #        self._morph_cache.set(pos, orth, cached)


# NOTE(review): everything below is disabled draft code kept for reference;
# it looks like a work-in-progress design and is not executed.

#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
#    morph.number = props.get('number', 0)
#    morph.tenspect = props.get('tenspect', 0)
#    morph.mood = props.get('mood', 0)
#    morph.gender = props.get('gender', 0)
#    morph.person = props.get('person', 0)
#    morph.case = props.get('case', 0)
#    morph.misc = props.get('misc', 0)
#
#
#cdef class Morphology:
#    cdef Pool mem
#    cdef PreshMap table
#
#    def __init__(self, tags, exceptions):
#        pass
#
#    def __getitem__(self, hash_t id_):
#        pass
#
#    cdef const InflectionC* get(self, hash_t key) except NULL:
#        pass
#
#    cdef MorphAnalysis analyse(const TokenC* token) except -1:
#        cdef struct MorphAnalysis morphology
#        tokens[i].pos = tag.pos
#        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
#        if cached is NULL:
#            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
#            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
#            cached.morph = tag.morph
#            self._morph_cache.set(tag.id, tokens[i].lex.orth, cached)
#        tokens[i].lemma = cached.lemma
#        tokens[i].morph = cached.morph
#
#    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
#        if self.lemmatizer is None:
#            return lex.orth
#        cdef unicode py_string = self.strings[lex.orth]
#        if pos != NOUN and pos != VERB and pos != ADJ:
#            return lex.orth
#        cdef set lemma_strings
#        cdef unicode lemma_string
#        lemma_strings = self.lemmatizer(py_string, pos)
#        lemma_string = sorted(lemma_strings)[0]
#        lemma = self.strings[lemma_string]
#        return lemma
#
#
#cdef class Inflection:
#    cdef InflectionC* c
#
#    def __init__(self, container, id_):
#        self.c = container[id_]
#        self.container = container
#
#        for i, feat_id in enumerate(feat_ids):
#            feature, value = parse_id(feat_id)
#            self.add_value(feature, value, True)
#
#    def has(self, Value_t feat_value_id):
#        part = feat_value_id % 64
#        bit = feat_value_id / 64
#        if self.value_set[part] & bit:
#            return True
#        else:
#            return False
#
#    property pos: def __get__(self): return self.c.pos
#
#    property id: def __get__(self): return self.c.id
#
#    property features:
#        pass