diff --git a/spacy/fi/__init__.py b/spacy/fi/__init__.py
new file mode 100644
index 000000000..8e7173767
--- /dev/null
+++ b/spacy/fi/__init__.py
@@ -0,0 +1,11 @@
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+
+
+class Finnish(Language):
+    @classmethod
+    def default_data_dir(cls):
+        return path.join(path.dirname(__file__), 'data')
diff --git a/spacy/language.py b/spacy/language.py
index 2a07d1f5f..36ca5c636 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -148,13 +148,10 @@ class Language(object):
             vectors = cls.default_vectors(data_dir)
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs(data_dir)
-        if morphology is None:
-            morphology = cls.default_morphology(path.join(data_dir, 'vocab'))
         return Vocab.from_dir(
             path.join(data_dir, 'vocab'),
             get_lex_attr=get_lex_attr,
-            vectors=vectors,
-            morphology=morphology)
+            vectors=vectors)
 
     @classmethod
     def default_tokenizer(cls, vocab, data_dir):
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
index e0f85f96f..eb2bb97f5 100644
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,18 +1,41 @@
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMapArray
+from libc.stdint cimport uint64_t
+
 from .structs cimport TokenC
 from .strings cimport StringStore
+from .typedefs cimport attr_t
+from .parts_of_speech cimport univ_pos_t
+
+
+cdef struct RichTagC:
+    uint64_t morph
+    int id
+    univ_pos_t pos
+    attr_t name
+
+
+cdef struct MorphAnalysisC:
+    RichTagC tag
+    attr_t lemma
 
 
 cdef class Morphology:
+    cdef readonly Pool mem
     cdef readonly object strings
     cdef public object lemmatizer
-    cdef public object tag_map
+    cdef public object n_tags
+    cdef public object reverse_index
     cdef public object tag_names
-    cdef public object tag_ids
-    cdef public int n_tags
 
-    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
+    cdef RichTagC* rich_tags
+    cdef PreshMapArray _cache
+
+    cdef int assign_tag(self, TokenC* token, tag) except -1
+
+    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
+
 
-    cdef int assign_from_dict(self, TokenC* token, props) except -1
 
 #
 #cpdef enum Feature_t:
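The new RichTagC/MorphAnalysisC pair replaces the old heap-allocated MorphologyC: each coarse tag gets one row holding its row id, its interned name, its universal POS, and a 64-bit morph feature set, while reverse_index maps interned tag names back to row ids. A pure-Python sketch of that bookkeeping follows; the dict standing in for StringStore and the {'VBZ': {'pos': 'VERB'}} tag_map shape are assumptions for illustration, not something the patch pins down.

    # Dict-based mirror of the RichTagC table that Morphology.__init__
    # (below) builds in C. Field names follow morphology.pxd.
    def build_tag_table(tag_map, strings):
        rich_tags = []       # rich_tags[i] plays the role of RichTagC at row i
        reverse_index = {}   # interned tag name -> row id
        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
            name = strings.setdefault(tag_str, len(strings) + 1)
            rich_tags.append({'id': i, 'name': name,
                              'pos': props.get('pos'), 'morph': 0})
            reverse_index[name] = i
        return rich_tags, reverse_index

    strings = {}
    rich_tags, reverse_index = build_tag_table(
        {'NN': {'pos': 'NOUN'}, 'VBZ': {'pos': 'VERB'}}, strings)
    assert rich_tags[reverse_index[strings['VBZ']]]['pos'] == 'VERB'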
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 7f6afa016..acca5eb9e 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -6,15 +6,10 @@ try:
 except ImportError:
     import json
 
-from spacy.parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech cimport ADJ, VERB, NOUN
 
 
-cdef struct MorphAnalysisC:
-    uint64_t[4] features
-    attr_t lemma
-    attr_t pos
-
-
 cdef class Morphology:
     @classmethod
     def from_dir(cls, data_dir, lemmatizer=None):
@@ -23,32 +18,41 @@ cdef class Morphology:
             lemmatizer = Lemmatizer.from_dir(data_dir)
         return cls(tag_map, {}, lemmatizer)
 
-    def __init__(self, tag_map, fused_tokens, lemmatizer):
+    def __init__(self, string_store, tag_map, lemmatizer):
+        self.mem = Pool()
+        self.strings = string_store
         self.lemmatizer = lemmatizer
-        self.tag_map = tag_map
         self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_ids = {}
-        for i, tag_str in enumerate(self.tag_names):
-            self.tag_ids[tag_str] = i
-        self._cache = PreshMapArray()
+        self.reverse_index = {}
+        # The rich-tag table must be allocated before the loop writes to it.
+        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
+        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
+            self.rich_tags[i].id = i
+            self.rich_tags[i].name = self.strings[tag_str]
+            self.rich_tags[i].pos = UNIV_POS_NAMES[props['pos'].upper()]
+            self.rich_tags[i].morph = 0
+            self.reverse_index[self.rich_tags[i].name] = i
+        self._cache = PreshMapArray(self.n_tags)
 
-    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1:
-        analysis = self._cache.get(tag, token.lex.orth)
+    cdef int assign_tag(self, TokenC* token, tag) except -1:
+        cdef int tag_id = self.reverse_index[self.strings[tag]] if isinstance(tag, basestring) else tag
+        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
             analysis = self.mem.alloc(1, sizeof(MorphAnalysisC))
-            cached = self.decode_tag(tag)
-            cached.lemma = self.lemmatize(token.pos, token.lex)
+            analysis.tag = self.rich_tags[tag_id]
+            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
+            self._cache.set(tag_id, token.lex.orth, analysis)
         token.lemma = analysis.lemma
-        token.pos = analysis.pos
-        token.tag = analysis.tag
-        token.morph = analysis.features
+        token.pos = analysis.tag.pos
+        token.tag = analysis.tag.name
+        token.morph = analysis.tag.morph
 
-    cdef int assign_feature(self, TokenC* token, feature, value) except -1:
+    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
         pass
 
     def load_morph_exceptions(self, dict exc):
-        # Map (form, pos) to (lemma, inflection)
+        # Map (form, pos) to (lemma, rich tag)
         cdef unicode pos_str
         cdef unicode form_str
         cdef unicode lemma_str
@@ -57,121 +61,31 @@ cdef class Morphology:
         cdef int lemma
         cdef attr_t orth
         cdef int pos
-        for pos_str, entries in exc.items():
-            pos = self.tag_names.index(pos_str)
+        for tag_str, entries in exc.items():
+            tag = self.strings[tag_str]
+            rich_tag = self.rich_tags[self.reverse_index[tag]]
             for form_str, props in entries.items():
-                lemma_str = props.get('L', form_str)
-                orth = self.strings[form_str]
                 cached = self.mem.alloc(1, sizeof(MorphAnalysisC))
-                cached.lemma = self.strings[lemma_str]
-                self.set_features(cached, props)
-                self._cache.set(pos, orth, cached)
+                cached.tag = rich_tag
+                orth = self.strings[form_str]
+                for name_str, value_str in props.items():
+                    if name_str == 'L':
+                        cached.lemma = self.strings[value_str]
+                    else:
+                        self.assign_feature(&cached.tag.morph, name_str, value_str)
+                if cached.lemma == 0:
+                    cached.lemma = self.lemmatize(rich_tag.pos, orth)
+                self._cache.set(rich_tag.id, orth, cached)
 
-    def _load_special_tokenization(self, special_cases):
-        '''Add a special-case tokenization rule.
-        '''
-        cdef int i
-        cdef list substrings
-        cdef unicode chunk
-        cdef unicode form
-        cdef unicode lemma
-        cdef dict props
-        cdef LexemeC** lexemes
-        cdef hash_t hashed
-        for chunk, substrings in sorted(special_cases.items()):
-            tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
-            for i, props in enumerate(substrings):
-                # Set the special tokens up to have morphology and lemmas if
-                # specified, otherwise use the part-of-speech tag (if specified)
-                form = props['F']
-                tokens[i].lex = self.vocab.get(self.vocab.mem, form)
-                morphology = self.vocab.morphology.decode_dict(props)
-                tokens[i].lemma = morph_analysis.lemma
-                tokens[i].pos = morph_analysis.pos
-                tokens[i].tag = morph_analysis.tag
-                tokens[i].morph = morph_analysis.morph
-            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-            cached.length = len(substrings)
-            cached.is_lex = False
-            cached.data.tokens = tokens
-            hashed = hash_string(chunk)
-            self._specials.set(hashed, cached)
-            self._cache.set(hashed, cached)
-
-
-
-#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
-#    morph.number = props.get('number', 0)
-#    morph.tenspect = props.get('tenspect', 0)
-#    morph.mood = props.get('mood', 0)
-#    morph.gender = props.get('gender', 0)
-#    morph.person = props.get('person', 0)
-#    morph.case = props.get('case', 0)
-#    morph.misc = props.get('misc', 0)
-#
-#
-#cdef class Morphology:
-#    cdef Pool mem
-#    cdef PreshMap table
-#
-#    def __init__(self, tags, exceptions):
-#        pass
-#
-#    def __getitem__(self, hash_t id_):
-#        pass
-#
-#    cdef const InflectionC* get(self, hash_t key) except NULL:
-#        pass
-#
-#    cdef MorphAnalysis analyse(const TokenC* token) except -1:
-#        cdef struct MorphAnalysis morphology
-#        tokens[i].pos = tag.pos
-#        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
-#        if cached is NULL:
-#            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
-#            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
-#            cached.morph = tag.morph
-#            self._morph_cache.set(tag.id, tokens[i].lex.orth, cached)
-#        tokens[i].lemma = cached.lemma
-#        tokens[i].morph = cached.morph
-#
-#    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
-#        if self.lemmatizer is None:
-#            return lex.orth
-#        cdef unicode py_string = self.strings[lex.orth]
-#        if pos != NOUN and pos != VERB and pos != ADJ:
-#            return lex.orth
-#        cdef set lemma_strings
-#        cdef unicode lemma_string
-#        lemma_strings = self.lemmatizer(py_string, pos)
-#        lemma_string = sorted(lemma_strings)[0]
-#        lemma = self.strings[lemma_string]
-#        return lemma
-#
-#
-#cdef class Inflection:
-#    cdef InflectionC* c
-#
-#    def __init__(self, container, id_):
-#        self.c = container[id_]
-#        self.container = container
-#
-#        for i, feat_id in enumerate(feat_ids):
-#            feature, value = parse_id(feat_id)
-#            self.add_value(feature, value, True)
-#
-#    def has(self, Value_t feat_value_id):
-#        part = feat_value_id % 64
-#        bit = feat_value_id / 64
-#        if self.value_set[part] & bit:
-#            return True
-#        else:
-#            return False
-#
-#    property pos:
-#        def __get__(self): return self.c.pos
-#
-#    property id:
-#        def __get__(self): return self.c.id
-#
-#    property features:
-#        pass
+    def lemmatize(self, const univ_pos_t pos, attr_t orth):
+        if self.lemmatizer is None:
+            return orth
+        cdef unicode py_string = self.strings[orth]
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return orth
+        cdef set lemma_strings
+        cdef unicode lemma_string
+        lemma_strings = self.lemmatizer(py_string, pos)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings[lemma_string]
+        return lemma
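assign_tag now memoises one MorphAnalysisC per (tag id, orth) pair, so the lemmatizer runs at most once per distinct word form and tag. Continuing the pure-Python sketch above; toy_lemmatize is a stand-in, not spaCy's Lemmatizer.

    cache = {}  # plays the role of the PreshMapArray

    def toy_lemmatize(pos, orth):
        return orth.rstrip('s') if pos == 'NOUN' else orth

    def assign_tag(token, tag_id, rich_tags):
        analysis = cache.get((tag_id, token['orth']))
        if analysis is None:
            tag = rich_tags[tag_id]
            analysis = {'tag': tag,
                        'lemma': toy_lemmatize(tag['pos'], token['orth'])}
            cache[(tag_id, token['orth'])] = analysis
        token.update(lemma=analysis['lemma'], pos=analysis['tag']['pos'],
                     tag=analysis['tag']['name'], morph=analysis['tag']['morph'])

    token = {'orth': 'dogs'}
    assign_tag(token, reverse_index[strings['NN']], rich_tags)
    assert token['lemma'] == 'dog' and token['pos'] == 'NOUN'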
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index f150fa312..a0a3d65a3 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -25,17 +25,6 @@ cdef struct LexemeC:
     float sentiment
     float l2_norm
 
-cdef struct MorphFeatC:
-    int name
-    int value
-
-
-cdef struct MorphologyC:
-    uint64_t[4] feature_set
-    MorphFeatC* features
-    univ_pos_t pos
-    int n
-
 
 cdef struct Entity:
     int start
@@ -54,8 +43,8 @@ cdef struct Constituent:
 
 cdef struct TokenC:
     const LexemeC* lex
-    const MorphologyC* morph
     const Constituent* ctnt
+    uint64_t morph
     univ_pos_t pos
     bint spacy
     int tag
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index dff96e6ea..6fea4af88 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -104,7 +104,7 @@ cdef class Tagger:
 
     @classmethod
     def blank(cls, vocab, templates):
-        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        model = Model(vocab.n_tags, templates, model_loc=None)
         return cls(vocab, model)
 
     @classmethod
@@ -113,7 +113,7 @@ cdef class Tagger:
             templates = json.loads(open(path.join(data_dir, 'templates.json')))
         else:
             templates = cls.default_templates()
-        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        model = Model(vocab.n_tags, templates, data_dir)
         return cls(vocab, model)
 
     def __init__(self, Vocab vocab, model):
@@ -128,7 +128,7 @@ cdef class Tagger:
 
     @property
     def tag_names(self):
-        return self.vocab.morphology.tag_names
+        return self.vocab.tag_names
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -143,14 +143,15 @@ cdef class Tagger:
         for i in range(tokens.length):
            if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
@@ -168,7 +169,9 @@ cdef class Tagger:
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
-            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
+
+            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
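structs.pxd collapses the old MorphologyC pointer on TokenC into a single inline uint64_t feature set, but assign_feature is still a stub, so the packing is undecided. One plausible scheme, offered purely as an assumption about where this is headed: one bit per (feature, value) pair drawn from a fixed inventory.

    # Hypothetical bit-packing for the uint64 morph field. The feature
    # inventory below is invented for illustration; the patch defines none.
    FEATURE_BITS = {('Number', 'sing'): 0, ('Number', 'plur'): 1,
                    ('Tense', 'pres'): 2, ('Tense', 'past'): 3}

    def assign_feature(morph, feature, value):
        return morph | (1 << FEATURE_BITS[(feature, value)])

    def has_feature(morph, feature, value):
        return bool(morph & (1 << FEATURE_BITS[(feature, value)]))

    morph = 0
    morph = assign_feature(morph, 'Number', 'sing')
    morph = assign_feature(morph, 'Tense', 'pres')
    assert has_feature(morph, 'Tense', 'pres')
    assert not has_feature(morph, 'Number', 'plur')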
diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index 19b8aa026..9d60d2a6e 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -7,12 +7,7 @@ from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
-from .vocab cimport Vocab, _Cached
-
-
-cdef union LexemesOrTokens:
-    const LexemeC* const* lexemes
-    TokenC* tokens
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 
 
 cdef class Tokenizer:
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 38daf1c5a..d54770d2b 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -192,9 +192,7 @@ cdef class Tokenizer:
                 tokens.push_back(prefixes[0][i], False)
         if string:
             cache_hit = self._try_cache(hash_string(string), tokens)
-            if cache_hit:
-                pass
-            else:
+            if not cache_hit:
                 match = self.find_infix(string)
                 if match is None:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
@@ -253,38 +251,10 @@ cdef class Tokenizer:
         cdef LexemeC** lexemes
         cdef hash_t hashed
         for chunk, substrings in sorted(special_cases.items()):
-            tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
-            for i, props in enumerate(substrings):
-                form = props['F']
-                tokens[i].lex = self.vocab.get(self.vocab.mem, form)
-                lemma = props.get('L', form)
-                tokens[i].lemma = self.vocab.strings[lemma]
-                #TODO
-                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
-            cached.data.tokens = tokens
-            hashed = hash_string(chunk)
-            self._specials.set(hashed, cached)
-            self._cache.set(hashed, cached)
-
-
-#if lemma is not None:
-#    tokens[i].lemma = self.vocab.strings[lemma]
-#else:
-#    tokens[i].lemma = 0
-#if 'pos' in props:
-#    inflection = self.vocab.morphology.get(props['pos'])
-#    inflection.assign(&tokens[i])
-#    # These are defaults, which can be over-ridden by the
-#    # token-specific props.
-#    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
-#    #tokens[i].pos = pos
-#    ## These are defaults, which can be over-ridden by the
-#    ## token-specific props.
-#    #set_morph_from_dict(&tokens[i].morph, morph_features)
-#    #if tokens[i].lemma == 0:
-#    #    tokens[i].lemma = tokens[i].lex.orth
-##set_morph_from_dict(&tokens[i].morph, props)
-
+            cached.data.tokens = self.vocab.make_fused_token(substrings)
+            key = hash_string(chunk)
+            self._specials.set(key, cached)
+            self._cache.set(key, cached)
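With the token-building loop gone, _load_special_tokenization only allocates the _Cached entry and delegates the per-token work to Vocab.make_fused_token. The shape of the special-case data it consumes, as implied by make_fused_token further down: one props dict per sub-token, with 'F' (surface form) required and 'L' (lemma), 'pos' and 'morph' optional. The analysis for "don't" below is illustrative, and a real 'pos' value must name a tag present in the tag map.

    special_cases = {
        "don't": [
            {'F': 'do'},
            {'F': "n't", 'L': 'not', 'pos': 'RB'},
        ],
    }
    # Every sub-token needs a surface form; the forms concatenate to the chunk.
    for chunk, substrings in sorted(special_cases.items()):
        assert all('F' in props for props in substrings)
        assert ''.join(props['F'] for props in substrings) == chunk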
diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 121018770..a13858175 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
 
 ctypedef const LexemeC* const_Lexeme_ptr
-ctypedef TokenC* TokenC_ptr
+ctypedef const TokenC* const_TokenC_ptr
 
 ctypedef fused LexemeOrToken:
     const_Lexeme_ptr
-    TokenC_ptr
+    const_TokenC_ptr
 
 
 cdef class Doc:
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 0fa562dfb..80facc8db 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -209,7 +209,7 @@ cdef class Doc:
         if self.length == self.max_length:
             self._realloc(self.length * 2)
         cdef TokenC* t = &self.data[self.length]
-        if LexemeOrToken is TokenC_ptr:
+        if LexemeOrToken is const_TokenC_ptr:
             t[0] = lex_or_tok[0]
         else:
             t.lex = lex_or_tok
diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd
index 5c88dca68..d9bf32582 100644
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
 
 cdef union LexemesOrTokens:
     const LexemeC* const* lexemes
-    TokenC* tokens
+    const TokenC* tokens
 
 
 cdef struct _Cached:
@@ -37,6 +37,7 @@ cdef class Vocab:
 
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const TokenC* make_fused_token(self, substrings) except NULL
 
    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
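Renaming TokenC_ptr to const_TokenC_ptr threads const-correctness from the cached specials through the LexemeOrToken fused type into Doc.push_back, which copies a whole read-only TokenC in one branch and only seeds the lex field in the other. In dict form, purely as illustration:

    def push_back(doc, lex_or_tok, has_space):
        if 'lex' in lex_or_tok:           # const_TokenC_ptr branch: full copy
            token = dict(lex_or_tok)
        else:                             # const_Lexeme_ptr branch: lex only
            token = {'lex': lex_or_tok}
        token['spacy'] = has_space        # trailing-space flag
        doc.append(token)

    doc = []
    push_back(doc, {'orth': 'do'}, False)                           # lexeme
    push_back(doc, {'lex': {'orth': "n't"}, 'lemma': 'not'}, True)  # cached token
    assert doc[1]['lemma'] == 'not'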
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index fa196166e..085fb38f9 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -17,6 +17,7 @@ from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
+from .lemmatizer import Lemmatizer
 
 from cymem.cymem cimport Address
 from . import util
@@ -36,20 +37,15 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
-    @classmethod
-    def default_morphology(cls):
-        return Morphology({'VBZ': ['VERB', {}]}, [], None)
-
-    def __init__(self, get_lex_attr=None, morphology=None, vectors=None):
-        self.get_lex_attr = get_lex_attr
-        if morphology is None:
-            morphology = self.default_morphology()
-        self.morphology = morphology
-
+    def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
+        if tag_map is None:
+            tag_map = {}
         self.mem = Pool()
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
+        self.get_lex_attr = get_lex_attr
+        self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
         self.length = 1
         self._serializer = None
@@ -60,10 +56,9 @@ cdef class Vocab:
             raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
         if not path.isdir(data_dir):
             raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors,
-                              morphology=morphology)
-        self.load_lexemes(path.join(data_dir, 'strings.txt'),
-                          path.join(data_dir, 'lexemes.bin'))
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
+        self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
         if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
             self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
         return self
@@ -172,6 +167,22 @@ cdef class Vocab:
             orth = id_or_string
         return Lexeme(self, orth)
 
+    cdef const TokenC* make_fused_token(self, substrings) except NULL:
+        cdef int i
+        tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+        for i, props in enumerate(substrings):
+            token = &tokens[i]
+            # Set the token up with the morphology and lemma supplied in
+            # props, falling back to the part-of-speech tag where given.
+            token.lex = self.get(self.mem, props['F'])
+            if 'pos' in props:
+                self.morphology.assign_tag(token, props['pos'])
+            if 'L' in props:
+                tokens[i].lemma = self.strings[props['L']]
+            for feature, value in props.get('morph', {}).items():
+                self.morphology.assign_feature(&token.morph, feature, value)
+        return tokens
+
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)
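Vocab.from_dir now reads tag_map.json from the vocab data directory and builds the Morphology itself, rather than accepting a prebuilt morphology object. A minimal sketch of that file, assuming the same {tag: {'pos': ...}} shape as above; the path and the two-tag map are hypothetical.

    import json

    tag_map = {'NN': {'pos': 'NOUN'}, 'VBZ': {'pos': 'VERB'}}
    with open('/tmp/vocab/tag_map.json', 'w') as f:   # hypothetical path
        json.dump(tag_map, f)

    # Vocab.from_dir('/tmp/vocab', ...) would read this file back, next to
    # strings.txt and lexemes.bin, and construct
    # Morphology(self.strings, tag_map, Lemmatizer({}, {}, {})).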