mirror of https://github.com/explosion/spaCy.git
* Move morphological analysis into its own module, morphology.pyx
parent b962fe73d7
commit 6b34a2f34b
spacy/en.pyx | 12
spacy/en.pyx
@@ -35,8 +35,8 @@ from __future__ import unicode_literals
 cimport lang
 from .typedefs cimport flags_t
 import orth
-from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
-from .tagger cimport X, PUNCT, EOL
+from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .morphology cimport X, PUNCT, EOL

 from .tokens cimport Morphology

@@ -154,8 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)

     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -165,8 +165,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)
             c += t[i].pos == golds[i]
         return c

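In both loops above, the inline _merge_morph and lemmatize calls are replaced by one optional Morphologizer.set_morph call per token. A minimal pure-Python sketch of the resulting control flow (Token and the function name are hypothetical stand-ins for the Cython types in this diff, not the real spaCy API):

    # Sketch of the new tagging loop: predict a tag, then optionally
    # hand the token to the morphologizer for lemma + features.
    class Token:
        def __init__(self, text):
            self.text = text
            self.pos = 0
            self.lemma = None
            self.morph = {}

    def tag_and_morphologize(tokens, pos_tagger, morphologizer=None):
        for i, token in enumerate(tokens):
            token.pos = pos_tagger.predict(token)      # coarse POS prediction
            if morphologizer is not None:              # morphology is optional:
                morphologizer.set_morph(i, tokens)     # it fills lemma + features
        return tokens

Keeping the morphologizer optional means tagging still works when no morphology data has been loaded.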
spacy/lang.pxd
@@ -2,15 +2,15 @@ from libcpp.vector cimport vector

 from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER

-from preshed.maps cimport PreshMap, PreshMapArray
+from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool

 from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
+from .morphology cimport Morphologizer


 cdef union LexemesOrTokens:
@@ -40,17 +40,14 @@ cdef class Language:
     cdef readonly unicode name
     cdef PreshMap _cache
     cdef PreshMap _specials
-    cdef PreshMapArray _lemmas
     cpdef readonly Lexicon lexicon
     cpdef readonly Tagger pos_tagger
-    cpdef readonly object lemmatizer
+    cpdef readonly Morphologizer morphologizer

     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re

-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
-
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)

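This declaration change is the core of the refactor: Language drops its _lemmas cache and lemmatize method, and the bare lemmatizer attribute gives way to a Morphologizer that owns both. A rough pure-Python picture of the new composition (hypothetical, simplified; not the real Cython API):

    class Language:
        def __init__(self, name, lexicon):
            self.name = name
            self.lexicon = lexicon
            self.pos_tagger = None      # set lazily by load()
            self.morphologizer = None   # now owns the lemmatizer and lemma cache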
spacy/lang.pyx
@@ -14,7 +14,6 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
-from .lemmatizer import Lemmatizer

 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
@@ -26,8 +25,6 @@ from .utf8string cimport slice_unicode
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens

-from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
-from .tokens cimport Morphology


@@ -43,39 +40,16 @@ cdef class Language:
         self._infix_re = re.compile(infix)
         self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
-        self._lemmas = PreshMapArray(N_UNIV_TAGS)
         self.pos_tagger = None
-        self.lemmatizer = None
+        self.morphologizer = None

     def load(self):
-        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
         self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
-
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
-        if self.lemmatizer is None:
-            return lex.sic
-        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
-        if lemma != 0:
-            return lemma
-        cdef bytes py_string = self.lexicon.strings[lex.sic]
-        cdef set lemma_strings
-        cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
-        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos, lex.sic, <void*>lemma)
-        return lemma
+            self.morphologizer = Morphologizer(self.lexicon.strings,
+                                               path.join(util.DATA_DIR, self.name))

     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])

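With lemmatize gone, load() wires the Morphologizer up next to the Tagger, so morphology data is only touched when a POS model directory exists, and Language no longer builds a Lemmatizer at all. A pure-Python sketch of that wiring, assuming the data layout used in this diff (DATA_DIR/<name>/pos); the function and parameter names are stand-ins:

    import os

    def load_language(lang, data_dir, tagger_cls, morphologizer_cls):
        # Lexicon data loads unconditionally, following the diff's paths.
        lang.lexicon.load(os.path.join(data_dir, lang.name, 'lexemes'))
        lang.lexicon.strings.load(os.path.join(data_dir, lang.name, 'strings'))
        pos_dir = os.path.join(data_dir, lang.name, 'pos')
        if os.path.exists(pos_dir):
            lang.pos_tagger = tagger_cls(pos_dir)
            # The Morphologizer now constructs its own Lemmatizer internally.
            lang.morphologizer = morphologizer_cls(lang.lexicon.strings,
                                                   os.path.join(data_dir, lang.name))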
spacy/morphology.pxd
@@ -0,0 +1,42 @@
+from .tokens cimport TokenC, Morphology
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
+
+from preshed.maps cimport PreshMapArray
+from cymem.cymem cimport Pool
+
+# Google universal tag set
+cpdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
+cdef struct PosTag:
+    Morphology morph
+    int id
+    univ_tag_t pos
+
+
+cdef class Morphologizer:
+    cdef Pool mem
+    cdef StringStore strings
+    cdef object lemmatizer
+    cdef PosTag* tags
+
+    cdef PreshMapArray _morph
+    cdef PreshMapArray _lemmas
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1
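The new header centralises the universal tag set and the PosTag struct, which pairs a tag id with its universal POS and a fixed bundle of morphological features. A hypothetical example of how one tag_map entry from config.json would populate a PosTag; the entries and feature values here are invented for illustration, and univ_ids plays the role of the UNIV_TAGS table in morphology.pyx below:

    # Hypothetical tag_map entries: fine-grained tag -> (universal POS, features).
    tag_map = {
        'NNS': ('NOUN', {'number': 2}),      # e.g. plural noun
        'VBD': ('VERB', {'tenspect': 2}),    # e.g. past tense
        'JJ':  ('ADJ',  {}),
    }

    def make_pos_tag(i, name, tag_map, univ_ids):
        pos_name, props = tag_map[name]
        features = ('number', 'tenspect', 'mood', 'gender',
                    'person', 'case', 'misc')
        return {
            'id': i,                         # PosTag.id: index into the tag list
            'pos': univ_ids[pos_name],       # PosTag.pos: univ_tag_t value
            'morph': {k: props.get(k, 0) for k in features},  # PosTag.morph
        }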
spacy/morphology.pyx
@@ -0,0 +1,81 @@
+from os import path
+import json
+
+from .lemmatizer import Lemmatizer
+
+
+UNIV_TAGS = {
+    'NULL': NO_TAG,
+    'ADJ': ADJ,
+    'ADV': ADV,
+    'ADP': ADP,
+    'CONJ': CONJ,
+    'DET': DET,
+    'NOUN': NOUN,
+    'NUM': NUM,
+    'PRON': PRON,
+    'PRT': PRT,
+    'VERB': VERB,
+    'X': X,
+    '.': PUNCT,
+    'EOL': EOL
+}
+
+
+cdef class Morphologizer:
+    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
+    """
+    def __init__(self, StringStore strings, data_dir):
+        self.mem = Pool()
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        tag_map = cfg['tag_map']
+        tag_names = cfg['tag_names']
+        self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
+        self._lemmas = PreshMapArray(N_UNIV_TAGS)
+        self._morph = PreshMapArray(len(tag_names))
+        self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
+        for i, tag in enumerate(tag_names):
+            pos, props = tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            self.tags[i].morph.number = props.get('number', 0)
+            self.tags[i].morph.tenspect = props.get('tenspect', 0)
+            self.tags[i].morph.mood = props.get('mood', 0)
+            self.tags[i].morph.gender = props.get('gender', 0)
+            self.tags[i].morph.person = props.get('person', 0)
+            self.tags[i].morph.case = props.get('case', 0)
+            self.tags[i].morph.misc = props.get('misc', 0)
+
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return lex.sic
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
+        if lemma != 0:
+            return lemma
+        cdef bytes py_string = self.strings[lex.sic]
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        if pos == NOUN:
+            lemma_strings = self.lemmatizer.noun(py_string)
+        elif pos == VERB:
+            lemma_strings = self.lemmatizer.verb(py_string)
+        else:
+            assert pos == ADJ
+            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
+        return lemma
+
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
+        cdef const PosTag* tag = &self.tags[tokens[i].pos]
+        tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex)
+        morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma)
+        if morph is NULL:
+            self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph)
+            tokens[i].morph = tag.morph
+        else:
+            tokens[i].morph = morph[0]
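Morphologizer.lemmatize memoizes lemmas per (universal POS, lexeme id) in _lemmas, consults the WordNet lemmatizer only for nouns, verbs and adjectives, and picks the alphabetically first candidate. The same logic as a pure-Python sketch (class and attribute names are hypothetical stand-ins):

    class LemmaCache:
        """Sketch of the lemma handling above, not the real Cython API."""
        def __init__(self, strings, lemmatizer):
            self.strings = strings        # StringStore-like id <-> string table
            self.lemmatizer = lemmatizer  # has .noun/.verb/.adj -> set of strings
            self._lemmas = {}             # (pos, word id) -> lemma id

        def lemmatize(self, pos, sic):
            if pos not in ('noun', 'verb', 'adj'):
                return sic                # other tags: the form is its own lemma
            key = (pos, sic)
            if key in self._lemmas:       # cache hit, like _lemmas.get in the diff
                return self._lemmas[key]
            candidates = getattr(self.lemmatizer, pos)(self.strings[sic])
            lemma_string = sorted(candidates)[0]   # deterministic pick, as above
            lemma = self.strings.intern(lemma_string)
            self._lemmas[key] = lemma
            return lemma

The sorted()[0] choice mirrors the diff: WordNet can return several lemma candidates, and taking the lexicographically smallest keeps the result stable across runs.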
spacy/tagger.pxd
@@ -12,31 +12,6 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, Morphology
-
-
-# Google universal tag set
-cdef enum univ_tag_t:
-    NO_TAG
-    ADJ
-    ADV
-    ADP
-    CONJ
-    DET
-    NOUN
-    NUM
-    PRON
-    PRT
-    VERB
-    X
-    PUNCT
-    EOL
-    N_UNIV_TAGS
-
-
-cdef struct PosTag:
-    Morphology morph
-    int id
-    univ_tag_t pos


 cdef class Tagger:
     cdef class_t predict(self, const atom_t* context, object golds=*) except *

@@ -45,5 +20,4 @@ cdef class Tagger:
     cpdef readonly LinearModel model

     cpdef readonly list tag_names
-    cdef PosTag* tags
     cdef dict tagdict
spacy/tagger.pyx
@@ -34,23 +34,10 @@ cdef class Tagger:
         self.mem = Pool()
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
-        tag_map = cfg['tag_map']
-        univ_counts = {}
-        cdef unicode tag
-        cdef unicode univ_tag
         self.tag_names = cfg['tag_names']
-        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
-        for i, tag in enumerate(self.tag_names):
-            pos, props = tag_map[tag]
-            self.tags[i].id = i
-            self.tags[i].pos = pos
-            self.tags[i].morph.number = props.get('number', 0)
-            self.tags[i].morph.tenspect = props.get('tenspect', 0)
-            self.tags[i].morph.mood = props.get('mood', 0)
-            self.tags[i].morph.gender = props.get('gender', 0)
-            self.tags[i].morph.person = props.get('person', 0)
-            self.tags[i].morph.case = props.get('case', 0)
-            self.tags[i].morph.misc = props.get('misc', 0)
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
@@ -85,23 +72,6 @@ cdef class Tagger:
         return tag_id


-UNIV_TAGS = {
-    'NULL': NO_TAG,
-    'ADJ': ADJ,
-    'ADV': ADV,
-    'ADP': ADP,
-    'CONJ': CONJ,
-    'DET': DET,
-    'NOUN': NOUN,
-    'NUM': NUM,
-    'PRON': PRON,
-    'PRT': PRT,
-    'VERB': VERB,
-    'X': X,
-    '.': PUNCT,
-    'EOL': EOL
-}
-

 def _make_tag_dict(counts):
     freq_thresh = 50
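The visible tail of tagger.pyx keeps _make_tag_dict, whose freq_thresh = 50 suggests a frequency-gated tag dictionary: frequent, nearly unambiguous words bypass the model and get their majority tag directly. A sketch of that pattern, an assumption based only on the threshold shown here (the ambiguity cutoff and all names are invented), not on code in this diff:

    def make_tag_dict(counts, freq_thresh=50, ambiguity_thresh=0.98):
        # counts: word -> {tag: count}. Keep only words seen often enough
        # whose counts are dominated by a single tag.
        tagdict = {}
        for word, tag_counts in counts.items():
            total = sum(tag_counts.values())
            tag, n = max(tag_counts.items(), key=lambda kv: kv[1])
            if total >= freq_thresh and n / total >= ambiguity_thresh:
                tagdict[word] = tag
        return tagdict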