mirror of https://github.com/explosion/spaCy.git
* Work on morphological processing
This commit is contained in:
parent 7b68f911cf
commit 99bbbb6feb

spacy/en.pxd (51 lines changed)
@@ -5,6 +5,57 @@ from .tokens cimport Tokens
from .tokens cimport TokenC


cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD


cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS
    CARDINAL
    ORDINAL


cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE


cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL


cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    ACCUSATIVE
    GENITIVE
    DEMONYM


cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
    URL
    EMAIL
    EMOTICON


# Flags
cpdef enum FlagID:
    IS_ALPHA
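The new enums carve English morphology into small closed sets of integer codes, one dimension per enum, with the NO_* member listed first so that 0 always means "unmarked". A minimal Python sketch of how such codes combine into a feature bundle (illustrative only; the real values are C-level enum members assigned in declaration order):

    # Rough Python mirrors of two of the cpdef enums above (an illustration, not the real API).
    NO_PERSON, FIRST, SECOND, THIRD = range(4)
    NO_NUMBER, SINGULAR, PLURAL, MASS, CARDINAL, ORDINAL = range(6)

    features = {'person': THIRD, 'number': SINGULAR}
    print(features.get('gender', 0))   # 0 -> no gender marked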
spacy/en.pyx (73 lines changed)

@@ -35,6 +35,63 @@ from __future__ import unicode_literals
cimport lang
from .typedefs cimport flags_t
import orth
from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .tagger cimport X, PUNCT, EOL


POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
}


POS_TEMPLATES = (
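POS_TAGS maps each Penn Treebank tag to a pair of (Google universal tag, morphological features implied by the fine-grained tag); features a tag does not specify are simply absent and later default to 0. A small Python sketch of the lookup, with made-up integer stand-ins for the cimported constants:

    VERB, PRESENT, THIRD = 10, 2, 3   # stand-ins for the cimported enum constants
    POS_TAGS = {
        'VB': (VERB, {}),
        'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    }
    pos, props = POS_TAGS['VBZ']
    assert pos == VERB                                # coarse universal tag
    assert props.get('person', 0) == THIRD            # fine-grained feature
    assert POS_TAGS['VB'][1].get('person', 0) == 0    # unspecified features stay 0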
@@ -91,19 +148,25 @@ cdef class English(Language):
    def set_pos(self, Tokens tokens):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
-           fill_pos_context(context, i, tokens.data)
-           tokens.data[i].pos = self.pos_tagger.predict(context)
+           fill_pos_context(context, i, t)
+           t[i].pos = self.pos_tagger.predict(context)
+           #self.morphalyser.set_token(&t[i])

    def train_pos(self, Tokens tokens, golds):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
-           fill_pos_context(context, i, tokens.data)
-           tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
-           c += tokens.data[i].pos == golds[i]
+           fill_pos_context(context, i, t)
+           t[i].pos = self.pos_tagger.predict(context, [golds[i]])
+           t[i].morph = self.pos_tagger.tags[t[i].pos].morph
+           #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+           c += t[i].pos == golds[i]
        return c


EN = English('en')
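train_pos now writes both the predicted tag and that tag's morphology onto each token, and returns the number of correct predictions. A hedged usage sketch of a driver loop (the training data and its shape are placeholders, and EN.load() is assumed to have been called so that EN.pos_tagger exists):

    # `sents` is hypothetical data: a list of (words, gold_tag_ids) pairs.
    n_correct = 0
    n_total = 0
    for words, gold_tags in sents:
        tokens = EN.tokens_from_list(words)
        n_correct += EN.train_pos(tokens, gold_tags)   # count of correctly tagged tokens
        n_total += len(gold_tags)
    print('tag accuracy: %.3f' % (n_correct / float(n_total)))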
@@ -2,20 +2,20 @@ from libcpp.vector cimport vector
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER

-from preshed.maps cimport PreshMap
+from preshed.maps cimport PreshMap, PreshMapArray
from cymem.cymem cimport Pool

from .typedefs cimport hash_t
from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .tagger cimport PosTag
from .utf8string cimport StringStore, UniStr


cdef class Lexicon:
    cpdef public get_lex_props
    cdef Pool mem
    cpdef readonly size_t size
    cpdef readonly StringStore strings
    cdef vector[Lexeme*] lexemes


@@ -29,13 +29,17 @@ cdef class Language:
    cdef readonly unicode name
    cdef PreshMap _cache
    cdef PreshMap _specials
    cdef PreshMapArray _lemmas
    cpdef readonly Lexicon lexicon
    cpdef readonly Tagger pos_tagger
    cpdef readonly object lemmatizer

    cdef object _prefix_re
    cdef object _suffix_re
    cdef object _infix_re

    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1

    cpdef Tokens tokens_from_list(self, list strings)
    cpdef Tokens tokenize(self, unicode text)
@@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
from cymem.cymem cimport Pool
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap
from .lemmatizer import Lemmatizer

from .lexeme cimport Lexeme
from .lexeme cimport EMPTY_LEXEME

@@ -26,6 +27,8 @@ from . import util
from .util import read_lang_data
from .tokens import Tokens

from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS


cdef class Language:
    def __init__(self, name):

@@ -39,14 +42,40 @@ cdef class Language:
        self._infix_re = re.compile(infix)
        self.lexicon = Lexicon(self.get_props)
        self._load_special_tokenization(rules)
        self._lemmas = PreshMapArray(N_UNIV_TAGS)
        self.pos_tagger = None
        self.lemmatizer = None

    def load(self):
        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
        self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
        self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
        if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
            self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))

    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
            return lex.sic
        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
            return lex.sic
        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
        if lemma != 0:
            return lemma
        cdef bytes py_string = self.lexicon.strings[lex.sic]
        cdef set lemma_strings
        cdef bytes lemma_string
        if pos.pos == NOUN:
            lemma_strings = self.lemmatizer.noun(py_string)
        elif pos.pos == VERB:
            lemma_strings = self.lemmatizer.verb(py_string)
        else:
            assert pos.pos == ADJ
            lemma_strings = self.lemmatizer.adj(py_string)
        lemma_string = sorted(lemma_strings)[0]
        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
        return lemma

    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
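The new Language.lemmatize only lemmatizes nouns, verbs and adjectives, caches results in the per-POS _lemmas array keyed by the token's string id, and interns the chosen lemma into the string store. A plain-Python sketch of the same control flow (the WordNet-backed lemmatizer and the string store are stubbed, which is an assumption; a dict stands in for the PreshMapArray):

    NOUN, VERB, ADJ = 1, 2, 3   # placeholder universal tag ids

    class LemmaCacheSketch(object):
        """Dict-based stand-in for the (pos, string id) -> lemma id cache."""
        def __init__(self, lemmatizer, strings):
            self.lemmatizer = lemmatizer   # has .noun/.verb/.adj -> set of candidate strings
            self.strings = strings         # maps id -> string; .intern(string) -> id
            self._lemmas = {}
        def lemmatize(self, pos, sic):
            if pos not in (NOUN, VERB, ADJ):
                return sic                 # other parts of speech keep their surface form
            key = (pos, sic)
            if key in self._lemmas:
                return self._lemmas[key]
            word = self.strings[sic]
            forms = {NOUN: self.lemmatizer.noun,
                     VERB: self.lemmatizer.verb,
                     ADJ: self.lemmatizer.adj}[pos](word)
            lemma = self.strings.intern(sorted(forms)[0])   # deterministic choice
            self._lemmas[key] = lemma
            return lemma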
@@ -254,9 +283,11 @@ cdef class Lexicon:
        self._map = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.lexemes.push_back(&EMPTY_LEXEME)
        self.size = 2
        self.get_lex_props = get_props

    def __len__(self):
        return self.lexemes.size()

    cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
        '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
        if necessary, using memory acquired from the given pool. If the pool

@@ -269,14 +300,13 @@ cdef class Lexicon:
            mem = self.mem
        cdef unicode py_string = string.chars[:string.n]
        lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-       lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
+       lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
                             self.get_lex_props(py_string))
        if mem is self.mem:
            self._map.set(string.key, lex)
            while self.lexemes.size() < (lex.id + 1):
                self.lexemes.push_back(&EMPTY_LEXEME)
            self.lexemes[lex.id] = lex
            self.size += 1
        else:
            lex[0].id = 1
        return lex

@@ -302,6 +332,8 @@ cdef class Lexicon:
        a dict if the operator is called from Python.
        '''
        if type(id_or_string) == int:
            if id_or_string >= self.lexemes.size():
                raise IndexError
            return self.lexemes.at(id_or_string)[0]
        cdef UniStr string
        slice_unicode(&string, id_or_string, 0, len(id_or_string))

@@ -359,5 +391,4 @@ cdef class Lexicon:
            self.lexemes.push_back(&EMPTY_LEXEME)
        self.lexemes[lexeme.id] = lexeme
        i += 1
        self.size += 1
        fclose(fp)
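Lexicon.get now takes new ids from lexemes.size() and pads the dense vector with EMPTY_LEXEME until the new id fits, so the id doubles as the index into the vector. A small Python sketch of that growth pattern (a list and a None sentinel stand in for the C++ vector and EMPTY_LEXEME):

    EMPTY = None                 # stand-in for &EMPTY_LEXEME

    lexemes = [EMPTY]            # slot 0 stays empty, mirroring the constructor
    def add_lexeme(lex_id, lex):
        # Pad until index lex_id exists, then fill it, so lexemes[lex_id]
        # is always a valid lookup afterwards.
        while len(lexemes) < lex_id + 1:
            lexemes.append(EMPTY)
        lexemes[lex_id] = lex

    add_lexeme(3, {'string': 'example'})
    assert lexemes[3]['string'] == 'example' and lexemes[2] is EMPTY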
@@ -53,6 +53,7 @@ class Lemmatizer(object):


def lemmatize(string, index, exceptions, rules):
    string = string.lower()
    forms = []
    if string in index:
        forms.append(string)

@@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:
        forms.append(string)
    return set(forms)
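lemmatize gathers candidates from the index of known lemmas, the exceptions table, and suffix-rewrite rules, and falls back to the lowercased input when nothing matches, so it always yields at least one form. A toy, self-contained sketch of the rule step (the exceptions table is omitted, the endswith check is one plausible reading of the rule loop not shown in this hunk, and the index/rules below are made-up data, not spaCy's WordNet tables):

    def lemmatize_sketch(string, index, rules):
        string = string.lower()
        forms = []
        if string in index:
            forms.append(string)
        for old, new in rules:                        # suffix rewrite rules
            if string.endswith(old):
                form = string[:len(string) - len(old)] + new
                if form in index:                     # only keep known words
                    forms.append(form)
        if not forms:
            forms.append(string)                      # fall back to the input
        return set(forms)

    print(lemmatize_sketch('Strips', {'strip', 'stripe'}, [('s', '')]))
    # prints the single candidate 'strip'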
@@ -147,6 +147,7 @@ Y PRT
Z NOUN
^ NOUN
~ X
-`` .""".strip().split('\n'))
+`` .
+EOL EOL""".strip().split('\n'))
    return mapping[tag]
@@ -1,11 +1,40 @@
from libc.stdint cimport uint8_t

from cymem.cymem cimport Pool

from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t

from preshed.maps cimport PreshMapArray

from .typedefs cimport hash_t
-from .tokens cimport Tokens
+from .tokens cimport Tokens, Morphology


# Google universal tag set
cdef enum univ_tag_t:
    NO_TAG
    ADJ
    ADV
    ADP
    CONJ
    DET
    NOUN
    NUM
    PRON
    PRT
    VERB
    X
    PUNCT
    EOL
    N_UNIV_TAGS


cdef struct PosTag:
    Morphology morph
    int id
    univ_tag_t pos


cdef class Tagger:

@@ -16,4 +45,5 @@ cdef class Tagger:
    cpdef readonly LinearModel model

    cpdef readonly list tag_names
    cdef PosTag* tags
    cdef dict tagdict
@@ -12,13 +12,14 @@ import cython
from thinc.features cimport Feature, count_feats


-def setup_model_dir(tag_names, tag_counts, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config = {
        'templates': templates,
        'tag_names': tag_names,
        'tag_map': tag_map,
        'tag_counts': tag_counts,
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:

@@ -33,16 +34,31 @@ cdef class Tagger:
        self.mem = Pool()
        cfg = json.load(open(path.join(model_dir, 'config.json')))
        templates = cfg['templates']
        tag_map = cfg['tag_map']
        univ_counts = {}
        cdef unicode tag
        cdef unicode univ_tag
        self.tag_names = cfg['tag_names']
        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
        for i, tag in enumerate(self.tag_names):
            pos, props = tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            self.tags[i].morph.number = props.get('number', 0)
            self.tags[i].morph.tenspect = props.get('tenspect', 0)
            self.tags[i].morph.mood = props.get('mood', 0)
            self.tags[i].morph.gender = props.get('gender', 0)
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
        self.tagdict = _make_tag_dict(cfg['tag_counts'])
        self.extractor = Extractor(templates)
        self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))

-   cdef class_t predict(self, const atom_t* context, object golds=None) except *:
-       """Predict the tag of tokens[i]. The tagger remembers the features and
-       prediction, in case you later call tell_answer.
+   cdef class_t predict(self, atom_t* context, object golds=None) except *:
+       """Predict the tag of tokens[i].

        >>> tokens = EN.tokenize(u'An example sentence.')
        >>> tag = EN.pos_tagger.predict(0, tokens)
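setup_model_dir now stores the tag map next to the tag names, counts and feature templates, and Tagger.__init__ reads it back to fill the PosTag array. A sketch of the kind of dict that ends up in config.json (every concrete value below is invented for illustration, and the exact shape of tag_counts is an assumption):

    config = {
        'templates': [(1,), (2,), (1, 2)],                 # feature template ids (made up)
        'tag_names': ['NULL', 'EOL', 'CC', 'DT', 'VBZ'],
        'tag_map': {'DT': (5, {}),                         # (universal tag code, morph features)
                    'VBZ': (10, {'tenspect': 2, 'person': 3})},
        'tag_counts': {'the': {'DT': 12345}},              # per-word tag frequencies (assumed)
    }
    # json.dump(config, file_) would then round-trip this through config.json.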
@@ -69,6 +85,24 @@ cdef class Tagger:
        return tag_id


UNIV_TAGS = {
    'NULL': NO_TAG,
    'ADJ': ADJ,
    'ADV': ADV,
    'ADP': ADP,
    'CONJ': CONJ,
    'DET': DET,
    'NOUN': NOUN,
    'NUM': NUM,
    'PRON': PRON,
    'PRT': PRT,
    'VERB': VERB,
    'X': X,
    '.': PUNCT,
    'EOL': EOL
}


def _make_tag_dict(counts):
    freq_thresh = 50
    ambiguity_thresh = 0.98
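_make_tag_dict is only partially visible here, but the two thresholds suggest the usual trick of pre-tagging unambiguous words: a word gets a fixed tag only if it was seen at least 50 times and one tag accounts for at least 98% of its occurrences. A hedged sketch of that reading (the body is not shown in this hunk, and the counts format, word -> {tag: count}, is an assumption):

    def make_tag_dict_sketch(counts, freq_thresh=50, ambiguity_thresh=0.98):
        tagdict = {}
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda kv: kv[1])
            n = sum(tag_freqs.values())
            # Only trust words that are frequent and (almost) unambiguous.
            if n >= freq_thresh and float(mode) / n >= ambiguity_thresh:
                tagdict[word] = tag
        return tagdict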
@@ -5,14 +5,29 @@ from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t

from .lexeme cimport Lexeme

from .typedefs cimport flags_t
from .utf8string cimport StringStore
from libc.stdint cimport uint8_t, uint16_t


cdef struct Morphology:
    uint8_t number
    uint8_t tenspect    # Tense/aspect/voice
    uint8_t mood
    uint8_t gender
    uint8_t person
    uint8_t case
    uint8_t misc


cdef struct TokenC:
    const Lexeme* lex
    Morphology morph
    int idx
    int pos
    int lemma
    int sense


@@ -37,7 +52,7 @@ cdef class Token:
    cdef public int i
    cdef public int idx
    cdef public int pos
-   cdef public int ner
+   cdef int lemma

    cdef public atom_t id
    cdef public atom_t cluster
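Each token now carries a fixed-size Morphology record of single-byte fields, filled from the tag map when the tagger assigns a part of speech. A Python sketch of the same record and how a 'VBZ' tag would populate it (the numeric codes are assumed to follow the declaration order of the en.pxd enums, where 0 always means "no value"):

    from collections import namedtuple

    # Mirror of the C struct: seven uint8 slots, one per morphological dimension.
    Morphology = namedtuple('Morphology',
                            'number tenspect mood gender person case misc')

    PRESENT, THIRD = 2, 3   # assumed codes from en_tenspect_t / en_person_t
    props = {'tenspect': PRESENT, 'person': THIRD}   # from POS_TAGS['VBZ'] in en.pyx
    morph = Morphology(*[props.get(f, 0) for f in Morphology._fields])
    print(morph.person)   # 3; unset fields stay 0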
@@ -51,7 +51,7 @@ cdef class Tokens:
    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
        return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
-                    self.data[i].sense, self.data[i].lex[0])
+                    self.data[i].lemma, self.data[i].lex[0])

    def __iter__(self):
        for i in range(self.length):

@@ -128,14 +128,15 @@ cdef class Tokens:

@cython.freelist(64)
cdef class Token:
-   def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
+   def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
                 dict lex):
        self._string_store = string_store
        self.idx = idx
        self.pos = pos
-       self.ner = ner
        self.i = i
        self.id = lex['id']
+       self.lemma = lemma

        self.cluster = lex['cluster']
        self.length = lex['length']

@@ -156,3 +157,10 @@ cdef class Token:
            return ''
        cdef bytes utf8string = self._string_store[self.sic]
        return utf8string.decode('utf8')

    property lemma:
        def __get__(self):
            if self.lemma == 0:
                return self.string
            cdef bytes utf8string = self._string_store[self.lemma]
            return utf8string.decode('utf8')
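Token now exposes the interned lemma as a unicode property, falling back to the surface string when no lemma id was assigned (id 0). A hedged usage sketch (assumes the English pipeline has been loaded as in en.pyx):

    # Assumes EN.load() has populated the lexicon, tagger and lemmatizer.
    tokens = EN.tokenize(u'An example sentence.')
    EN.set_pos(tokens)
    for token in tokens:
        # In this commit set_pos does not yet write lemmas (the analyse_morph
        # call is commented out), so token.lemma falls back to token.string.
        print(u'%s -> %s' % (token.string, token.lemma))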