From 99bbbb6febf689250df0143394a82eb6177a5be2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 8 Dec 2014 21:12:15 +1100
Subject: [PATCH] * Work on morphological processing

---
 spacy/en.pxd        | 51 +++++++++++++++++++++++++++++++
 spacy/en.pyx        | 73 +++++++++++++++++++++++++++++++++++++++++----
 spacy/lang.pxd      |  8 +++--
 spacy/lang.pyx      | 39 +++++++++++++++++++++---
 spacy/lemmatizer.py |  3 ++
 spacy/pos_util.py   |  3 +-
 spacy/tagger.pxd    | 32 +++++++++++++++++++-
 spacy/tagger.pyx    | 42 +++++++++++++++++++++++---
 spacy/tokens.pxd    | 17 ++++++++++-
 spacy/tokens.pyx    | 14 +++++++--
 10 files changed, 261 insertions(+), 21 deletions(-)

diff --git a/spacy/en.pxd b/spacy/en.pxd
index 8ce023106..6887dbc08 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -5,6 +5,57 @@ from .tokens cimport Tokens
 from .tokens cimport TokenC
 
 
+cpdef enum en_person_t:
+    NO_PERSON
+    FIRST
+    SECOND
+    THIRD
+
+
+cpdef enum en_number_t:
+    NO_NUMBER
+    SINGULAR
+    PLURAL
+    MASS
+    CARDINAL
+    ORDINAL
+
+
+cpdef enum en_gender_t:
+    NO_GENDER
+    MASCULINE
+    FEMININE
+
+
+cpdef enum en_tenspect_t:
+    NO_TENSE
+    BASE_VERB
+    PRESENT
+    PAST
+    PASSIVE
+    ING
+    MODAL
+
+
+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    ACCUSATIVE
+    GENITIVE
+    DEMONYM
+
+
+cpdef enum misc_t:
+    NO_MISC
+    COMPARATIVE
+    SUPERLATIVE
+    RELATIVE
+    NAME
+    URL
+    EMAIL
+    EMOTICON
+
+
 # Flags
 cpdef enum FlagID:
     IS_ALPHA
diff --git a/spacy/en.pyx b/spacy/en.pyx
index c0eb0368b..fa59ef933 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -35,6 +35,63 @@ from __future__ import unicode_literals
 cimport lang
 from .typedefs cimport flags_t
 import orth
+from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .tagger cimport X, PUNCT, EOL
+
+
+POS_TAGS = {
+    'NULL': (NO_TAG, {}),
+    'EOL': (EOL, {}),
+    'CC': (CONJ, {}),
+    'CD': (NUM, {}),
+    'DT': (DET, {}),
+    'EX': (DET, {}),
+    'FW': (X, {}),
+    'IN': (ADP, {}),
+    'JJ': (ADJ, {}),
+    'JJR': (ADJ, {'misc': COMPARATIVE}),
+    'JJS': (ADJ, {'misc': SUPERLATIVE}),
+    'LS': (X, {}),
+    'MD': (VERB, {'tenspect': MODAL}),
+    'NN': (NOUN, {}),
+    'NNS': (NOUN, {'number': PLURAL}),
+    'NNP': (NOUN, {'misc': NAME}),
+    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
+    'PDT': (DET, {}),
+    'POS': (PRT, {'case': GENITIVE}),
+    'PRP': (NOUN, {}),
+    'PRP$': (NOUN, {'case': GENITIVE}),
+    'RB': (ADV, {}),
+    'RBR': (ADV, {'misc': COMPARATIVE}),
+    'RBS': (ADV, {'misc': SUPERLATIVE}),
+    'RP': (PRT, {}),
+    'SYM': (X, {}),
+    'TO': (PRT, {}),
+    'UH': (X, {}),
+    'VB': (VERB, {}),
+    'VBD': (VERB, {'tenspect': PAST}),
+    'VBG': (VERB, {'tenspect': ING}),
+    'VBN': (VERB, {'tenspect': PASSIVE}),
+    'VBP': (VERB, {'tenspect': PRESENT}),
+    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
+    'WDT': (DET, {'misc': RELATIVE}),
+    'WP': (PRON, {'misc': RELATIVE}),
+    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
+    'WRB': (ADV, {'misc': RELATIVE}),
+    '!': (PUNCT, {}),
+    '#': (PUNCT, {}),
+    '$': (PUNCT, {}),
+    "''": (PUNCT, {}),
+    "(": (PUNCT, {}),
+    ")": (PUNCT, {}),
+    "-LRB-": (PUNCT, {}),
+    "-RRB-": (PUNCT, {}),
+    ".": (PUNCT, {}),
+    ",": (PUNCT, {}),
+    "``": (PUNCT, {}),
+    ":": (PUNCT, {}),
+    "?": (PUNCT, {}),
+}
 
 
 POS_TEMPLATES = (
@@ -91,19 +148,25 @@ cdef class English(Language):
     def set_pos(self, Tokens tokens):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
+        cdef TokenC* t = tokens.data
         for i in range(tokens.length):
-            fill_pos_context(context, i, tokens.data)
-            tokens.data[i].pos = self.pos_tagger.predict(context)
+            fill_pos_context(context, i, t)
+            t[i].pos = self.pos_tagger.predict(context)
+            #self.morphalyser.set_token(&t[i])
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
         c = 0
+        cdef TokenC* t = tokens.data
         for i in range(tokens.length):
-            fill_pos_context(context, i, tokens.data)
-            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            c += tokens.data[i].pos == golds[i]
+            fill_pos_context(context, i, t)
+            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
+            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
+            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            c += t[i].pos == golds[i]
         return c
 
+
 EN = English('en')
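
Note on POS_TAGS: each Penn Treebank tag is factored into a coarse
Google-universal tag plus a dict of morphological features; any feature
a tag does not set defaults to 0, the NO_* member of the matching enum
in en.pxd. A standalone sketch of that factoring, with strings standing
in for the cpdef enum values (the names below are illustrative, not
part of the patch):

    POS_TAGS = {
        'VBZ': ('VERB', {'tenspect': 'PRESENT', 'person': 'THIRD'}),
        'NNS': ('NOUN', {'number': 'PLURAL'}),
    }

    def decompose(treebank_tag):
        # Unmapped tags fall back to X, the universal catch-all.
        return POS_TAGS.get(treebank_tag, ('X', {}))

    assert decompose('VBZ') == ('VERB', {'tenspect': 'PRESENT', 'person': 'THIRD'})
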
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 20374f40d..124281a6b 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -2,20 +2,20 @@ from libcpp.vector cimport vector
 
 from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
 
-from preshed.maps cimport PreshMap
+from preshed.maps cimport PreshMap, PreshMapArray
 from cymem.cymem cimport Pool
 
 from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
+from .tagger cimport PosTag
 from .utf8string cimport StringStore, UniStr
 
 
 cdef class Lexicon:
     cpdef public get_lex_props
     cdef Pool mem
-    cpdef readonly size_t size
     cpdef readonly StringStore strings
 
     cdef vector[Lexeme*] lexemes
@@ -29,13 +29,17 @@ cdef class Language:
     cdef readonly unicode name
     cdef PreshMap _cache
     cdef PreshMap _specials
+    cdef PreshMapArray _lemmas
     cpdef readonly Lexicon lexicon
     cpdef readonly Tagger pos_tagger
+    cpdef readonly object lemmatizer
 
     cdef object _prefix_re
    cdef object _suffix_re
     cdef object _infix_re
 
+    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 496c6742c..fdeb7df66 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
+from .lemmatizer import Lemmatizer
 
 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
@@ -26,6 +27,8 @@ from . import util
 from .util import read_lang_data
 from .tokens import Tokens
+from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+
 
 cdef class Language:
     def __init__(self, name):
@@ -39,14 +42,40 @@ cdef class Language:
         self._infix_re = re.compile(infix)
         self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
+        self._lemmas = PreshMapArray(N_UNIV_TAGS)
         self.pos_tagger = None
+        self.lemmatizer = None
 
     def load(self):
+        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
         self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
 
+    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+            return lex.sic
+        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
+        if lemma != 0:
+            return lemma
+        cdef bytes py_string = self.lexicon.strings[lex.sic]
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        if pos.pos == NOUN:
+            lemma_strings = self.lemmatizer.noun(py_string)
+        elif pos.pos == VERB:
+            lemma_strings = self.lemmatizer.verb(py_string)
+        else:
+            assert pos.pos == ADJ
+            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
+        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
+        return lemma
+
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
         cdef Tokens tokens = Tokens(self.lexicon.strings, length)
@@ -254,9 +283,11 @@ cdef class Lexicon:
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
-        self.size = 2
         self.get_lex_props = get_props
 
+    def __len__(self):
+        return self.lexemes.size()
+
     cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
         '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool.  If the pool
@@ -269,14 +300,13 @@ cdef class Lexicon:
             mem = self.mem
         cdef unicode py_string = string.chars[:string.n]
         lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
+        lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
                              self.get_lex_props(py_string))
         if mem is self.mem:
             self._map.set(string.key, lex)
             while self.lexemes.size() < (lex.id + 1):
                 self.lexemes.push_back(&EMPTY_LEXEME)
             self.lexemes[lex.id] = lex
-            self.size += 1
         else:
             lex[0].id = 1
         return lex
@@ -302,6 +332,8 @@ cdef class Lexicon:
         a dict if the operator is called from Python.
         '''
        if type(id_or_string) == int:
+            if id_or_string >= self.lexemes.size():
+                raise IndexError
             return self.lexemes.at(id_or_string)[0]
         cdef UniStr string
         slice_unicode(&string, id_or_string, 0, len(id_or_string))
@@ -359,5 +391,4 @@ cdef class Lexicon:
                 self.lexemes.push_back(&EMPTY_LEXEME)
             self.lexemes[lexeme.id] = lexeme
             i += 1
-            self.size += 1
         fclose(fp)
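
Note on Language.lemmatize: lemmas are cached per (universal POS,
string ID) pair in a PreshMapArray, so a form like "saw" can cache
different lemmas as NOUN and as VERB, and sorted()[0] makes the choice
among WordNet candidates deterministic. A pure-Python sketch of the
same caching strategy, assuming only what the hunk above shows (the
class and attribute names here are illustrative):

    class LemmaCache(object):
        def __init__(self, lemmatizer):
            self.lemmatizer = lemmatizer
            self._lemmas = {}   # (pos, string) -> lemma, like PreshMapArray

        def lemmatize(self, pos, string):
            if pos not in ('noun', 'verb', 'adj'):
                return string   # only the open classes are lemmatized
            try:
                return self._lemmas[pos, string]
            except KeyError:
                pass
            # Dispatch to Lemmatizer.noun/.verb/.adj, as the Cython code does
            candidates = getattr(self.lemmatizer, pos)(string)
            lemma = sorted(candidates)[0]   # deterministic pick
            self._lemmas[pos, string] = lemma
            return lemma
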
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index a42a5daee..ce9bbefdc 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -53,6 +53,7 @@ class Lemmatizer(object):
 
 
 def lemmatize(string, index, exceptions, rules):
+    string = string.lower()
     forms = []
     if string in index:
         forms.append(string)
@@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules):
             form = string[:len(string) - len(old)] + new
             if form in index:
                 forms.append(form)
+    if not forms:
+        forms.append(string)
     return set(forms)
diff --git a/spacy/pos_util.py b/spacy/pos_util.py
index e5716665e..489f03dde 100644
--- a/spacy/pos_util.py
+++ b/spacy/pos_util.py
@@ -147,6 +147,7 @@
 Y PRT
 Z NOUN
 ^ NOUN
 ~ X
-`` .""".strip().split('\n'))
+`` .
+EOL EOL""".strip().split('\n'))
     return mapping[tag]
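
Note on the lemmatizer changes: lower-casing on entry plus the new
fallback mean lemmatize() always returns a non-empty set, so an
out-of-vocabulary token now lemmatizes to itself. With a toy index and
rule set (hypothetical data, standing in for the WordNet files):

    index = set(['duck'])
    rules = [('s', '')]
    assert lemmatize('ducks', index, {}, rules) == set(['duck'])
    # No match anywhere: falls back to the lower-cased input itself.
    assert lemmatize('Quacksalvers', set(), {}, []) == set(['quacksalvers'])
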
diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd
index f91bbeb0a..11880bf13 100644
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@@ -1,11 +1,40 @@
+from libc.stdint cimport uint8_t
+
 from cymem.cymem cimport Pool
+
 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 
+from preshed.maps cimport PreshMapArray
+
 from .typedefs cimport hash_t
-from .tokens cimport Tokens
+from .tokens cimport Tokens, Morphology
+
+
+# Google universal tag set
+cdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
+cdef struct PosTag:
+    Morphology morph
+    int id
+    univ_tag_t pos
 
 
 cdef class Tagger:
@@ -16,4 +45,5 @@
     cpdef readonly LinearModel model
 
     cpdef readonly list tag_names
+    cdef PosTag* tags
     cdef dict tagdict
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 22ec3896a..db7974d91 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -12,13 +12,14 @@ import cython
 
 from thinc.features cimport Feature, count_feats
 
-def setup_model_dir(tag_names, tag_counts, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map,
         'tag_counts': tag_counts,
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
@@ -33,16 +34,31 @@ cdef class Tagger:
         self.mem = Pool()
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
+        tag_map = cfg['tag_map']
+        univ_counts = {}
+        cdef unicode tag
+        cdef unicode univ_tag
         self.tag_names = cfg['tag_names']
+        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
+        for i, tag in enumerate(self.tag_names):
+            pos, props = tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            self.tags[i].morph.number = props.get('number', 0)
+            self.tags[i].morph.tenspect = props.get('tenspect', 0)
+            self.tags[i].morph.mood = props.get('mood', 0)
+            self.tags[i].morph.gender = props.get('gender', 0)
+            self.tags[i].morph.person = props.get('person', 0)
+            self.tags[i].morph.case = props.get('case', 0)
+            self.tags[i].morph.misc = props.get('misc', 0)
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
         if path.exists(path.join(model_dir, 'model')):
             self.model.load(path.join(model_dir, 'model'))
 
-    cdef class_t predict(self, const atom_t* context, object golds=None) except *:
-        """Predict the tag of tokens[i]. The tagger remembers the features and
-        prediction, in case you later call tell_answer.
+    cdef class_t predict(self, atom_t* context, object golds=None) except *:
+        """Predict the tag of tokens[i].
 
         >>> tokens = EN.tokenize(u'An example sentence.')
         >>> tag = EN.pos_tagger.predict(0, tokens)
@@ -69,6 +85,24 @@
         return tag_id
 
 
+UNIV_TAGS = {
+    'NULL': NO_TAG,
+    'ADJ': ADJ,
+    'ADV': ADV,
+    'ADP': ADP,
+    'CONJ': CONJ,
+    'DET': DET,
+    'NOUN': NOUN,
+    'NUM': NUM,
+    'PRON': PRON,
+    'PRT': PRT,
+    'VERB': VERB,
+    'X': X,
+    '.': PUNCT,
+    'EOL': EOL
+}
+
+
 def _make_tag_dict(counts):
     freq_thresh = 50
     ambiguity_thresh = 0.98
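
Note on Tagger.__init__: the tag_map entries written by setup_model_dir
round-trip through config.json and are unpacked into the fixed-size
PosTag array, with absent features defaulting to 0 (the NO_* enum
members). A plain-Python stand-in for that unpacking, with dicts in
place of the C structs (the values 10, 2 and 3 assume the declaration
order of the enums in tagger.pxd and en.pxd):

    tag_map = {'VBZ': (10, {'tenspect': 2, 'person': 3})}  # VERB, PRESENT, THIRD

    def make_pos_tag(i, name):
        pos, props = tag_map[name]
        features = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')
        return {'id': i, 'pos': pos,
                'morph': dict((f, props.get(f, 0)) for f in features)}
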
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index e6bc0a46a..6f4691716 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -5,14 +5,29 @@ from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
 
 from .lexeme cimport Lexeme
+
 from .typedefs cimport flags_t
 from .utf8string cimport StringStore
 
+from libc.stdint cimport uint8_t, uint16_t
+
+
+cdef struct Morphology:
+    uint8_t number
+    uint8_t tenspect   # Tense/aspect/voice
+    uint8_t mood
+    uint8_t gender
+    uint8_t person
+    uint8_t case
+    uint8_t misc
+
 
 cdef struct TokenC:
     const Lexeme* lex
+    Morphology morph
     int idx
     int pos
+    int lemma
     int sense
@@ -37,7 +52,7 @@ cdef class Token:
     cdef public int i
     cdef public int idx
     cdef public int pos
-    cdef public int ner
+    cdef int lemma
 
     cdef public atom_t id
     cdef public atom_t cluster
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 33f265eef..004d0578c 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -51,7 +51,7 @@ cdef class Tokens:
     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
         return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
-                     self.data[i].sense, self.data[i].lex[0])
+                     self.data[i].lemma, self.data[i].lex[0])
 
     def __iter__(self):
         for i in range(self.length):
@@ -128,14 +128,15 @@ cdef class Tokens:
 
 @cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
+    def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
                  dict lex):
         self._string_store = string_store
         self.idx = idx
         self.pos = pos
-        self.ner = ner
         self.i = i
         self.id = lex['id']
+
+        self.lemma = lemma
         self.cluster = lex['cluster']
         self.length = lex['length']
@@ -156,3 +157,10 @@ cdef class Token:
             return ''
         cdef bytes utf8string = self._string_store[self.sic]
         return utf8string.decode('utf8')
+
+    property lemma:
+        def __get__(self):
+            if self.lemma == 0:
+                return self.string
+            cdef bytes utf8string = self._string_store[self.lemma]
+            return utf8string.decode('utf8')
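
Taken together, the patch threads lemma and morph through TokenC and
exposes them on Token, with lemma == 0 meaning "not set", in which case
the property falls back to the surface form. A hypothetical end-to-end
use, assuming the data files are installed (set_pos fills t[i].pos
today; the morphalyser call that would fill lemma is still commented
out above):

    from __future__ import print_function
    from spacy.en import EN

    EN.load()
    tokens = EN.tokenize(u'The ducks quacked')
    EN.set_pos(tokens)
    for token in tokens:
        # token.lemma returns token.string until a lemma ID is written
        print(token.string, token.pos, token.lemma)
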