From 119c0f8c3fae12dc33d3e52e282072c54d306738 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 26 Aug 2015 19:20:11 +0200
Subject: [PATCH] * Hack out morphology stuff from tokenizer, while morphology
 being reimplemented.

---
 spacy/tokenizer.pxd |  2 +-
 spacy/tokenizer.pyx | 45 +++++++++++++++++++++++++++------------------
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd
index a7f69c5aa..19b8aa026 100644
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 
 from .typedefs cimport hash_t
-from .structs cimport LexemeC, TokenC, Morphology
+from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, _Cached
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 1e857aefc..38daf1c5a 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 
-from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
 
 cimport cython
@@ -29,7 +28,7 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, self.vocab.pos_tags)
+        self._load_special_tokenization(rules)
 
     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
@@ -242,7 +241,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules, object tag_map):
+    def _load_special_tokenization(self, special_cases):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
@@ -253,25 +252,15 @@ cdef class Tokenizer:
         cdef dict props
         cdef LexemeC** lexemes
         cdef hash_t hashed
-        for chunk, substrings in sorted(rules.items()):
+        for chunk, substrings in sorted(special_cases.items()):
             tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
             for i, props in enumerate(substrings):
                 form = props['F']
-                lemma = props.get("L", None)
                 tokens[i].lex = self.vocab.get(self.vocab.mem, form)
-                if lemma is not None:
-                    tokens[i].lemma = self.vocab.strings[lemma]
-                else:
-                    tokens[i].lemma = 0
-                if 'pos' in props:
-                    tokens[i].tag = self.vocab.strings[props['pos']]
-                    tokens[i].pos = tag_map[props['pos']][0]
-                    # These are defaults, which can be over-ridden by the
-                    # token-specific props.
-                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
-                    if tokens[i].lemma == 0:
-                        tokens[i].lemma = tokens[i].lex.orth
-                set_morph_from_dict(&tokens[i].morph, props)
+                lemma = props.get('L', form)
+                tokens[i].lemma = self.vocab.strings[lemma]
+                #TODO
+                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
@@ -279,3 +268,23 @@ cdef class Tokenizer:
             hashed = hash_string(chunk)
             self._specials.set(hashed, cached)
             self._cache.set(hashed, cached)
+
+
+#if lemma is not None:
+#    tokens[i].lemma = self.vocab.strings[lemma]
+#else:
+#    tokens[i].lemma = 0
+#if 'pos' in props:
+#    inflection = self.vocab.morphology.get(props['pos'])
+#    inflection.assign(&tokens[i])
+#    # These are defaults, which can be over-ridden by the
+#    # token-specific props.
+#    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
+#    #tokens[i].pos = pos
+#    ## These are defaults, which can be over-ridden by the
+#    ## token-specific props.
+#    #set_morph_from_dict(&tokens[i].morph, morph_features)
+#    #if tokens[i].lemma == 0:
+#    #    tokens[i].lemma = tokens[i].lex.orth
+##set_morph_from_dict(&tokens[i].morph, props)
+
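
Editor's note: for reference, a minimal pure-Python sketch of the behaviour
change in _load_special_tokenization. The special_cases dict below is an
illustrative stand-in (not data from the patch); 'F' is the token form and
'L' an optional lemma, as in the rules consumed above. After this patch the
lemma falls back to the surface form rather than 0, and POS/morphology
assignment is deferred until the new morphology API lands.

    # Sketch only: hypothetical example rules, mirroring the new loop body.
    special_cases = {
        "don't": [{'F': 'do'}, {'F': "n't", 'L': 'not'}],
    }

    for chunk, substrings in sorted(special_cases.items()):
        for props in substrings:
            form = props['F']
            # Lemma defaults to the surface form when no 'L' key is given.
            lemma = props.get('L', form)
            print(chunk, form, lemma)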