* Hack out morphology stuff from the tokenizer while morphology is being reimplemented.

Matthew Honnibal 2015-08-26 19:20:11 +02:00
parent b4faf551f5
commit 119c0f8c3f
2 changed files with 28 additions and 19 deletions
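
For context on what is being hacked out: the special-case rules loaded below map a raw chunk of text to a list of per-token property dicts, from which the tokenizer previously read the form ('F'), the lemma ('L'), and the tag ('pos'), and then filled in morphology. A rough, hypothetical illustration of that input shape (the entry itself is made up; only the keys come from the diff):

    # Hypothetical special-case entry; the keys match those read in
    # _load_special_tokenization: 'F' = surface form, 'L' = lemma, 'pos' = tag.
    special_cases = {
        "wont": [
            {"F": "wo"},
            {"F": "nt", "L": "not", "pos": "RB"},
        ],
    }

After this commit the tokenizer only consumes 'F' and 'L'; the 'pos'-driven tag and morphology assignment is deferred until morphology is reimplemented.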

View File

@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
-from .structs cimport LexemeC, TokenC, Morphology
+from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, _Cached

View File

@@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
-from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
 cimport cython
@@ -29,7 +28,7 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, self.vocab.pos_tags)
+        self._load_special_tokenization(rules)

     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
@@ -242,7 +241,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    def _load_special_tokenization(self, object rules, object tag_map):
+    def _load_special_tokenization(self, special_cases):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
@@ -253,25 +252,15 @@ cdef class Tokenizer:
         cdef dict props
         cdef LexemeC** lexemes
         cdef hash_t hashed
-        for chunk, substrings in sorted(rules.items()):
+        for chunk, substrings in sorted(special_cases.items()):
             tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
             for i, props in enumerate(substrings):
                 form = props['F']
-                lemma = props.get("L", None)
                 tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                if lemma is not None:
+                lemma = props.get('L', form)
                 tokens[i].lemma = self.vocab.strings[lemma]
-                else:
-                    tokens[i].lemma = 0
-                if 'pos' in props:
-                    tokens[i].tag = self.vocab.strings[props['pos']]
-                    tokens[i].pos = tag_map[props['pos']][0]
-                    # These are defaults, which can be over-ridden by the
-                    # token-specific props.
-                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
-                if tokens[i].lemma == 0:
-                    tokens[i].lemma = tokens[i].lex.orth
-                set_morph_from_dict(&tokens[i].morph, props)
+                #TODO
+                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
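
In other words, the per-substring work now reduces to a lexeme lookup plus a lemma that defaults to the surface form. A pure-Python sketch of the trimmed loop body above (the dict-based token and the simplified vocab.get signature stand in for the TokenC/LexemeC structs and the cdef Vocab.get(mem, form) call):

    # Sketch only: mirrors the new loop body, not the actual Cython types.
    def load_substrings(vocab, substrings):
        tokens = []
        for props in substrings:
            form = props['F']
            lemma = props.get('L', form)         # lemma falls back to the form
            tokens.append({
                'lex': vocab.get(form),          # lexeme lookup is unchanged
                'lemma': vocab.strings[lemma],   # interned lemma id
                # tag / pos / morph are deliberately left unset here;
                # morphology assignment moves out of the tokenizer.
            })
        return tokens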
@@ -279,3 +268,23 @@ cdef class Tokenizer:
             hashed = hash_string(chunk)
             self._specials.set(hashed, cached)
             self._cache.set(hashed, cached)
+#if lemma is not None:
+# tokens[i].lemma = self.vocab.strings[lemma]
+#else:
+# tokens[i].lemma = 0
+#if 'pos' in props:
+# inflection = self.vocab.morphology.get(props['pos'])
+# inflection.assign(&tokens[i])
+# # These are defaults, which can be over-ridden by the
+# # token-specific props.
+# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
+# #tokens[i].pos = pos
+# ## These are defaults, which can be over-ridden by the
+# ## token-specific props.
+# #set_morph_from_dict(&tokens[i].morph, morph_features)
+# #if tokens[i].lemma == 0:
+# # tokens[i].lemma = tokens[i].lex.orth
+##set_morph_from_dict(&tokens[i].morph, props)
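
The trailing commented-out block above hints at where the removed logic is headed: a Morphology object owned by the vocab that applies tag-map defaults and then lets token-specific props override them. A speculative outline of that default-then-override pattern (the class, the method body, and the dict-based token are assumptions; only the name assign_from_dict appears in the TODO above):

    # Speculative sketch, not spaCy's actual API: mirrors the pattern in the
    # commented-out code (tag-map defaults first, token props override them).
    class Morphology:
        def __init__(self, tag_map):
            # tag name -> (pos id, default morphological features)
            self.tag_map = tag_map

        def assign_from_dict(self, token, props):
            if 'pos' in props:
                pos, morph_features = self.tag_map[props['pos']]
                token['pos'] = pos
                token['morph'] = dict(morph_features)  # defaults from the tag map
            # Token-specific props then override the defaults, as the old
            # set_morph_from_dict(&tokens[i].morph, props) call did.
            morph = token.setdefault('morph', {})
            for key, value in props.items():
                if key not in ('F', 'L', 'pos'):
                    morph[key] = value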