* Hack out morphology stuff from the tokenizer while morphology is being reimplemented.

Matthew Honnibal 2015-08-26 19:20:11 +02:00
parent b4faf551f5
commit 119c0f8c3f
2 changed files with 28 additions and 19 deletions

View File

@@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
-from .structs cimport LexemeC, TokenC, Morphology
+from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab, _Cached

View File

@@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
-from .morphology cimport set_morph_from_dict
 from .strings cimport hash_string
 cimport cython
@@ -29,7 +28,7 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, self.vocab.pos_tags)
+        self._load_special_tokenization(rules)

     @classmethod
     def from_dir(cls, Vocab vocab, data_dir):
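
After this change the constructor hands `_load_special_tokenization` only the rules dict; the tag map is no longer consulted. The rules map a surface chunk to a list of per-token property dicts, of which only the 'F' (form) and optional 'L' (lemma) keys are still used. A minimal sketch of that shape, with a hypothetical contraction entry (illustrative, not taken from the shipped rule files):

    special_cases = {
        "don't": [
            {'F': "do"},                # form only: the lemma falls back to the form
            {'F': "n't", 'L': "not"},   # form plus an explicit lemma override
        ],
    }
    tokenizer._load_special_tokenization(special_cases)
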
@@ -242,7 +241,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0

-    def _load_special_tokenization(self, object rules, object tag_map):
+    def _load_special_tokenization(self, special_cases):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
@@ -253,25 +252,15 @@ cdef class Tokenizer:
         cdef dict props
         cdef LexemeC** lexemes
         cdef hash_t hashed
-        for chunk, substrings in sorted(rules.items()):
+        for chunk, substrings in sorted(special_cases.items()):
             tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
             for i, props in enumerate(substrings):
                 form = props['F']
-                lemma = props.get("L", None)
                 tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                if lemma is not None:
-                    tokens[i].lemma = self.vocab.strings[lemma]
-                else:
-                    tokens[i].lemma = 0
-                if 'pos' in props:
-                    tokens[i].tag = self.vocab.strings[props['pos']]
-                    tokens[i].pos = tag_map[props['pos']][0]
-                    # These are defaults, which can be over-ridden by the
-                    # token-specific props.
-                    set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
-                    if tokens[i].lemma == 0:
-                        tokens[i].lemma = tokens[i].lex.orth
-                set_morph_from_dict(&tokens[i].morph, props)
+                lemma = props.get('L', form)
+                tokens[i].lemma = self.vocab.strings[lemma]
+                #TODO
+                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
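
Read without the Cython plumbing, the simplified loop now only resolves a lexeme and interns a lemma for each substring, with morphology assignment stubbed out until the morphology component is reimplemented. A pure-Python approximation, assuming a dict-like vocab and string store (a sketch of the control flow, not the actual struct-allocating code):

    def load_special_cases(vocab, special_cases):
        # Illustrative stand-in for Tokenizer._load_special_tokenization.
        cache = {}
        for chunk, substrings in sorted(special_cases.items()):
            tokens = []
            for props in substrings:
                form = props['F']
                lemma = props.get('L', form)   # lemma defaults to the surface form
                tokens.append({
                    'lex': vocab[form],              # stands in for vocab.get()
                    'lemma': vocab.strings[lemma],   # intern the lemma string
                    # TODO: morphology features are deferred until the
                    # morphology component returns.
                })
            cache[chunk] = tokens   # stands in for the hashed _specials/_cache entries
        return cache
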
@@ -279,3 +268,23 @@ cdef class Tokenizer:
             hashed = hash_string(chunk)
             self._specials.set(hashed, cached)
             self._cache.set(hashed, cached)
+                #if lemma is not None:
+                #    tokens[i].lemma = self.vocab.strings[lemma]
+                #else:
+                #    tokens[i].lemma = 0
+                #if 'pos' in props:
+                #    inflection = self.vocab.morphology.get(props['pos'])
+                #    inflection.assign(&tokens[i])
+                #    # These are defaults, which can be over-ridden by the
+                #    # token-specific props.
+                #    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
+                #    #tokens[i].pos = pos
+                #    ## These are defaults, which can be over-ridden by the
+                #    ## token-specific props.
+                #    #set_morph_from_dict(&tokens[i].morph, morph_features)
+                #    #if tokens[i].lemma == 0:
+                #    #    tokens[i].lemma = tokens[i].lex.orth
+                ##set_morph_from_dict(&tokens[i].morph, props)
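
The commented-out block appended above parks the old behaviour until the new morphology component exists: POS-keyed defaults would be looked up and assigned to the token, with token-specific props allowed to override them. Expressed as plain Python against the hypothetical API named in those comments (vocab.morphology.get()/.assign() do not exist at this commit):

    if 'pos' in props:
        inflection = vocab.morphology.get(props['pos'])   # default tag/morph features for the POS
        inflection.assign(token)                          # apply defaults; token-specific props
                                                          # may still override them
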