* Fix Issue #24: Lemmas are empty when the L field is missing for special-cased tokens

This commit is contained in:
Matthew Honnibal 2015-02-08 18:30:30 -05:00
parent 3e8c87af1a
commit 0492cee8b4
1 changed file with 4 additions and 0 deletions

View File

@ -245,6 +245,8 @@ cdef class Tokenizer:
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
if lemma:
    tokens[i].lemma = self.vocab.strings[lemma]
else:
    tokens[i].lemma = 0
if 'pos' in props:
    # TODO: Clean up this mess...
    tokens[i].tag = tag_names.index(props['pos'])
@ -252,6 +254,8 @@ cdef class Tokenizer:
# These are defaults, which can be over-ridden by the
# token-specific props.
set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
if tokens[i].lemma == 0:
    tokens[i].lemma = tokens[i].lex.orth
set_morph_from_dict(&tokens[i].morph, props)
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
cached.length = len(substrings)