mirror of https://github.com/explosion/spaCy.git
* Fix Issue #24: Lemmas are empty when the L field is missing for special-cased tokens
This commit is contained in:
parent
3e8c87af1a
commit
0492cee8b4
|
@ -245,6 +245,8 @@ cdef class Tokenizer:
|
||||||
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
|
tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, &string)
|
||||||
if lemma:
|
if lemma:
|
||||||
tokens[i].lemma = self.vocab.strings[lemma]
|
tokens[i].lemma = self.vocab.strings[lemma]
|
||||||
|
else:
|
||||||
|
tokens[i].lemma = 0
|
||||||
if 'pos' in props:
|
if 'pos' in props:
|
||||||
# TODO: Clean up this mess...
|
# TODO: Clean up this mess...
|
||||||
tokens[i].tag = tag_names.index(props['pos'])
|
tokens[i].tag = tag_names.index(props['pos'])
|
||||||
|
@ -252,6 +254,8 @@ cdef class Tokenizer:
|
||||||
# These are defaults, which can be over-ridden by the
|
# These are defaults, which can be over-ridden by the
|
||||||
# token-specific props.
|
# token-specific props.
|
||||||
set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
|
set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1])
|
||||||
|
if tokens[i].lemma == 0:
|
||||||
|
tokens[i].lemma = tokens[i].lex.orth
|
||||||
set_morph_from_dict(&tokens[i].morph, props)
|
set_morph_from_dict(&tokens[i].morph, props)
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||||
cached.length = len(substrings)
|
cached.length = len(substrings)
|
||||||
|
|
Loading…
Reference in New Issue