From 31babe3c3f7b02a76b817d00fa78e2583a229659 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 6 Nov 2017 12:36:05 +0100 Subject: [PATCH] Fix non-clobbering lemmatization --- spacy/morphology.pyx | 3 +-- spacy/pipeline.pyx | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 3414eec5f..a5c5c0fbe 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -109,8 +109,7 @@ cdef class Morphology: analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, self.tag_map.get(tag_str, {})) self._cache.set(tag_id, token.lex.orth, analysis) - if token.lemma == 0: - token.lemma = analysis.lemma + token.lemma = analysis.lemma token.pos = analysis.tag.pos token.tag = analysis.tag.name token.morph = analysis.tag.morph diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index f3defeeb9..5291b6b5e 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -412,7 +412,11 @@ class Tagger(Pipe): for j, tag_id in enumerate(doc_tag_ids): # Don't clobber preset POS tags if doc.c[j].tag == 0 and doc.c[j].pos == 0: + # Don't clobber preset lemmas + lemma = doc.c[j].lemma vocab.morphology.assign_tag_id(&doc.c[j], tag_id) + if lemma != 0: + doc.c[j].lemma = lemma idx += 1 if tensors is not None: if isinstance(doc.tensor, numpy.ndarray) \