Fix retokenizer

This commit is contained in:
Ines Montani 2020-12-09 11:29:55 +11:00
parent 04b3068747
commit 271923eaea
1 changed file with 4 additions and 3 deletions

View File

@@ -11,7 +11,7 @@ from .span cimport Span
from .token cimport Token
from ..lexeme cimport Lexeme, EMPTY_LEXEME
from ..structs cimport LexemeC, TokenC
-from ..attrs cimport MORPH
+from ..attrs cimport MORPH, NORM
from ..vocab cimport Vocab
from .underscore import is_writable_attr
@@ -436,6 +436,7 @@ def set_token_attrs(Token py_token, attrs):
# Set attributes on both token and lexeme to take care of token
# attribute vs. lexical attribute without having to enumerate
# them. If an attribute name is not valid, set_struct_attr will
-# ignore it.
+# ignore it. Exception: set NORM only on tokens.
Token.set_struct_attr(token, attr_name, attr_value)
-Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)
+if attr_name != NORM:
+    Lexeme.set_struct_attr(<LexemeC*>lex, attr_name, attr_value)