Add note on tags matching tokenization (see #1613)

2017-11-20 15:12:47 +01:00 · 2017-11-20 15:12:47 +01:00 · ec08996000
parent ac235c0baf
commit ec08996000
1 changed files with 5 additions and 2 deletions
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -30,8 +30,11 @@ TAG_MAP = {
    'J': {'pos': 'ADJ'}
 }

-# Usually you'll read this in, of course. Data formats vary.
-# Ensure your strings are unicode.
+# Usually you'll read this in, of course. Data formats vary. Ensure your
+# strings are unicode and that the number of tags assigned matches spaCy's
+# tokenization. If not, you can always add a 'words' key to the annotations
+# that specifies the gold-standard tokenization, e.g.:
+# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
 TRAIN_DATA = [
    ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
    ("Eat blue ham", {'tags': ['V', 'J', 'N']})