From ec0899600047859c8acc040837a88b0381a13d7d Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Mon, 20 Nov 2017 15:12:47 +0100
Subject: [PATCH] Add note on tags matching tokenization (see #1613)

---
 examples/training/train_tagger.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index e893cb4e4..6eb7213cf 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -30,8 +30,11 @@ TAG_MAP = {
     'J': {'pos': 'ADJ'}
 }
 
-# Usually you'll read this in, of course. Data formats vary.
-# Ensure your strings are unicode.
+# Usually you'll read this in, of course. Data formats vary. Ensure your
+# strings are unicode and that the number of tags assigned matches spaCy's
+# tokenization. If not, you can always add a 'words' key to the annotations
+# that specifies the gold-standard tokenization, e.g.:
+# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
 TRAIN_DATA = [
     ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}),
     ("Eat blue ham", {'tags': ['V', 'J', 'N']})