From ec0899600047859c8acc040837a88b0381a13d7d Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 20 Nov 2017 15:12:47 +0100 Subject: [PATCH] Add note on tags matching tokenization (see #1613) --- examples/training/train_tagger.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index e893cb4e4..6eb7213cf 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -30,8 +30,11 @@ TAG_MAP = { 'J': {'pos': 'ADJ'} } -# Usually you'll read this in, of course. Data formats vary. -# Ensure your strings are unicode. +# Usually you'll read this in, of course. Data formats vary. Ensure your +# strings are unicode and that the number of tags assigned matches spaCy's +# tokenization. If not, you can always add a 'words' key to the annotations +# that specifies the gold-standard tokenization, e.g.: +# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']}) TRAIN_DATA = [ ("I like green eggs", {'tags': ['N', 'V', 'J', 'N']}), ("Eat blue ham", {'tags': ['V', 'J', 'N']})