From a22322187f3f6b83541db2b64488c66efdf724d3 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Sat, 17 Dec 2016 12:42:41 +0100
Subject: [PATCH] Add missing lemmas to tokenizer exceptions (fixes #674)

---
 spacy/en/language_data.py | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/spacy/en/language_data.py b/spacy/en/language_data.py
index edf44468b..48190d8ad 100644
--- a/spacy/en/language_data.py
+++ b/spacy/en/language_data.py
@@ -229,7 +229,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "Who're": [
         {ORTH: "Who"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "Ain't": [
@@ -376,7 +376,7 @@ TOKENIZER_EXCEPTIONS = {
     ],
 
     "Shan't": [
-        {ORTH: "Sha"},
+        {ORTH: "Sha", LEMMA: "shall"},
         {ORTH: "n't", LEMMA: "not", TAG: "RB"}
     ],
 
@@ -474,7 +474,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "who're": [
         {ORTH: "who"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "Whys": [
@@ -718,7 +718,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "what're": [
         {ORTH: "what"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "Wasn't": [
@@ -918,7 +918,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "What're": [
         {ORTH: "What"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "He'll": [
@@ -933,7 +933,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "They're": [
         {ORTH: "They", LEMMA: PRON_LEMMA},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "shouldnt": [
@@ -997,7 +997,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "they're": [
         {ORTH: "they", LEMMA: PRON_LEMMA},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "idve": [
@@ -1048,7 +1048,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "You're": [
         {ORTH: "You", LEMMA: PRON_LEMMA},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "she'll": [
@@ -1083,13 +1083,13 @@ TOKENIZER_EXCEPTIONS = {
     ],
 
     "won't": [
-        {ORTH: "wo"},
+        {ORTH: "wo", LEMMA: "will"},
         {ORTH: "n't", LEMMA: "not", TAG: "RB"}
     ],
 
     "We're": [
-        {ORTH: "We"},
-        {ORTH: "'re"}
+        {ORTH: "We", LEMMA: PRON_LEMMA},
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "\u2018S": [
@@ -1348,7 +1348,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "why're": [
         {ORTH: "why"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "Doesnt": [
@@ -1393,7 +1393,7 @@ TOKENIZER_EXCEPTIONS = {
 
     "you're": [
         {ORTH: "you", LEMMA: PRON_LEMMA},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "They've": [
@@ -1457,7 +1457,7 @@ TOKENIZER_EXCEPTIONS = {
     ],
 
     "Won't": [
-        {ORTH: "Wo"},
+        {ORTH: "Wo", LEMMA: "will"},
         {ORTH: "n't", LEMMA: "not", TAG: "RB"}
     ],
 
@@ -1602,8 +1602,8 @@ TOKENIZER_EXCEPTIONS = {
     ],
 
     "we're": [
-        {ORTH: "we"},
-        {ORTH: "'re"}
+        {ORTH: "we", LEMMA: PRON_LEMMA},
+        {ORTH: "'re", LEMMA: "be"}
     ],
 
     "Hadnt": [
@@ -1824,7 +1824,7 @@ TOKENIZER_EXCEPTIONS = {
     ],
 
     "shan't": [
-        {ORTH: "sha"},
+        {ORTH: "sha", LEMMA: "shall"},
         {ORTH: "n't", LEMMA: "not", TAG: "RB"}
     ],
 
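
A minimal sketch (not part of the patch) of how the effect of these exception entries could be checked, assuming a spaCy 1.x installation with the English data already downloaded; the example sentences are illustrative only:

# Sketch: tokenize a few contractions touched by this patch and print each
# token's surface form and lemma. Assumes spaCy 1.x with English data installed.
from __future__ import unicode_literals, print_function

import spacy

nlp = spacy.load('en')

for text in ["We're here.", "I won't go.", "They shan't know."]:
    doc = nlp(text)
    # With the lemmas added by this patch, the tokenizer exceptions should
    # yield lemma "be" for "'re", "will" for "wo" and "shall" for "sha".
    print(text, [(token.orth_, token.lemma_) for token in doc])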