Add missing lemmas to tokenizer exceptions (fixes #674)

Ines Montani 2016-12-17 12:42:41 +01:00
parent 5445074cbd
commit a22322187f
1 changed file with 17 additions and 17 deletions


@@ -229,7 +229,7 @@ TOKENIZER_EXCEPTIONS = {
     "Who're": [
         {ORTH: "Who"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "Ain't": [
@@ -376,7 +376,7 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "Shan't": [
-        {ORTH: "Sha"},
+        {ORTH: "Sha", LEMMA: "shall"},
         {ORTH: "n't", LEMMA: "not", TAG: "RB"}
     ],
@@ -474,7 +474,7 @@ TOKENIZER_EXCEPTIONS = {
     "who're": [
         {ORTH: "who"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "Whys": [
@@ -718,7 +718,7 @@ TOKENIZER_EXCEPTIONS = {
     "what're": [
         {ORTH: "what"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "Wasn't": [
@@ -918,7 +918,7 @@ TOKENIZER_EXCEPTIONS = {
     "What're": [
         {ORTH: "What"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "He'll": [
@@ -933,7 +933,7 @@ TOKENIZER_EXCEPTIONS = {
     "They're": [
         {ORTH: "They", LEMMA: PRON_LEMMA},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "shouldnt": [
@@ -997,7 +997,7 @@ TOKENIZER_EXCEPTIONS = {
     "they're": [
         {ORTH: "they", LEMMA: PRON_LEMMA},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "idve": [
@@ -1048,7 +1048,7 @@ TOKENIZER_EXCEPTIONS = {
     "You're": [
         {ORTH: "You", LEMMA: PRON_LEMMA},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "she'll": [
@@ -1083,13 +1083,13 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "won't": [
-        {ORTH: "wo"},
+        {ORTH: "wo", LEMMA: "will"},
         {ORTH: "n't", LEMMA: "not", TAG: "RB"}
     ],
     "We're": [
-        {ORTH: "We"},
-        {ORTH: "'re"}
+        {ORTH: "We", LEMMA: PRON_LEMMA},
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "\u2018S": [
@@ -1348,7 +1348,7 @@ TOKENIZER_EXCEPTIONS = {
     "why're": [
         {ORTH: "why"},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "Doesnt": [
@@ -1393,7 +1393,7 @@ TOKENIZER_EXCEPTIONS = {
     "you're": [
         {ORTH: "you", LEMMA: PRON_LEMMA},
-        {ORTH: "'re"}
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "They've": [
@@ -1457,7 +1457,7 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "Won't": [
-        {ORTH: "Wo"},
+        {ORTH: "Wo", LEMMA: "will"},
         {ORTH: "n't", LEMMA: "not", TAG: "RB"}
     ],
@@ -1602,8 +1602,8 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "we're": [
-        {ORTH: "we"},
-        {ORTH: "'re"}
+        {ORTH: "we", LEMMA: PRON_LEMMA},
+        {ORTH: "'re", LEMMA: "be"}
     ],
     "Hadnt": [
@@ -1824,7 +1824,7 @@ TOKENIZER_EXCEPTIONS = {
     ],
     "shan't": [
-        {ORTH: "sha"},
+        {ORTH: "sha", LEMMA: "shall"},
         {ORTH: "n't", LEMMA: "not", TAG: "RB"}
     ],
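
In practice, these LEMMA values surface on the contraction pieces at tokenization time. Below is a minimal spot-check, sketched under the assumption of a spaCy 1.x install that includes this change; the sample sentence and expected output are illustrative only.

# Spot-check the new lemmas on tokenizer special cases (a sketch, assuming
# a spaCy 1.x English install with these exceptions applied).
import spacy

nlp = spacy.load("en")                      # English pipeline; tokenizer uses TOKENIZER_EXCEPTIONS
doc = nlp("We won't go because they're busy")

for token in doc:
    # Pieces like "wo", "n't" and "'re" should now carry explicit lemmas
    print(token.text, token.lemma_)

# Expected for the pieces touched by this commit:
#   wo   -> will
#   n't  -> not
#   're  -> be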