Improve exceptions for 'd (would/had) in English (#5379)

Instead of having the tokenizer exceptions always treat `'d` in
contractions like `I'd` as `would`, leave the tagging and lemmatization
up to later components, since `'d` can stand for either `would` or `had`.
This commit is contained in:
adrianeboyd 2020-05-08 15:10:57 +02:00 committed by GitHub
parent d4cc18b746
commit 440b81bddc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 6 additions and 3 deletions

View File

@@ -77,12 +77,12 @@ for pron in ["i", "you", "he", "she", "it", "we", "they"]:
_exc[orth + "'d"] = [ _exc[orth + "'d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "'d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "'d", NORM: "'d"},
] ]
_exc[orth + "d"] = [ _exc[orth + "d"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ORTH: "d", LEMMA: "would", NORM: "would", TAG: "MD"}, {ORTH: "d", NORM: "'d"},
] ]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
@@ -195,7 +195,10 @@ for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
{ORTH: "'d", NORM: "'d"}, {ORTH: "'d", NORM: "'d"},
] ]
_exc[orth + "d"] = [{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: "d"}] _exc[orth + "d"] = [
{ORTH: orth, LEMMA: word, NORM: word},
{ORTH: "d", NORM: "'d"}
]
_exc[orth + "'d've"] = [ _exc[orth + "'d've"] = [
{ORTH: orth, LEMMA: word, NORM: word}, {ORTH: orth, LEMMA: word, NORM: word},