spaCy/spacy/lang/en/morph_rules.py

494 lines
12 KiB
Python
Raw Normal View History

2017-03-12 12:07:28 +00:00
# coding: utf8
2016-12-18 14:36:15 +00:00
from __future__ import unicode_literals
2019-04-01 10:11:27 +00:00
from ...symbols import LEMMA, PRON_LEMMA
2016-12-18 14:36:15 +00:00
# Several entries here look pretty suspicious. These will get the POS SCONJ
# given the tag IN, when an adpositional reading seems much more likely for
# a lot of these prepositions. I'm not sure what I was running in 04395ffa4
# when I did this? It doesn't seem right.
# Word forms that the "IN" entry of MORPH_RULES maps to the POS "SCONJ".
# Entries that are commented out are disabled and are kept only as a record
# of what was removed. The misspelled forms (e.g. "becuse", "seince") are
# literal entries; presumably they are deliberate -- confirm before "fixing".
_subordinating_conjunctions = [
    "that",
    "if",
    "as",
    "because",
    # "of",
    # "for",
    # "before",
    # "in",
    "while",
    # "after",
    "since",
    "like",
    # "with",
    "so",
    # "to",
    # "by",
    # "on",
    # "about",
    "than",
    "whether",
    "although",
    # "from",
    "though",
    # "until",
    "unless",
    "once",
    # "without",
    # "at",
    # "into",
    "cause",
    # "over",
    "upon",
    "till",
    "whereas",
    # "beyond",
    "whilst",
    "except",
    "despite",
    "wether",
    # "then",
    "but",
    "becuse",
    "whie",
    # "below",
    # "against",
    "it",
    "w/out",
    # "toward",
    "albeit",
    "save",
    "besides",
    "becouse",
    "coz",
    "til",
    "ask",
    "i'd",
    "out",
    "near",
    "seince",
    # "towards",
    "tho",
    "sice",
    "will",
]
# This seems kind of wrong too?
# _relative_pronouns = ["this", "that", "those", "these"]
# Exception table with two levels of keys: the outer key is a tag string
# ("IN", "NN", "PRP", ...) and the inner key is a word form. Each inner value
# is a dict of attributes for that (tag, word) pair: a "POS" value, an optional
# LEMMA, and morphological features such as "PronType", "Person", "Number",
# "Case", "Gender", "Poss", "Reflex", "VerbForm", "Tense" and "Mood".
# NOTE(review): the tag keys look like Penn Treebank tags -- confirm.
MORPH_RULES = {
    # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
    "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
    "NN": {
        "something": {"POS": "PRON"},
        "anyone": {"POS": "PRON"},
        "anything": {"POS": "PRON"},
        "nothing": {"POS": "PRON"},
        "someone": {"POS": "PRON"},
        "everything": {"POS": "PRON"},
        "everyone": {"POS": "PRON"},
        "everybody": {"POS": "PRON"},
        "nobody": {"POS": "PRON"},
        "somebody": {"POS": "PRON"},
        "anybody": {"POS": "PRON"},
        "any1": {"POS": "PRON"},
    },
    "PRP": {
        "I": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Nom",
        },
        "me": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Acc",
        },
        "you": {LEMMA: PRON_LEMMA, "POS": "PRON", "PronType": "Prs", "Person": "Two"},
        "he": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "Case": "Nom",
        },
        "him": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "Case": "Acc",
        },
        "she": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "Case": "Nom",
        },
        "her": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "Case": "Acc",
        },
        "it": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Neut",
        },
        "we": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Case": "Nom",
        },
        "us": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Case": "Acc",
        },
        "they": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Nom",
        },
        "them": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Acc",
        },
        "mine": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "his": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "hers": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "its": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Neut",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "ours": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "yours": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Two",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "theirs": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Poss": "Yes",
            "Reflex": "Yes",
        },
        "myself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Sing",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "yourself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Two",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "himself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Case": "Acc",
            "Gender": "Masc",
            "Reflex": "Yes",
        },
        "herself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Case": "Acc",
            "Gender": "Fem",
            "Reflex": "Yes",
        },
        "itself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Case": "Acc",
            "Gender": "Neut",
            "Reflex": "Yes",
        },
        "themself": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Sing",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "ourselves": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "One",
            "Number": "Plur",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "yourselves": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Two",
            "Case": "Acc",
            "Reflex": "Yes",
        },
        "themselves": {
            LEMMA: PRON_LEMMA,
            "POS": "PRON",
            "PronType": "Prs",
            "Person": "Three",
            "Number": "Plur",
            "Case": "Acc",
            "Reflex": "Yes",
        },
    },
    # Unlike the "PRP" entries, these entries set no "POS" value.
    "PRP$": {
        "my": {
            LEMMA: PRON_LEMMA,
            "Person": "One",
            "Number": "Sing",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "your": {LEMMA: PRON_LEMMA, "Person": "Two", "PronType": "Prs", "Poss": "Yes"},
        "his": {
            LEMMA: PRON_LEMMA,
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Masc",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "her": {
            LEMMA: PRON_LEMMA,
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Fem",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "its": {
            LEMMA: PRON_LEMMA,
            "Person": "Three",
            "Number": "Sing",
            "Gender": "Neut",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "our": {
            LEMMA: PRON_LEMMA,
            "Person": "One",
            "Number": "Plur",
            "PronType": "Prs",
            "Poss": "Yes",
        },
        "their": {
            LEMMA: PRON_LEMMA,
            "Person": "Three",
            "Number": "Plur",
            "PronType": "Prs",
            "Poss": "Yes",
        },
    },
    # NOTE(review): "nt" appears twice in this word list. The duplicate is
    # harmless in a dict comprehension, but it may be a variant spelling
    # (e.g. one written with a typographic apostrophe) that was lost --
    # confirm against the upstream file before removing it.
    "RB": {word: {"POS": "PART"} for word in ["not", "n't", "nt", "nt"]},
    "VB": {
        word: {"POS": "AUX"}
        for word in ["be", "have", "do", "get", "of", "am", "are", "'ve"]
    },
    "VBN": {"been": {LEMMA: "be", "POS": "AUX"}},
    "VBG": {"being": {LEMMA: "be", "POS": "AUX"}},
    "VBZ": {
        "am": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "One",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "are": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "Two",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "is": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "Three",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "'re": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "Two",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "'s": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "Three",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "has": {LEMMA: "have", "POS": "AUX"},
        "does": {LEMMA: "do", "POS": "AUX"},
    },
    "VBP": {
        "are": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "'re": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "am": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Person": "One",
            "Tense": "Pres",
            "Mood": "Ind",
        },
        "do": {"POS": "AUX"},
        "have": {"POS": "AUX"},
        "'m": {"POS": "AUX", LEMMA: "be"},
        "'ve": {"POS": "AUX"},
        "'s": {"POS": "AUX"},
        "is": {"POS": "AUX"},
        "'d": {"POS": "AUX"},
    },
    "VBD": {
        "was": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Tense": "Past",
            "Number": "Sing",
        },
        "were": {
            LEMMA: "be",
            "POS": "AUX",
            "VerbForm": "Fin",
            "Tense": "Past",
            "Number": "Plur",
        },
        "did": {LEMMA: "do", "POS": "AUX"},
        "had": {LEMMA: "have", "POS": "AUX"},
        "'d": {LEMMA: "have", "POS": "AUX"},
    },
}
# Add a title-cased alias for every word form under every tag, so that e.g.
# "he" is also reachable as "He". The alias points at the very same attrs
# dict object as the original key (no copy is made).
# str.title() upper-cases the first letter of each run of letters, so
# "'re" becomes "'Re" and "w/out" becomes "W/Out".
for rules in MORPH_RULES.values():
    # Iterate over a snapshot: the loop body inserts new keys into `rules`,
    # and a dict must not change size while it is being iterated.
    for key, attrs in list(rules.items()):
        rules[key.title()] = attrs