mirror of https://github.com/explosion/spaCy.git
Break language data components into their own files
This commit is contained in:
parent
62655fd36f
commit
704c7442e0
|
@ -0,0 +1,67 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
STOP_WORDS = set("""
|
||||
a about above across after afterwards again against all almost alone along
|
||||
already also although always am among amongst amount an and another any anyhow
|
||||
anyone anything anyway anywhere are around as at
|
||||
|
||||
back be became because become becomes becoming been before beforehand behind
|
||||
being below beside besides between beyond both bottom but by
|
||||
|
||||
call can cannot ca could
|
||||
|
||||
did do does doing done down due during
|
||||
|
||||
each eight either eleven else elsewhere empty enough etc even ever every
|
||||
everyone everything everywhere except
|
||||
|
||||
few fifteen fifty first five for former formerly forty four from front full
|
||||
further
|
||||
|
||||
get give go
|
||||
|
||||
had has have he hence her here hereafter hereby herein hereupon hers herself
|
||||
him himself his how however hundred
|
||||
|
||||
i if in inc indeed into is it its itself
|
||||
|
||||
keep
|
||||
|
||||
last latter latterly least less
|
||||
|
||||
just
|
||||
|
||||
made make many may me meanwhile might mine more moreover most mostly move much
|
||||
must my myself
|
||||
|
||||
name namely neither never nevertheless next nine no nobody none noone nor not
|
||||
nothing now nowhere
|
||||
|
||||
of off often on once one only onto or other others otherwise our ours ourselves
|
||||
out over own
|
||||
|
||||
part per perhaps please put
|
||||
|
||||
quite
|
||||
|
||||
rather re really regarding
|
||||
|
||||
same say see seem seemed seeming seems serious several she should show side
|
||||
since six sixty so some somehow someone something sometime sometimes somewhere
|
||||
still such
|
||||
|
||||
take ten than that the their them themselves then thence there thereafter
|
||||
thereby therefore therein thereupon these they third this those though three
|
||||
through throughout thru thus to together too top toward towards twelve twenty
|
||||
two
|
||||
|
||||
under until up unless upon us used using
|
||||
|
||||
various very very via was we well were what whatever when whence whenever where
|
||||
whereafter whereas whereby wherein whereupon wherever whether which while
|
||||
whither who whoever whole whom whose why will with within without would
|
||||
|
||||
yet you your yours yourself yourselves
|
||||
""".split())
|
|
@ -0,0 +1,64 @@
|
|||
# encoding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..symbols import *
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
".": {POS: PUNCT, "PunctType": "peri"},
|
||||
",": {POS: PUNCT, "PunctType": "comm"},
|
||||
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
|
||||
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
|
||||
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
|
||||
"\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
":": {POS: PUNCT},
|
||||
"$": {POS: SYM, "Other": {"SymType": "currency"}},
|
||||
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
||||
"AFX": {POS: ADJ, "Hyph": "yes"},
|
||||
"CC": {POS: CONJ, "ConjType": "coor"},
|
||||
"CD": {POS: NUM, "NumType": "card"},
|
||||
"DT": {POS: DET},
|
||||
"EX": {POS: ADV, "AdvType": "ex"},
|
||||
"FW": {POS: X, "Foreign": "yes"},
|
||||
"HYPH": {POS: PUNCT, "PunctType": "dash"},
|
||||
"IN": {POS: ADP},
|
||||
"JJ": {POS: ADJ, "Degree": "pos"},
|
||||
"JJR": {POS: ADJ, "Degree": "comp"},
|
||||
"JJS": {POS: ADJ, "Degree": "sup"},
|
||||
"LS": {POS: PUNCT, "NumType": "ord"},
|
||||
"MD": {POS: VERB, "VerbType": "mod"},
|
||||
"NIL": {POS: ""},
|
||||
"NN": {POS: NOUN, "Number": "sing"},
|
||||
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
||||
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
||||
"NNS": {POS: NOUN, "Number": "plur"},
|
||||
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
|
||||
"POS": {POS: PART, "Poss": "yes"},
|
||||
"PRP": {POS: PRON, "PronType": "prs"},
|
||||
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
|
||||
"RB": {POS: ADV, "Degree": "pos"},
|
||||
"RBR": {POS: ADV, "Degree": "comp"},
|
||||
"RBS": {POS: ADV, "Degree": "sup"},
|
||||
"RP": {POS: PART},
|
||||
"SYM": {POS: SYM},
|
||||
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
|
||||
"UH": {POS: INTJ},
|
||||
"VB": {POS: VERB, "VerbForm": "inf"},
|
||||
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
|
||||
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
|
||||
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
|
||||
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
|
||||
"VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
|
||||
"WDT": {POS: ADJ, "PronType": "int|rel"},
|
||||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||
"SP": {POS: SPACE},
|
||||
"ADD": {POS: X},
|
||||
"NFP": {POS: PUNCT},
|
||||
"GW": {POS: X},
|
||||
"XX": {POS: X},
|
||||
"BES": {POS: VERB},
|
||||
"HVS": {POS: VERB}
|
||||
}
|
|
@ -3,159 +3,6 @@ from __future__ import unicode_literals
|
|||
|
||||
from ..symbols import *
|
||||
from ..language_data import PRON_LEMMA
|
||||
from ..language_data import TOKENIZER_PREFIXES
|
||||
from ..language_data import TOKENIZER_SUFFIXES
|
||||
from ..language_data import TOKENIZER_INFIXES
|
||||
|
||||
|
||||
def get_time_exc(hours):
|
||||
exc = {}
|
||||
for hour in hours:
|
||||
exc["%da.m." % hour] = [
|
||||
{ORTH: hour},
|
||||
{ORTH: "a.m."}
|
||||
]
|
||||
|
||||
exc["%dp.m." % hour] = [
|
||||
{ORTH: hour},
|
||||
{ORTH: "p.m."}
|
||||
]
|
||||
|
||||
exc["%dam" % hour] = [
|
||||
{ORTH: hour},
|
||||
{ORTH: "am", LEMMA: "a.m."}
|
||||
]
|
||||
|
||||
exc["%dpm" % hour] = [
|
||||
{ORTH: hour},
|
||||
{ORTH: "pm", LEMMA: "p.m."}
|
||||
]
|
||||
return exc
|
||||
|
||||
|
||||
TAG_MAP = {
|
||||
".": {POS: PUNCT, "PunctType": "peri"},
|
||||
",": {POS: PUNCT, "PunctType": "comm"},
|
||||
"-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"},
|
||||
"-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"},
|
||||
"``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"},
|
||||
"\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
|
||||
":": {POS: PUNCT},
|
||||
"$": {POS: SYM, "Other": {"SymType": "currency"}},
|
||||
"#": {POS: SYM, "Other": {"SymType": "numbersign"}},
|
||||
"AFX": {POS: ADJ, "Hyph": "yes"},
|
||||
"CC": {POS: CONJ, "ConjType": "coor"},
|
||||
"CD": {POS: NUM, "NumType": "card"},
|
||||
"DT": {POS: DET},
|
||||
"EX": {POS: ADV, "AdvType": "ex"},
|
||||
"FW": {POS: X, "Foreign": "yes"},
|
||||
"HYPH": {POS: PUNCT, "PunctType": "dash"},
|
||||
"IN": {POS: ADP},
|
||||
"JJ": {POS: ADJ, "Degree": "pos"},
|
||||
"JJR": {POS: ADJ, "Degree": "comp"},
|
||||
"JJS": {POS: ADJ, "Degree": "sup"},
|
||||
"LS": {POS: PUNCT, "NumType": "ord"},
|
||||
"MD": {POS: VERB, "VerbType": "mod"},
|
||||
"NIL": {POS: ""},
|
||||
"NN": {POS: NOUN, "Number": "sing"},
|
||||
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
|
||||
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
|
||||
"NNS": {POS: NOUN, "Number": "plur"},
|
||||
"PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"},
|
||||
"POS": {POS: PART, "Poss": "yes"},
|
||||
"PRP": {POS: PRON, "PronType": "prs"},
|
||||
"PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"},
|
||||
"RB": {POS: ADV, "Degree": "pos"},
|
||||
"RBR": {POS: ADV, "Degree": "comp"},
|
||||
"RBS": {POS: ADV, "Degree": "sup"},
|
||||
"RP": {POS: PART},
|
||||
"SYM": {POS: SYM},
|
||||
"TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"},
|
||||
"UH": {POS: INTJ},
|
||||
"VB": {POS: VERB, "VerbForm": "inf"},
|
||||
"VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"},
|
||||
"VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"},
|
||||
"VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"},
|
||||
"VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"},
|
||||
"VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3},
|
||||
"WDT": {POS: ADJ, "PronType": "int|rel"},
|
||||
"WP": {POS: NOUN, "PronType": "int|rel"},
|
||||
"WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
|
||||
"WRB": {POS: ADV, "PronType": "int|rel"},
|
||||
"SP": {POS: SPACE},
|
||||
"ADD": {POS: X},
|
||||
"NFP": {POS: PUNCT},
|
||||
"GW": {POS: X},
|
||||
"XX": {POS: X},
|
||||
"BES": {POS: VERB},
|
||||
"HVS": {POS: VERB}
|
||||
}
|
||||
|
||||
|
||||
STOP_WORDS = set("""
|
||||
a about above across after afterwards again against all almost alone along
|
||||
already also although always am among amongst amount an and another any anyhow
|
||||
anyone anything anyway anywhere are around as at
|
||||
|
||||
back be became because become becomes becoming been before beforehand behind
|
||||
being below beside besides between beyond both bottom but by
|
||||
|
||||
call can cannot ca could
|
||||
|
||||
did do does doing done down due during
|
||||
|
||||
each eight either eleven else elsewhere empty enough etc even ever every
|
||||
everyone everything everywhere except
|
||||
|
||||
few fifteen fifty first five for former formerly forty four from front full
|
||||
further
|
||||
|
||||
get give go
|
||||
|
||||
had has have he hence her here hereafter hereby herein hereupon hers herself
|
||||
him himself his how however hundred
|
||||
|
||||
i if in inc indeed into is it its itself
|
||||
|
||||
keep
|
||||
|
||||
last latter latterly least less
|
||||
|
||||
just
|
||||
|
||||
made make many may me meanwhile might mine more moreover most mostly move much
|
||||
must my myself
|
||||
|
||||
name namely neither never nevertheless next nine no nobody none noone nor not
|
||||
nothing now nowhere
|
||||
|
||||
of off often on once one only onto or other others otherwise our ours ourselves
|
||||
out over own
|
||||
|
||||
part per perhaps please put
|
||||
|
||||
quite
|
||||
|
||||
rather re really regarding
|
||||
|
||||
same say see seem seemed seeming seems serious several she should show side
|
||||
since six sixty so some somehow someone something sometime sometimes somewhere
|
||||
still such
|
||||
|
||||
take ten than that the their them themselves then thence there thereafter
|
||||
thereby therefore therein thereupon these they third this those though three
|
||||
through throughout thru thus to together too top toward towards twelve twenty
|
||||
two
|
||||
|
||||
under until up unless upon us used using
|
||||
|
||||
various very very via was we well were what whatever when whence whenever where
|
||||
whereafter whereas whereby wherein whereupon wherever whether which while
|
||||
whither who whoever whole whom whose why will with within without would
|
||||
|
||||
yet you your yours yourself yourselves
|
||||
""".split())
|
||||
|
||||
|
||||
TOKENIZER_EXCEPTIONS = {
|
Loading…
Reference in New Issue