From 704c7442e010d28d9d6f1c3173a90e4e970d13de Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Dec 2016 15:35:36 +0100 Subject: [PATCH] Break language data components into their own files --- spacy/en/stop_words.py | 67 ++++++++ spacy/en/tag_map.py | 64 ++++++++ ...nguage_data.py => tokenizer_exceptions.py} | 153 ------------------ 3 files changed, 131 insertions(+), 153 deletions(-) create mode 100644 spacy/en/stop_words.py create mode 100644 spacy/en/tag_map.py rename spacy/en/{language_data.py => tokenizer_exceptions.py} (88%) diff --git a/spacy/en/stop_words.py b/spacy/en/stop_words.py new file mode 100644 index 000000000..1b00eb974 --- /dev/null +++ b/spacy/en/stop_words.py @@ -0,0 +1,67 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set(""" +a about above across after afterwards again against all almost alone along +already also although always am among amongst amount an and another any anyhow +anyone anything anyway anywhere are around as at + +back be became because become becomes becoming been before beforehand behind +being below beside besides between beyond both bottom but by + +call can cannot ca could + +did do does doing done down due during + +each eight either eleven else elsewhere empty enough etc even ever every +everyone everything everywhere except + +few fifteen fifty first five for former formerly forty four from front full +further + +get give go + +had has have he hence her here hereafter hereby herein hereupon hers herself +him himself his how however hundred + +i if in inc indeed into is it its itself + +keep + +last latter latterly least less + +just + +made make many may me meanwhile might mine more moreover most mostly move much +must my myself + +name namely neither never nevertheless next nine no nobody none noone nor not +nothing now nowhere + +of off often on once one only onto or other others otherwise our ours ourselves +out over own + +part per perhaps please put + +quite + +rather re really regarding + +same say see seem seemed seeming seems serious several she should show side +since six sixty so some somehow someone something sometime sometimes somewhere +still such + +take ten than that the their them themselves then thence there thereafter +thereby therefore therein thereupon these they third this those though three +through throughout thru thus to together too top toward towards twelve twenty +two + +under until up unless upon us used using + +various very very via was we well were what whatever when whence whenever where +whereafter whereas whereby wherein whereupon wherever whether which while +whither who whoever whole whom whose why will with within without would + +yet you your yours yourself yourselves +""".split()) diff --git a/spacy/en/tag_map.py b/spacy/en/tag_map.py new file mode 100644 index 000000000..7a3589d0e --- /dev/null +++ b/spacy/en/tag_map.py @@ -0,0 +1,64 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TAG_MAP = { + ".": {POS: PUNCT, "PunctType": "peri"}, + ",": {POS: PUNCT, "PunctType": "comm"}, + "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"}, + "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"}, + "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"}, + "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, + ":": {POS: PUNCT}, + "$": {POS: SYM, "Other": {"SymType": "currency"}}, + "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, + "AFX": {POS: ADJ, "Hyph": "yes"}, + "CC": {POS: CONJ, "ConjType": "coor"}, + "CD": {POS: NUM, "NumType": "card"}, + "DT": {POS: DET}, + "EX": {POS: ADV, "AdvType": "ex"}, + "FW": {POS: X, "Foreign": "yes"}, + "HYPH": {POS: PUNCT, "PunctType": "dash"}, + "IN": {POS: ADP}, + "JJ": {POS: ADJ, "Degree": "pos"}, + "JJR": {POS: ADJ, "Degree": "comp"}, + "JJS": {POS: ADJ, "Degree": "sup"}, + "LS": {POS: PUNCT, "NumType": "ord"}, + "MD": {POS: VERB, "VerbType": "mod"}, + "NIL": {POS: ""}, + "NN": {POS: NOUN, "Number": "sing"}, + "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, + "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, + "NNS": {POS: NOUN, "Number": "plur"}, + "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, + "POS": {POS: PART, "Poss": "yes"}, + "PRP": {POS: PRON, "PronType": "prs"}, + "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, + "RB": {POS: ADV, "Degree": "pos"}, + "RBR": {POS: ADV, "Degree": "comp"}, + "RBS": {POS: ADV, "Degree": "sup"}, + "RP": {POS: PART}, + "SYM": {POS: SYM}, + "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, + "UH": {POS: INTJ}, + "VB": {POS: VERB, "VerbForm": "inf"}, + "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, + "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, + "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, + "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, + "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3}, + "WDT": {POS: ADJ, "PronType": "int|rel"}, + "WP": {POS: NOUN, "PronType": "int|rel"}, + "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, + "WRB": {POS: ADV, "PronType": "int|rel"}, + "SP": {POS: SPACE}, + "ADD": {POS: X}, + "NFP": {POS: PUNCT}, + "GW": {POS: X}, + "XX": {POS: X}, + "BES": {POS: VERB}, + "HVS": {POS: VERB} +} diff --git a/spacy/en/language_data.py b/spacy/en/tokenizer_exceptions.py similarity index 88% rename from spacy/en/language_data.py rename to spacy/en/tokenizer_exceptions.py index 3daa00767..56cc1d7fa 100644 --- a/spacy/en/language_data.py +++ b/spacy/en/tokenizer_exceptions.py @@ -3,159 +3,6 @@ from __future__ import unicode_literals from ..symbols import * from ..language_data import PRON_LEMMA -from ..language_data import TOKENIZER_PREFIXES -from ..language_data import TOKENIZER_SUFFIXES -from ..language_data import TOKENIZER_INFIXES - - -def get_time_exc(hours): - exc = {} - for hour in hours: - exc["%da.m." % hour] = [ - {ORTH: hour}, - {ORTH: "a.m."} - ] - - exc["%dp.m." % hour] = [ - {ORTH: hour}, - {ORTH: "p.m."} - ] - - exc["%dam" % hour] = [ - {ORTH: hour}, - {ORTH: "am", LEMMA: "a.m."} - ] - - exc["%dpm" % hour] = [ - {ORTH: hour}, - {ORTH: "pm", LEMMA: "p.m."} - ] - return exc - - -TAG_MAP = { - ".": {POS: PUNCT, "PunctType": "peri"}, - ",": {POS: PUNCT, "PunctType": "comm"}, - "-LRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "ini"}, - "-RRB-": {POS: PUNCT, "PunctType": "brck", "PunctSide": "fin"}, - "``": {POS: PUNCT, "PunctType": "quot", "PunctSide": "ini"}, - "\"\"": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, - "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, - ":": {POS: PUNCT}, - "$": {POS: SYM, "Other": {"SymType": "currency"}}, - "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, - "AFX": {POS: ADJ, "Hyph": "yes"}, - "CC": {POS: CONJ, "ConjType": "coor"}, - "CD": {POS: NUM, "NumType": "card"}, - "DT": {POS: DET}, - "EX": {POS: ADV, "AdvType": "ex"}, - "FW": {POS: X, "Foreign": "yes"}, - "HYPH": {POS: PUNCT, "PunctType": "dash"}, - "IN": {POS: ADP}, - "JJ": {POS: ADJ, "Degree": "pos"}, - "JJR": {POS: ADJ, "Degree": "comp"}, - "JJS": {POS: ADJ, "Degree": "sup"}, - "LS": {POS: PUNCT, "NumType": "ord"}, - "MD": {POS: VERB, "VerbType": "mod"}, - "NIL": {POS: ""}, - "NN": {POS: NOUN, "Number": "sing"}, - "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, - "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, - "NNS": {POS: NOUN, "Number": "plur"}, - "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, - "POS": {POS: PART, "Poss": "yes"}, - "PRP": {POS: PRON, "PronType": "prs"}, - "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, - "RB": {POS: ADV, "Degree": "pos"}, - "RBR": {POS: ADV, "Degree": "comp"}, - "RBS": {POS: ADV, "Degree": "sup"}, - "RP": {POS: PART}, - "SYM": {POS: SYM}, - "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, - "UH": {POS: INTJ}, - "VB": {POS: VERB, "VerbForm": "inf"}, - "VBD": {POS: VERB, "VerbForm": "fin", "Tense": "past"}, - "VBG": {POS: VERB, "VerbForm": "part", "Tense": "pres", "Aspect": "prog"}, - "VBN": {POS: VERB, "VerbForm": "part", "Tense": "past", "Aspect": "perf"}, - "VBP": {POS: VERB, "VerbForm": "fin", "Tense": "pres"}, - "VBZ": {POS: VERB, "VerbForm": "fin", "Tense": "pres", "Number": "sing", "Person": 3}, - "WDT": {POS: ADJ, "PronType": "int|rel"}, - "WP": {POS: NOUN, "PronType": "int|rel"}, - "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, - "WRB": {POS: ADV, "PronType": "int|rel"}, - "SP": {POS: SPACE}, - "ADD": {POS: X}, - "NFP": {POS: PUNCT}, - "GW": {POS: X}, - "XX": {POS: X}, - "BES": {POS: VERB}, - "HVS": {POS: VERB} -} - - -STOP_WORDS = set(""" -a about above across after afterwards again against all almost alone along -already also although always am among amongst amount an and another any anyhow -anyone anything anyway anywhere are around as at - -back be became because become becomes becoming been before beforehand behind -being below beside besides between beyond both bottom but by - -call can cannot ca could - -did do does doing done down due during - -each eight either eleven else elsewhere empty enough etc even ever every -everyone everything everywhere except - -few fifteen fifty first five for former formerly forty four from front full -further - -get give go - -had has have he hence her here hereafter hereby herein hereupon hers herself -him himself his how however hundred - -i if in inc indeed into is it its itself - -keep - -last latter latterly least less - -just - -made make many may me meanwhile might mine more moreover most mostly move much -must my myself - -name namely neither never nevertheless next nine no nobody none noone nor not -nothing now nowhere - -of off often on once one only onto or other others otherwise our ours ourselves -out over own - -part per perhaps please put - -quite - -rather re really regarding - -same say see seem seemed seeming seems serious several she should show side -since six sixty so some somehow someone something sometime sometimes somewhere -still such - -take ten than that the their them themselves then thence there thereafter -thereby therefore therein thereupon these they third this those though three -through throughout thru thus to together too top toward towards twelve twenty -two - -under until up unless upon us used using - -various very very via was we well were what whatever when whence whenever where -whereafter whereas whereby wherein whereupon wherever whether which while -whither who whoever whole whom whose why will with within without would - -yet you your yours yourself yourselves -""".split()) TOKENIZER_EXCEPTIONS = {