spaCy/spacy/glossary.py

323 lines
12 KiB
Python

# coding: utf8
from __future__ import unicode_literals
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
term (unicode): The term to explain.
RETURNS (unicode): The explanation, or `None` if not found in the glossary.
EXAMPLE:
>>> spacy.explain(u'NORP')
>>> doc = nlp(u'Hello world')
>>> print([w.text, w.tag_, spacy.explain(w.tag_) for w in doc])
"""
if term in GLOSSARY:
return GLOSSARY[term]
GLOSSARY = {
# POS tags
# Universal POS Tags
# http://universaldependencies.org/u/pos/
"ADJ": "adjective",
"ADP": "adposition",
"ADV": "adverb",
"AUX": "auxiliary",
"CONJ": "conjunction",
"CCONJ": "coordinating conjunction",
"DET": "determiner",
"INTJ": "interjection",
"NOUN": "noun",
"NUM": "numeral",
"PART": "particle",
"PRON": "pronoun",
"PROPN": "proper noun",
"PUNCT": "punctuation",
"SCONJ": "subordinating conjunction",
"SYM": "symbol",
"VERB": "verb",
"X": "other",
"EOL": "end of line",
"SPACE": "space",
# POS tags (English)
# OntoNotes 5 / Penn Treebank
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
".": "punctuation mark, sentence closer",
",": "punctuation mark, comma",
"-LRB-": "left round bracket",
"-RRB-": "right round bracket",
"``": "opening quotation mark",
'""': "closing quotation mark",
"''": "closing quotation mark",
":": "punctuation mark, colon or ellipsis",
"$": "symbol, currency",
"#": "symbol, number sign",
"AFX": "affix",
"CC": "conjunction, coordinating",
"CD": "cardinal number",
"DT": "determiner",
"EX": "existential there",
"FW": "foreign word",
"HYPH": "punctuation mark, hyphen",
"IN": "conjunction, subordinating or preposition",
"JJ": "adjective",
"JJR": "adjective, comparative",
"JJS": "adjective, superlative",
"LS": "list item marker",
"MD": "verb, modal auxiliary",
"NIL": "missing tag",
"NN": "noun, singular or mass",
"NNP": "noun, proper singular",
"NNPS": "noun, proper plural",
"NNS": "noun, plural",
"PDT": "predeterminer",
"POS": "possessive ending",
"PRP": "pronoun, personal",
"PRP$": "pronoun, possessive",
"RB": "adverb",
"RBR": "adverb, comparative",
"RBS": "adverb, superlative",
"RP": "adverb, particle",
"TO": 'infinitival "to"',
"UH": "interjection",
"VB": "verb, base form",
"VBD": "verb, past tense",
"VBG": "verb, gerund or present participle",
"VBN": "verb, past participle",
"VBP": "verb, non-3rd person singular present",
"VBZ": "verb, 3rd person singular present",
"WDT": "wh-determiner",
"WP": "wh-pronoun, personal",
"WP$": "wh-pronoun, possessive",
"WRB": "wh-adverb",
"SP": "space",
"ADD": "email",
"NFP": "superfluous punctuation",
"GW": "additional word in multi-word expression",
"XX": "unknown",
"BES": 'auxiliary "be"',
"HVS": 'forms of "have"',
# POS Tags (German)
# TIGER Treebank
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
"$(": "other sentence-internal punctuation mark",
"$,": "comma",
"$.": "sentence-final punctuation mark",
"ADJA": "adjective, attributive",
"ADJD": "adjective, adverbial or predicative",
"APPO": "postposition",
"APPR": "preposition; circumposition left",
"APPRART": "preposition with article",
"APZR": "circumposition right",
"ART": "definite or indefinite article",
"CARD": "cardinal number",
"FM": "foreign language material",
"ITJ": "interjection",
"KOKOM": "comparative conjunction",
"KON": "coordinate conjunction",
"KOUI": 'subordinate conjunction with "zu" and infinitive',
"KOUS": "subordinate conjunction with sentence",
"NE": "proper noun",
"NNE": "proper noun",
"PAV": "pronominal adverb",
"PROAV": "pronominal adverb",
"PDAT": "attributive demonstrative pronoun",
"PDS": "substituting demonstrative pronoun",
"PIAT": "attributive indefinite pronoun without determiner",
"PIDAT": "attributive indefinite pronoun with determiner",
"PIS": "substituting indefinite pronoun",
"PPER": "non-reflexive personal pronoun",
"PPOSAT": "attributive possessive pronoun",
"PPOSS": "substituting possessive pronoun",
"PRELAT": "attributive relative pronoun",
"PRELS": "substituting relative pronoun",
"PRF": "reflexive personal pronoun",
"PTKA": "particle with adjective or adverb",
"PTKANT": "answer particle",
"PTKNEG": "negative particle",
"PTKVZ": "separable verbal particle",
"PTKZU": '"zu" before infinitive',
"PWAT": "attributive interrogative pronoun",
"PWAV": "adverbial interrogative or relative pronoun",
"PWS": "substituting interrogative pronoun",
"TRUNC": "word remnant",
"VAFIN": "finite verb, auxiliary",
"VAIMP": "imperative, auxiliary",
"VAINF": "infinitive, auxiliary",
"VAPP": "perfect participle, auxiliary",
"VMFIN": "finite verb, modal",
"VMINF": "infinitive, modal",
"VMPP": "perfect participle, modal",
"VVFIN": "finite verb, full",
"VVIMP": "imperative, full",
"VVINF": "infinitive, full",
"VVIZU": 'infinitive with "zu", full',
"VVPP": "perfect participle, full",
"XY": "non-word containing non-letter",
# Noun chunks
"NP": "noun phrase",
"PP": "prepositional phrase",
"VP": "verb phrase",
"ADVP": "adverb phrase",
"ADJP": "adjective phrase",
"SBAR": "subordinating conjunction",
"PRT": "particle",
"PNP": "prepositional noun phrase",
# Dependency Labels (English)
# ClearNLP / Universal Dependencies
# https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
"acl": "clausal modifier of noun (adjectival clause)",
"acomp": "adjectival complement",
"advcl": "adverbial clause modifier",
"advmod": "adverbial modifier",
"agent": "agent",
"amod": "adjectival modifier",
"appos": "appositional modifier",
"attr": "attribute",
"aux": "auxiliary",
"auxpass": "auxiliary (passive)",
"case": "case marking",
"cc": "coordinating conjunction",
"ccomp": "clausal complement",
"clf": "classifier",
"complm": "complementizer",
"compound": "compound",
"conj": "conjunct",
"cop": "copula",
"csubj": "clausal subject",
"csubjpass": "clausal subject (passive)",
"dative": "dative",
"dep": "unclassified dependent",
"det": "determiner",
"discourse": "discourse element",
"dislocated": "dislocated elements",
"dobj": "direct object",
"expl": "expletive",
"fixed": "fixed multiword expression",
"flat": "flat multiword expression",
"goeswith": "goes with",
"hmod": "modifier in hyphenation",
"hyph": "hyphen",
"infmod": "infinitival modifier",
"intj": "interjection",
"iobj": "indirect object",
"list": "list",
"mark": "marker",
"meta": "meta modifier",
"neg": "negation modifier",
"nmod": "modifier of nominal",
"nn": "noun compound modifier",
"npadvmod": "noun phrase as adverbial modifier",
"nsubj": "nominal subject",
"nsubjpass": "nominal subject (passive)",
"nounmod": "modifier of nominal",
"npmod": "noun phrase as adverbial modifier",
"num": "number modifier",
"number": "number compound modifier",
"nummod": "numeric modifier",
"oprd": "object predicate",
"obj": "object",
"obl": "oblique nominal",
"orphan": "orphan",
"parataxis": "parataxis",
"partmod": "participal modifier",
"pcomp": "complement of preposition",
"pobj": "object of preposition",
"poss": "possession modifier",
"possessive": "possessive modifier",
"preconj": "pre-correlative conjunction",
"prep": "prepositional modifier",
"prt": "particle",
"punct": "punctuation",
"quantmod": "modifier of quantifier",
"rcmod": "relative clause modifier",
"relcl": "relative clause modifier",
"reparandum": "overridden disfluency",
"root": "root",
"vocative": "vocative",
"xcomp": "open clausal complement",
# Dependency labels (German)
# TIGER Treebank
# http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
# currently missing: 'cc' (comparative complement) because of conflict
# with English labels
"ac": "adpositional case marker",
"adc": "adjective component",
"ag": "genitive attribute",
"ams": "measure argument of adjective",
"app": "apposition",
"avc": "adverbial phrase component",
"cd": "coordinating conjunction",
"cj": "conjunct",
"cm": "comparative conjunction",
"cp": "complementizer",
"cvc": "collocational verb construction",
"da": "dative",
"dh": "discourse-level head",
"dm": "discourse marker",
"ep": "expletive es",
"hd": "head",
"ju": "junctor",
"mnr": "postnominal modifier",
"mo": "modifier",
"ng": "negation",
"nk": "noun kernel element",
"nmc": "numerical component",
"oa": "accusative object",
"oc": "clausal object",
"og": "genitive object",
"op": "prepositional object",
"par": "parenthetical element",
"pd": "predicate",
"pg": "phrasal genitive",
"ph": "placeholder",
"pm": "morphological particle",
"pnc": "proper noun component",
"rc": "relative clause",
"re": "repeated element",
"rs": "reported speech",
"sb": "subject",
"sb": "subject",
"sbp": "passivized subject (PP)",
"sp": "subject or predicate",
"svp": "separable verb prefix",
"uc": "unit component",
"vo": "vocative",
# Named Entity Recognition
# OntoNotes 5
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
"PERSON": "People, including fictional",
"NORP": "Nationalities or religious or political groups",
"FACILITY": "Buildings, airports, highways, bridges, etc.",
"FAC": "Buildings, airports, highways, bridges, etc.",
"ORG": "Companies, agencies, institutions, etc.",
"GPE": "Countries, cities, states",
"LOC": "Non-GPE locations, mountain ranges, bodies of water",
"PRODUCT": "Objects, vehicles, foods, etc. (not services)",
"EVENT": "Named hurricanes, battles, wars, sports events, etc.",
"WORK_OF_ART": "Titles of books, songs, etc.",
"LAW": "Named documents made into laws.",
"LANGUAGE": "Any named language",
"DATE": "Absolute or relative dates or periods",
"TIME": "Times smaller than a day",
"PERCENT": 'Percentage, including "%"',
"MONEY": "Monetary values, including unit",
"QUANTITY": "Measurements, as of weight or distance",
"ORDINAL": '"first", "second", etc.',
"CARDINAL": "Numerals that do not fall under another type",
# Named Entity Recognition
# Wikipedia
# http://www.sciencedirect.com/science/article/pii/S0004370212000276
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
"PER": "Named person or family.",
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
# https://github.com/ltgoslo/norne
"EVT": "Festivals, cultural events, sports events, weather phenomena, wars, etc.",
"PROD": "Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas",
"DRV": "Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')",
"GPE_LOC": "Geo-political entity, with a locative sense, e.g. 'John lives in Spain'",
"GPE_ORG": "Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'",
}