spaCy/spacy/glossary.py

import warnings

from .errors import Warnings


def explain(term):
    """Get a description for a given POS tag, dependency label or entity type.

    term (str): The term to explain.
    RETURNS (str): The explanation, or `None` if not found in the glossary.

    EXAMPLE:
        >>> spacy.explain(u'NORP')
        >>> doc = nlp(u'Hello world')
        >>> print([w.text, w.tag_, spacy.explain(w.tag_) for w in doc])
    """
    if term in GLOSSARY:
        return GLOSSARY[term]
    else:
        warnings.warn(Warnings.W118.format(term=term))


GLOSSARY = {
    # POS tags
    # Universal POS Tags
    # http://universaldependencies.org/u/pos/
    "ADJ": "adjective",
    "ADP": "adposition",
    "ADV": "adverb",
    "AUX": "auxiliary",
    "CONJ": "conjunction",
    "CCONJ": "coordinating conjunction",
    "DET": "determiner",
    "INTJ": "interjection",
    "NOUN": "noun",
    "NUM": "numeral",
    "PART": "particle",
    "PRON": "pronoun",
    "PROPN": "proper noun",
    "PUNCT": "punctuation",
    "SCONJ": "subordinating conjunction",
    "SYM": "symbol",
    "VERB": "verb",
    "X": "other",
    "EOL": "end of line",
    "SPACE": "space",
    # POS tags (English)
    # OntoNotes 5 / Penn Treebank
    # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    ".": "punctuation mark, sentence closer",
    ",": "punctuation mark, comma",
    "-LRB-": "left round bracket",
    "-RRB-": "right round bracket",
    "``": "opening quotation mark",
    '""': "closing quotation mark",
    "''": "closing quotation mark",
    ":": "punctuation mark, colon or ellipsis",
    "$": "symbol, currency",
    "#": "symbol, number sign",
    "AFX": "affix",
    "CC": "conjunction, coordinating",
    "CD": "cardinal number",
    "DT": "determiner",
    "EX": "existential there",
    "FW": "foreign word",
    "HYPH": "punctuation mark, hyphen",
    "IN": "conjunction, subordinating or preposition",
    "JJ": "adjective (English), other noun-modifier (Chinese)",
    "JJR": "adjective, comparative",
    "JJS": "adjective, superlative",
    "LS": "list item marker",
    "MD": "verb, modal auxiliary",
    "NIL": "missing tag",
    "NN": "noun, singular or mass",
    "NNP": "noun, proper singular",
    "NNPS": "noun, proper plural",
    "NNS": "noun, plural",
    "PDT": "predeterminer",
    "POS": "possessive ending",
    "PRP": "pronoun, personal",
    "PRP$": "pronoun, possessive",
    "RB": "adverb",
    "RBR": "adverb, comparative",
    "RBS": "adverb, superlative",
    "RP": "adverb, particle",
    "TO": 'infinitival "to"',
    "UH": "interjection",
    "VB": "verb, base form",
    "VBD": "verb, past tense",
    "VBG": "verb, gerund or present participle",
    "VBN": "verb, past participle",
    "VBP": "verb, non-3rd person singular present",
    "VBZ": "verb, 3rd person singular present",
    "WDT": "wh-determiner",
    "WP": "wh-pronoun, personal",
    "WP$": "wh-pronoun, possessive",
    "WRB": "wh-adverb",
    "SP": "space (English), sentence-final particle (Chinese)",
    "ADD": "email",
    "NFP": "superfluous punctuation",
    "GW": "additional word in multi-word expression",
    "XX": "unknown",
    "BES": 'auxiliary "be"',
    "HVS": 'forms of "have"',
    "_SP": "whitespace",
    # POS Tags (German)
    # TIGER Treebank
    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
    "$(": "other sentence-internal punctuation mark",
    "$,": "comma",
    "$.": "sentence-final punctuation mark",
    "ADJA": "adjective, attributive",
    "ADJD": "adjective, adverbial or predicative",
    "APPO": "postposition",
    "APPR": "preposition; circumposition left",
    "APPRART": "preposition with article",
    "APZR": "circumposition right",
    "ART": "definite or indefinite article",
    "CARD": "cardinal number",
    "FM": "foreign language material",
    "ITJ": "interjection",
    "KOKOM": "comparative conjunction",
    "KON": "coordinate conjunction",
    "KOUI": 'subordinate conjunction with "zu" and infinitive',
    "KOUS": "subordinate conjunction with sentence",
    "NE": "proper noun",
    "NNE": "proper noun",
    "PAV": "pronominal adverb",
    "PROAV": "pronominal adverb",
    "PDAT": "attributive demonstrative pronoun",
    "PDS": "substituting demonstrative pronoun",
    "PIAT": "attributive indefinite pronoun without determiner",
    "PIDAT": "attributive indefinite pronoun with determiner",
    "PIS": "substituting indefinite pronoun",
    "PPER": "non-reflexive personal pronoun",
    "PPOSAT": "attributive possessive pronoun",
    "PPOSS": "substituting possessive pronoun",
    "PRELAT": "attributive relative pronoun",
    "PRELS": "substituting relative pronoun",
    "PRF": "reflexive personal pronoun",
    "PTKA": "particle with adjective or adverb",
    "PTKANT": "answer particle",
    "PTKNEG": "negative particle",
    "PTKVZ": "separable verbal particle",
    "PTKZU": '"zu" before infinitive',
    "PWAT": "attributive interrogative pronoun",
    "PWAV": "adverbial interrogative or relative pronoun",
    "PWS": "substituting interrogative pronoun",
    "TRUNC": "word remnant",
    "VAFIN": "finite verb, auxiliary",
    "VAIMP": "imperative, auxiliary",
    "VAINF": "infinitive, auxiliary",
    "VAPP": "perfect participle, auxiliary",
    "VMFIN": "finite verb, modal",
    "VMINF": "infinitive, modal",
    "VMPP": "perfect participle, modal",
    "VVFIN": "finite verb, full",
    "VVIMP": "imperative, full",
    "VVINF": "infinitive, full",
    "VVIZU": 'infinitive with "zu", full',
    "VVPP": "perfect participle, full",
    "XY": "non-word containing non-letter",
    # POS Tags (Chinese)
    # OntoNotes / Chinese Penn Treebank
    # https://repository.upenn.edu/cgi/viewcontent.cgi?article=1039&context=ircs_reports
    "AD": "adverb",
    "AS": "aspect marker",
    "BA": "把 in ba-construction",
    # "CD": "cardinal number",
    "CS": "subordinating conjunction",
    "DEC": "的 in a relative clause",
    "DEG": "associative 的",
    "DER": "得 in V-de const. and V-de-R",
    "DEV": "地 before VP",
    "ETC": "for words 等, 等等",
    # "FW": "foreign words"
    "IJ": "interjection",
    # "JJ": "other noun-modifier",
    "LB": "被 in long bei-const",
    "LC": "localizer",
    "M": "measure word",
    "MSP": "other particle",
    # "NN": "common noun",
    "NR": "proper noun",
    "NT": "temporal noun",
    "OD": "ordinal number",
    "ON": "onomatopoeia",
    "P": "preposition excluding 把 and 被",
    "PN": "pronoun",
    "PU": "punctuation",
    "SB": "被 in short bei-const",
    # "SP": "sentence-final particle",
    "VA": "predicative adjective",
    "VC": "是 (copula)",
    "VE": "有 as the main verb",
    "VV": "other verb",
    # Noun chunks
    "NP": "noun phrase",
    "PP": "prepositional phrase",
    "VP": "verb phrase",
    "ADVP": "adverb phrase",
    "ADJP": "adjective phrase",
    "SBAR": "subordinating conjunction",
    "PRT": "particle",
    "PNP": "prepositional noun phrase",
    # Dependency Labels (English)
    # ClearNLP / Universal Dependencies
    # https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
    "acl": "clausal modifier of noun (adjectival clause)",
    "acomp": "adjectival complement",
    "advcl": "adverbial clause modifier",
    "advmod": "adverbial modifier",
    "agent": "agent",
    "amod": "adjectival modifier",
    "appos": "appositional modifier",
    "attr": "attribute",
    "aux": "auxiliary",
    "auxpass": "auxiliary (passive)",
    "case": "case marking",
    "cc": "coordinating conjunction",
    "ccomp": "clausal complement",
    "clf": "classifier",
    "complm": "complementizer",
    "compound": "compound",
    "conj": "conjunct",
    "cop": "copula",
    "csubj": "clausal subject",
    "csubjpass": "clausal subject (passive)",
    "dative": "dative",
    "dep": "unclassified dependent",
    "det": "determiner",
    "discourse": "discourse element",
    "dislocated": "dislocated elements",
    "dobj": "direct object",
    "expl": "expletive",
    "fixed": "fixed multiword expression",
    "flat": "flat multiword expression",
    "goeswith": "goes with",
    "hmod": "modifier in hyphenation",
    "hyph": "hyphen",
    "infmod": "infinitival modifier",
    "intj": "interjection",
    "iobj": "indirect object",
    "list": "list",
    "mark": "marker",
    "meta": "meta modifier",
    "neg": "negation modifier",
    "nmod": "modifier of nominal",
    "nn": "noun compound modifier",
    "npadvmod": "noun phrase as adverbial modifier",
    "nsubj": "nominal subject",
    "nsubjpass": "nominal subject (passive)",
    "nounmod": "modifier of nominal",
    "npmod": "noun phrase as adverbial modifier",
    "num": "number modifier",
    "number": "number compound modifier",
    "nummod": "numeric modifier",
    "oprd": "object predicate",
    "obj": "object",
    "obl": "oblique nominal",
    "orphan": "orphan",
    "parataxis": "parataxis",
    "partmod": "participal modifier",
    "pcomp": "complement of preposition",
    "pobj": "object of preposition",
    "poss": "possession modifier",
    "possessive": "possessive modifier",
    "preconj": "pre-correlative conjunction",
    "prep": "prepositional modifier",
    "prt": "particle",
    "punct": "punctuation",
    "quantmod": "modifier of quantifier",
    "rcmod": "relative clause modifier",
    "relcl": "relative clause modifier",
    "reparandum": "overridden disfluency",
    "root": "root",
    "ROOT": "root",
    "vocative": "vocative",
    "xcomp": "open clausal complement",
    # Dependency labels (German)
    # TIGER Treebank
    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
    # currently missing: 'cc' (comparative complement) because of conflict
    # with English labels
    "ac": "adpositional case marker",
    "adc": "adjective component",
    "ag": "genitive attribute",
    "ams": "measure argument of adjective",
    "app": "apposition",
    "avc": "adverbial phrase component",
    "cd": "coordinating conjunction",
    "cj": "conjunct",
    "cm": "comparative conjunction",
    "cp": "complementizer",
    "cvc": "collocational verb construction",
    "da": "dative",
    "dh": "discourse-level head",
    "dm": "discourse marker",
    "ep": "expletive es",
    "hd": "head",
    "ju": "junctor",
    "mnr": "postnominal modifier",
    "mo": "modifier",
    "ng": "negation",
    "nk": "noun kernel element",
    "nmc": "numerical component",
    "oa": "accusative object",
    "oc": "clausal object",
    "og": "genitive object",
    "op": "prepositional object",
    "par": "parenthetical element",
    "pd": "predicate",
    "pg": "phrasal genitive",
    "ph": "placeholder",
    "pm": "morphological particle",
    "pnc": "proper noun component",
    "rc": "relative clause",
    "re": "repeated element",
    "rs": "reported speech",
    "sb": "subject",
    "sbp": "passivized subject (PP)",
    "sp": "subject or predicate",
    "svp": "separable verb prefix",
    "uc": "unit component",
    "vo": "vocative",
    # Named Entity Recognition
    # OntoNotes 5
    # https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
    "PERSON": "People, including fictional",
    "NORP": "Nationalities or religious or political groups",
    "FACILITY": "Buildings, airports, highways, bridges, etc.",
    "FAC": "Buildings, airports, highways, bridges, etc.",
    "ORG": "Companies, agencies, institutions, etc.",
    "GPE": "Countries, cities, states",
    "LOC": "Non-GPE locations, mountain ranges, bodies of water",
    "PRODUCT": "Objects, vehicles, foods, etc. (not services)",
    "EVENT": "Named hurricanes, battles, wars, sports events, etc.",
    "WORK_OF_ART": "Titles of books, songs, etc.",
    "LAW": "Named documents made into laws.",
    "LANGUAGE": "Any named language",
    "DATE": "Absolute or relative dates or periods",
    "TIME": "Times smaller than a day",
    "PERCENT": 'Percentage, including "%"',
    "MONEY": "Monetary values, including unit",
    "QUANTITY": "Measurements, as of weight or distance",
    "ORDINAL": '"first", "second", etc.',
    "CARDINAL": "Numerals that do not fall under another type",
    # Named Entity Recognition
    # Wikipedia
    # http://www.sciencedirect.com/science/article/pii/S0004370212000276
    # https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
    "PER": "Named person or family.",
    "MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
    # https://github.com/ltgoslo/norne
    "EVT": "Festivals, cultural events, sports events, weather phenomena, wars, etc.",
    "PROD": "Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas",
    "DRV": "Words (and phrases?) that are dervied from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')",
    "GPE_LOC": "Geo-political entity, with a locative sense, e.g. 'John lives in Spain'",
    "GPE_ORG": "Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'",
}