From a04b5be1b2b4cc01c1c962294077a842ab1d0e59 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 17:01:53 +0200 Subject: [PATCH] Add glossary for annotation scheme (closes #1034) Can be imported as explain from spacy.glossary, or called as spacy.explain(term) --- spacy/__init__.py | 1 + spacy/glossary.py | 294 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 spacy/glossary.py diff --git a/spacy/__init__.py b/spacy/__init__.py index f5912e13e..2308ce7e4 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from . import util from .deprecated import resolve_model_name from .cli.info import info +from .glossary import explain from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja diff --git a/spacy/glossary.py b/spacy/glossary.py new file mode 100644 index 000000000..9df26a6af --- /dev/null +++ b/spacy/glossary.py @@ -0,0 +1,294 @@ +# coding: utf8 +from __future__ import unicode_literals + + +def explain(term): + if term in GLOSSARY: + return GLOSSARY[term] + + +GLOSSARY = { + # POS tags + # Universal POS Tags + # http://universaldependencies.org/u/pos/ + + 'ADJ': 'adjective', + 'ADP': 'adposition', + 'ADV': 'adverb', + 'AUX': 'auxiliary', + 'CONJ': 'conjunction', + 'CCONJ': 'coordinating conjunction', + 'DET': 'determiner', + 'INTJ': 'interjection', + 'NOUN': 'noun', + 'NUM': 'numeral', + 'PART': 'particle', + 'PRON': 'pronoun', + 'PROPN': 'proper noun', + 'PUNCT': 'punctuation', + 'SCONJ': 'subordinating conjunction', + 'SYM': 'symbol', + 'VERB': 'verb', + 'X': 'other', + 'EOL': 'end of line', + 'SPACE': 'space', + + + # POS tags (English) + # OntoNotes 5 / Penn Treebank + # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html + + '.': 'punctuation mark, sentence closer', + ',': 'punctuation mark, comma', + '-LRB-': 'left round bracket', + '-RRB-': 'right round bracket', + '``': 'opening quotation mark', + '""': 'closing quotation mark', + "''": 'closing quotation mark', + ':': 'punctuation mark, colon or ellipsis', + '$': 'symbol, currency', + '#': 'symbol, number sign', + 'AFX': 'affix', + 'CC': 'conjunction, coordinating', + 'CD': 'cardinal number', + 'DT': 'determiner', + 'EX': 'existential there', + 'FW': 'foreign word', + 'HYPH': 'punctuation mark, hyphen', + 'IN': 'conjunction, subordinating or preposition', + 'JJ': 'adjective', + 'JJR': 'adjective, comparative', + 'JJS': 'adjective, superlative', + 'LS': 'list item marker', + 'MD': 'verb, modal auxillary', + 'NIL': 'missing tag', + 'NN': 'noun, singular or mass', + 'NNP': 'noun, proper singular', + 'NNPS': 'noun, proper plural', + 'NNS': 'noun, plural', + 'PDT': 'predeterminer', + 'POS': 'possessive ending', + 'PRP': 'pronoun, personal', + 'PRP$': 'pronoun, possessive', + 'RB': 'adverb', + 'RBR': 'adverb, comparative', + 'RBS': 'adverb, superlative', + 'RP': 'adverb, particle', + 'TO': 'infinitival to', + 'UH': 'interjection', + 'VB': 'verb, base form', + 'VBD': 'verb, past tense', + 'VBG': 'verb, gerund or present participle', + 'VBN': 'verb, past participle', + 'VBP': 'verb, non-3rd person singular present', + 'VBZ': 'verb, 3rd person singular present', + 'WDT': 'wh-determiner', + 'WP': 'wh-pronoun, personal', + 'WP$': 'wh-pronoun, possessive', + 'WRB': 'wh-adverb', + 'SP': 'space', + 'ADD': 'email', + 'NFP': 'superfluous punctuation', + 'GW': 'additional word in multi-word expression', + 'XX': 'unknown', + 'BES': 'auxillary "be"', + 'HVS': 'forms of "have"', + + + # POS Tags (German) + # TIGER Treebank + # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf + + '$(': 'other sentence-internal punctuation mark', + '$,': 'comma', + '$.': 'sentence-final punctuation mark', + 'ADJA': 'adjective, attributive', + 'ADJD': 'adjective, adverbial or predicative', + 'APPO': 'postposition', + 'APRP': 'preposition; circumposition left', + 'APPRART': 'preposition with article', + 'APZR': 'circumposition right', + 'ART': 'definite or indefinite article', + 'CARD': 'cardinal number', + 'FM': 'foreign language material', + 'ITJ': 'interjection', + 'KOKOM': 'comparative conjunction ', + 'KON': 'coordinate conjunction', + 'KOUI': 'subordinate conjunction with "zu" and infinitive', + 'KOUS': 'subordinate conjunction with sentence', + 'NE': 'proper noun', + 'NNE': 'proper noun', + 'PAV': 'pronominal adverb', + 'PROAV': 'pronominal adverb', + 'PDAT': 'attributive demonstrative pronoun', + 'PDS': 'substituting demonstrative pronoun', + 'PIAT': 'attributive indefinite pronoun without determiner', + 'PIDAT': 'attributive indefinite pronoun with determiner', + 'PIS': 'substituting indefinite pronoun', + 'PPER': 'non-reflexive personal pronoun', + 'PPOSAT': 'attributive possessive pronoun', + 'PPOSS': 'substituting possessive pronoun', + 'PRELAT': 'attributive relative pronoun', + 'PRELS': 'substituting relative pronoun', + 'PRF': 'reflexive personal pronoun', + 'PTKA': 'particle with adjective or adverb', + 'PTKANT': 'answer particle', + 'PTKNEG': 'negative particle', + 'PTKVZ': 'separable verbal particle', + 'PTKZU': '"zu" before infinitive', + 'PWAT': 'attributive interrogative pronoun', + 'PWAV': 'adverbial interrogative or relative pronoun', + 'PWS': 'substituting interrogative pronoun', + 'TRUNC': 'word remnant', + 'VAFIN': 'finite verb, auxiliary', + 'VAIMP': 'imperative, auxiliary', + 'VAINF': 'infinitive, auxiliary', + 'VAPP': 'perfect participle, auxiliary', + 'VMFIN': 'finite verb, modal', + 'VMINF': 'infinitive, modal ', + 'VMPP': 'perfect participle, modal ', + 'VVFIN': 'finite verb, full', + 'VVIMP': 'imperative, full', + 'VVINF': 'infinitive, full', + 'VVIZU': 'infinitive with "zu", full', + 'VVPP': 'perfect participle, full', + 'XY': 'non-word containing non-letter', + + + # Noun chunks + + 'NP': 'noun phrase', + 'PP': 'prepositional phrase', + 'VP': 'verb phrase', + 'ADVP': 'adverb phrase', + 'ADJP': 'adjective phrase', + 'SBAR': 'subordinating conjunction', + 'PRT': 'particle', + 'PNP': 'prepositional noun phrase', + + + # Dependency Labels (English) + # ClearNLP / Universal Dependencies + # https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md + + 'acomp': 'adjectival complement', + 'advcl': 'adverbial clause modifier', + 'advmod': 'adverbial modifier', + 'agent': 'agent', + 'amod': 'adjectival modifier', + 'appos': 'appositional modifier', + 'attr': 'attribute', + 'aux': 'auxiliary', + 'auxpass': 'auxiliary (passive)', + 'cc': 'coordinating conjunction', + 'ccomp': 'clausal complement', + 'complm': 'complementizer', + 'conj': 'conjunct', + 'cop': 'copula', + 'csubj': 'clausal subject', + 'csubjpass': 'clausal subject (passive)', + 'dep': 'unclassified dependent', + 'det': 'determiner', + 'dobj': 'direct object', + 'expl': 'expletive', + 'hmod': 'modifier in hyphenation', + 'hyph': 'hyphen', + 'infmod': 'infinitival modifier', + 'intj': 'interjection', + 'iobj': 'indirect object', + 'mark': 'marker', + 'meta': 'meta modifier', + 'neg': 'negation modifier', + 'nmod': 'modifier of nominal', + 'nn': 'noun compound modifier', + 'npadvmod': 'noun phrase as adverbial modifier', + 'nsubj': 'nominal subject', + 'nsubjpass': 'nominal subject (passive)', + 'num': 'number modifier', + 'number': 'number compound modifier', + 'oprd': 'object predicate', + 'obj': 'object', + 'obl': 'oblique nominal', + 'parataxis': 'parataxis', + 'partmod': 'participal modifier', + 'pcomp': 'complement of preposition', + 'pobj': 'object of preposition', + 'poss': 'possession modifier', + 'possessive': 'possessive modifier', + 'preconj': 'pre-correlative conjunction', + 'prep': 'prepositional modifier', + 'prt': 'particle', + 'punct': 'punctuation', + 'quantmod': 'modifier of quantifier', + 'rcmod': 'relative clause modifier', + 'root': 'root', + 'xcomp': 'open clausal complement', + + + # Dependency labels (German) + # TIGER Treebank + # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf + # currently missing: 'cc' (comparative complement) because of conflict + # with English labels + + 'ac': 'adpositional case marker', + 'adc': 'adjective component', + 'ag': 'genitive attribute', + 'ams': 'measure argument of adjective', + 'app': 'apposition', + 'avc': 'adverbial phrase component', + 'cd': 'coordinating conjunction', + 'cj': 'conjunct', + 'cm': 'comparative conjunction', + 'cp': 'complementizer', + 'cvc': 'collocational verb construction', + 'da': 'dative', + 'dh': 'discourse-level head', + 'dm': 'discourse marker', + 'ep': 'expletive es', + 'hd': 'head', + 'ju': 'junctor', + 'mnr': 'postnominal modifier', + 'mo': 'modifier', + 'ng': 'negation', + 'nk': 'noun kernel element', + 'nmc': 'numerical component', + 'oa': 'accusative object', + 'oa': 'second accusative object', + 'oc': 'clausal object', + 'og': 'genitive object', + 'op': 'prepositional object', + 'par': 'parenthetical element', + 'pd': 'predicate', + 'pg': 'phrasal genitive', + 'ph': 'placeholder', + 'pm': 'morphological particle', + 'pnc': 'proper noun component', + 'rc': 'relative clause', + 're': 'repeated element', + 'rs': 'reported speech', + 'sb': 'subject', + + + # Named Entity Recognition + # OntoNotes 5 + # https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf + + 'PERSON': 'People, including fictional', + 'NORP': 'Nationalities or religious or political groups', + 'FACILITY': 'Buildings, airports, highways, bridges, etc.', + 'ORG': 'Companies, agencies, institutions, etc.', + 'GPE': 'Countries, cities, states', + 'LOC': 'Non-GPE locations, mountain ranges, bodies of water', + 'PRODUCT': 'Objects, vehicles, foods, etc. (not services)', + 'EVENT': 'Named hurricanes, battles, wars, sports events, etc.', + 'WORK_OF_ART': 'Titles of books, songs, etc.', + 'LANGUAGE': 'Any named language', + 'DATE': 'Absolute or relative dates or periods', + 'TIME': 'Times smaller than a day', + 'PERCENT': 'Percentage, including "%"', + 'MONEY': 'Monetary values, including unit', + 'QUANTITY': 'Measurements, as of weight or distance', + 'ORDINAL': '"first", "second", etc.', + 'CARDINAL': 'Numerals that do not fall under another type' +}