From a04b5be1b2b4cc01c1c962294077a842ab1d0e59 Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Wed, 3 May 2017 17:01:53 +0200
Subject: [PATCH] Add glossary for annotation scheme (closes #1034)

Can be imported as explain from spacy.glossary, or called as
spacy.explain(term)
---
 spacy/__init__.py |   1 +
 spacy/glossary.py | 294 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 295 insertions(+)
 create mode 100644 spacy/glossary.py

diff --git a/spacy/__init__.py b/spacy/__init__.py
index f5912e13e..2308ce7e4 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from . import util
 from .deprecated import resolve_model_name
 from .cli.info import info
+from .glossary import explain
 
 from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja
 
diff --git a/spacy/glossary.py b/spacy/glossary.py
new file mode 100644
index 000000000..9df26a6af
--- /dev/null
+++ b/spacy/glossary.py
@@ -0,0 +1,294 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+def explain(term):
+    if term in GLOSSARY:
+        return GLOSSARY[term]
+
+
+GLOSSARY = {
+    # POS tags
+    # Universal POS Tags
+    # http://universaldependencies.org/u/pos/
+
+    'ADJ':          'adjective',
+    'ADP':          'adposition',
+    'ADV':          'adverb',
+    'AUX':          'auxiliary',
+    'CONJ':         'conjunction',
+    'CCONJ':        'coordinating conjunction',
+    'DET':          'determiner',
+    'INTJ':         'interjection',
+    'NOUN':         'noun',
+    'NUM':          'numeral',
+    'PART':         'particle',
+    'PRON':         'pronoun',
+    'PROPN':        'proper noun',
+    'PUNCT':        'punctuation',
+    'SCONJ':        'subordinating conjunction',
+    'SYM':          'symbol',
+    'VERB':         'verb',
+    'X':            'other',
+    'EOL':          'end of line',
+    'SPACE':        'space',
+
+
+    # POS tags (English)
+    # OntoNotes 5 / Penn Treebank
+    # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
+
+    '.':            'punctuation mark, sentence closer',
+    ',':            'punctuation mark, comma',
+    '-LRB-':        'left round bracket',
+    '-RRB-':        'right round bracket',
+    '``':           'opening quotation mark',
+    '""':           'closing quotation mark',
+    "''":           'closing quotation mark',
+    ':':            'punctuation mark, colon or ellipsis',
+    '$':            'symbol, currency',
+    '#':            'symbol, number sign',
+    'AFX':          'affix',
+    'CC':           'conjunction, coordinating',
+    'CD':           'cardinal number',
+    'DT':           'determiner',
+    'EX':           'existential there',
+    'FW':           'foreign word',
+    'HYPH':         'punctuation mark, hyphen',
+    'IN':           'conjunction, subordinating or preposition',
+    'JJ':           'adjective',
+    'JJR':          'adjective, comparative',
+    'JJS':          'adjective, superlative',
+    'LS':           'list item marker',
+    'MD':           'verb, modal auxillary',
+    'NIL':          'missing tag',
+    'NN':           'noun, singular or mass',
+    'NNP':          'noun, proper singular',
+    'NNPS':         'noun, proper plural',
+    'NNS':          'noun, plural',
+    'PDT':          'predeterminer',
+    'POS':          'possessive ending',
+    'PRP':          'pronoun, personal',
+    'PRP$':         'pronoun, possessive',
+    'RB':           'adverb',
+    'RBR':          'adverb, comparative',
+    'RBS':          'adverb, superlative',
+    'RP':           'adverb, particle',
+    'TO':           'infinitival to',
+    'UH':           'interjection',
+    'VB':           'verb, base form',
+    'VBD':          'verb, past tense',
+    'VBG':          'verb, gerund or present participle',
+    'VBN':          'verb, past participle',
+    'VBP':          'verb, non-3rd person singular present',
+    'VBZ':          'verb, 3rd person singular present',
+    'WDT':          'wh-determiner',
+    'WP':           'wh-pronoun, personal',
+    'WP$':          'wh-pronoun, possessive',
+    'WRB':          'wh-adverb',
+    'SP':           'space',
+    'ADD':          'email',
+    'NFP':          'superfluous punctuation',
+    'GW':           'additional word in multi-word expression',
+    'XX':           'unknown',
+    'BES':          'auxillary "be"',
+    'HVS':          'forms of "have"',
+
+
+    # POS Tags (German)
+    # TIGER Treebank
+    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
+
+    '$(':           'other sentence-internal punctuation mark',
+    '$,':           'comma',
+    '$.':           'sentence-final punctuation mark',
+    'ADJA':         'adjective, attributive',
+    'ADJD':         'adjective, adverbial or predicative',
+    'APPO':         'postposition',
+    'APRP':         'preposition; circumposition left',
+    'APPRART':      'preposition with article',
+    'APZR':         'circumposition right',
+    'ART':          'definite or indefinite article',
+    'CARD':         'cardinal number',
+    'FM':           'foreign language material',
+    'ITJ':          'interjection',
+    'KOKOM':        'comparative conjunction ',
+    'KON':          'coordinate conjunction',
+    'KOUI':         'subordinate conjunction with "zu" and infinitive',
+    'KOUS':         'subordinate conjunction with sentence',
+    'NE':           'proper noun',
+    'NNE':          'proper noun',
+    'PAV':          'pronominal adverb',
+    'PROAV':        'pronominal adverb',
+    'PDAT':         'attributive demonstrative pronoun',
+    'PDS':          'substituting demonstrative pronoun',
+    'PIAT':         'attributive indefinite pronoun without determiner',
+    'PIDAT':        'attributive indefinite pronoun with determiner',
+    'PIS':          'substituting indefinite pronoun',
+    'PPER':         'non-reflexive personal pronoun',
+    'PPOSAT':       'attributive possessive pronoun',
+    'PPOSS':        'substituting possessive pronoun',
+    'PRELAT':       'attributive relative pronoun',
+    'PRELS':        'substituting relative pronoun',
+    'PRF':          'reflexive personal pronoun',
+    'PTKA':         'particle with adjective or adverb',
+    'PTKANT':       'answer particle',
+    'PTKNEG':       'negative particle',
+    'PTKVZ':        'separable verbal particle',
+    'PTKZU':        '"zu" before infinitive',
+    'PWAT':         'attributive interrogative pronoun',
+    'PWAV':         'adverbial interrogative or relative pronoun',
+    'PWS':          'substituting interrogative pronoun',
+    'TRUNC':        'word remnant',
+    'VAFIN':        'finite verb, auxiliary',
+    'VAIMP':        'imperative, auxiliary',
+    'VAINF':        'infinitive, auxiliary',
+    'VAPP':         'perfect participle, auxiliary',
+    'VMFIN':        'finite verb, modal',
+    'VMINF':        'infinitive, modal ',
+    'VMPP':         'perfect participle, modal ',
+    'VVFIN':        'finite verb, full',
+    'VVIMP':        'imperative, full',
+    'VVINF':        'infinitive, full',
+    'VVIZU':        'infinitive with "zu", full',
+    'VVPP':         'perfect participle, full',
+    'XY':           'non-word containing non-letter',
+
+
+    # Noun chunks
+
+    'NP':           'noun phrase',
+    'PP':           'prepositional phrase',
+    'VP':           'verb phrase',
+    'ADVP':         'adverb phrase',
+    'ADJP':         'adjective phrase',
+    'SBAR':         'subordinating conjunction',
+    'PRT':          'particle',
+    'PNP':          'prepositional noun phrase',
+
+
+    # Dependency Labels (English)
+    # ClearNLP / Universal Dependencies
+    # https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md
+
+    'acomp':        'adjectival complement',
+    'advcl':        'adverbial clause modifier',
+    'advmod':       'adverbial modifier',
+    'agent':        'agent',
+    'amod':         'adjectival modifier',
+    'appos':        'appositional modifier',
+    'attr':         'attribute',
+    'aux':          'auxiliary',
+    'auxpass':      'auxiliary (passive)',
+    'cc':           'coordinating conjunction',
+    'ccomp':        'clausal complement',
+    'complm':       'complementizer',
+    'conj':         'conjunct',
+    'cop':          'copula',
+    'csubj':        'clausal subject',
+    'csubjpass':    'clausal subject (passive)',
+    'dep':          'unclassified dependent',
+    'det':          'determiner',
+    'dobj':         'direct object',
+    'expl':         'expletive',
+    'hmod':         'modifier in hyphenation',
+    'hyph':         'hyphen',
+    'infmod':       'infinitival modifier',
+    'intj':         'interjection',
+    'iobj':         'indirect object',
+    'mark':         'marker',
+    'meta':         'meta modifier',
+    'neg':          'negation modifier',
+    'nmod':         'modifier of nominal',
+    'nn':           'noun compound modifier',
+    'npadvmod':     'noun phrase as adverbial modifier',
+    'nsubj':        'nominal subject',
+    'nsubjpass':    'nominal subject (passive)',
+    'num':          'number modifier',
+    'number':       'number compound modifier',
+    'oprd':         'object predicate',
+    'obj':          'object',
+    'obl':          'oblique nominal',
+    'parataxis':    'parataxis',
+    'partmod':      'participal modifier',
+    'pcomp':        'complement of preposition',
+    'pobj':         'object of preposition',
+    'poss':         'possession modifier',
+    'possessive':   'possessive modifier',
+    'preconj':      'pre-correlative conjunction',
+    'prep':         'prepositional modifier',
+    'prt':          'particle',
+    'punct':        'punctuation',
+    'quantmod':     'modifier of quantifier',
+    'rcmod':        'relative clause modifier',
+    'root':         'root',
+    'xcomp':        'open clausal complement',
+
+
+    # Dependency labels (German)
+    # TIGER Treebank
+    # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf
+    # currently missing: 'cc' (comparative complement) because of conflict
+    # with English labels
+
+    'ac':           'adpositional case marker',
+    'adc':          'adjective component',
+    'ag':           'genitive attribute',
+    'ams':          'measure argument of adjective',
+    'app':          'apposition',
+    'avc':          'adverbial phrase component',
+    'cd':           'coordinating conjunction',
+    'cj':           'conjunct',
+    'cm':           'comparative conjunction',
+    'cp':           'complementizer',
+    'cvc':          'collocational verb construction',
+    'da':           'dative',
+    'dh':           'discourse-level head',
+    'dm':           'discourse marker',
+    'ep':           'expletive es',
+    'hd':           'head',
+    'ju':           'junctor',
+    'mnr':          'postnominal modifier',
+    'mo':           'modifier',
+    'ng':           'negation',
+    'nk':           'noun kernel element',
+    'nmc':          'numerical component',
+    'oa':           'accusative object',
+    'oa':           'second accusative object',
+    'oc':           'clausal object',
+    'og':           'genitive object',
+    'op':           'prepositional object',
+    'par':          'parenthetical element',
+    'pd':           'predicate',
+    'pg':           'phrasal genitive',
+    'ph':           'placeholder',
+    'pm':           'morphological particle',
+    'pnc':          'proper noun component',
+    'rc':           'relative clause',
+    're':           'repeated element',
+    'rs':           'reported speech',
+    'sb':           'subject',
+
+
+    # Named Entity Recognition
+    # OntoNotes 5
+    # https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
+
+    'PERSON':       'People, including fictional',
+    'NORP':         'Nationalities or religious or political groups',
+    'FACILITY':     'Buildings, airports, highways, bridges, etc.',
+    'ORG':          'Companies, agencies, institutions, etc.',
+    'GPE':          'Countries, cities, states',
+    'LOC':          'Non-GPE locations, mountain ranges, bodies of water',
+    'PRODUCT':      'Objects, vehicles, foods, etc. (not services)',
+    'EVENT':        'Named hurricanes, battles, wars, sports events, etc.',
+    'WORK_OF_ART':  'Titles of books, songs, etc.',
+    'LANGUAGE':     'Any named language',
+    'DATE':         'Absolute or relative dates or periods',
+    'TIME':         'Times smaller than a day',
+    'PERCENT':      'Percentage, including "%"',
+    'MONEY':        'Monetary values, including unit',
+    'QUANTITY':     'Measurements, as of weight or distance',
+    'ORDINAL':      '"first", "second", etc.',
+    'CARDINAL':     'Numerals that do not fall under another type'
+}