From 3944c1d6e7e6a62824b4074545c59f183ad4479a Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 24 Oct 2017 15:51:05 +0200 Subject: [PATCH] Document lemmatizer --- website/api/_data.json | 4 +- website/api/lemmatizer.jade | 157 +++++++++++++++++++++++++++++++++++- 2 files changed, 159 insertions(+), 2 deletions(-) diff --git a/website/api/_data.json b/website/api/_data.json index d85b103dc..e9324e7e3 100644 --- a/website/api/_data.json +++ b/website/api/_data.json @@ -160,7 +160,9 @@ "lemmatizer": { "title": "Lemmatizer", - "tag": "class" + "teaser": "Assign the base forms of words.", + "tag": "class", + "source": "spacy/lemmatizer.py" }, "tagger": { diff --git a/website/api/lemmatizer.jade b/website/api/lemmatizer.jade index 9699395b1..eb061f10a 100644 --- a/website/api/lemmatizer.jade +++ b/website/api/lemmatizer.jade @@ -2,4 +2,159 @@ include ../_includes/_mixins -+under-construction +p + | The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix + | rules and lookup tables. + ++h(2, "init") Lemmatizer.__init__ + +tag method + +p Create a #[code Lemmatizer]. + ++aside-code("Example"). + from spacy.lemmatizer import Lemmatizer + lemmatizer = Lemmatizer() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code index] + +cell dict / #[code None] + +cell Inventory of lemmas in the language. + + +row + +cell #[code exceptions] + +cell dict / #[code None] + +cell Mapping of string forms to lemmas that bypass the #[code rules]. + + +row + +cell #[code rules] + +cell dict / #[code None] + +cell List of suffix rewrite rules. + + +row + +cell #[code lookup] + +cell dict / #[code None] + +cell Lookup table mapping strings to their lemmas. + + +row("foot") + +cell returns + +cell #[code Lemmatizer] + +cell The newly created object. + ++h(2, "call") Lemmatizer.__call__ + +tag method + +p Lemmatize a string. + ++aside-code("Example"). 
+ from spacy.lemmatizer import Lemmatizer + from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES + lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) + lemmas = lemmatizer(u'ducks', u'NOUN') + assert lemmas == [u'duck'] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to lemmatize, e.g. the token text. + + +row + +cell #[code univ_pos] + +cell unicode / int + +cell The token's universal part-of-speech tag. + + +row + +cell #[code morphology] + +cell dict / #[code None] + +cell + | Morphological features following the + | #[+a("http://universaldependencies.org/") Universal Dependencies] + | scheme. + + +row("foot") + +cell returns + +cell list + +cell The available lemmas for the string. + ++h(2, "lookup") Lemmatizer.lookup + +tag method + +tag-new(2) + +p + | Look up a lemma in the lookup table, if available. If no lemma is found, + | the original string is returned. Languages can provide a + | #[+a("/usage/adding-languages#lemmatizer") lookup table] via the + | #[code lemma_lookup] variable, set on the individual #[code Language] + | class. + ++aside-code("Example"). + lookup = {u'going': u'go'} + lemmatizer = Lemmatizer(lookup=lookup) + assert lemmatizer.lookup(u'going') == u'go' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to look up. + + +row("foot") + +cell returns + +cell unicode + +cell The lemma if the string was found, otherwise the original string. + ++h(2, "is_base_form") Lemmatizer.is_base_form + +tag method + +p + | Check whether we're dealing with an uninflected paradigm, so we can + | avoid lemmatization entirely. + ++aside-code("Example"). + pos = 'verb' + morph = {'VerbForm': 'inf'} + is_base_form = lemmatizer.is_base_form(pos, morph) + assert is_base_form == True + ++table(["Name", "Type", "Description"]) + +row + +cell #[code univ_pos] + +cell unicode / int + +cell The token's universal part-of-speech tag. 
+ + +row + +cell #[code morphology] + +cell dict + +cell The token's morphological features. + + +row("foot") + +cell returns + +cell bool + +cell + | Whether the token's part-of-speech tag and morphological features + | describe a base form. + ++h(2, "attributes") Attributes + ++table(["Name", "Type", "Description"]) + +row + +cell #[code index] + +cell dict / #[code None] + +cell Inventory of lemmas in the language. + + +row + +cell #[code exc] + +cell dict / #[code None] + +cell Mapping of string forms to lemmas that bypass the #[code rules]. + + +row + +cell #[code rules] + +cell dict / #[code None] + +cell List of suffix rewrite rules. + + +row + +cell #[code lookup_table] + +tag-new(2) + +cell dict / #[code None] + +cell The lemma lookup table, if available.