Document lemmatizer

This commit is contained in:
ines 2017-10-24 15:51:05 +02:00
parent c9dc88ddfc
commit 3944c1d6e7
2 changed files with 159 additions and 2 deletions

View File

@ -160,7 +160,9 @@
"lemmatizer": { "lemmatizer": {
"title": "Lemmatizer", "title": "Lemmatizer",
"tag": "class" "teaser": "Assign the base forms of words.",
"tag": "class",
"source": "spacy/lemmatizer.py"
}, },
"tagger": { "tagger": {

View File

@ -2,4 +2,159 @@
include ../_includes/_mixins include ../_includes/_mixins
+under-construction p
| The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix
| rules and lookup tables.
+h(2, "init") Lemmatizer.__init__
+tag method
p Create a #[code Lemmatizer].
+aside-code("Example").
from spacy.lemmatizer import Lemmatizer
lemmatizer = Lemmatizer()
+table(["Name", "Type", "Description"])
+row
+cell #[code index]
+cell dict / #[code None]
+cell Inventory of lemmas in the language.
+row
+cell #[code exceptions]
+cell dict / #[code None]
+cell Mapping of string forms to lemmas that bypass the #[code rules].
+row
+cell #[code rules]
+cell dict / #[code None]
+cell List of suffix rewrite rules.
+row
+cell #[code lookup]
+cell dict / #[code None]
+cell Lookup table mapping string to their lemmas.
+row("foot")
+cell returns
+cell #[code Lemmatizer]
+cell The newly created object.
+h(2, "call") Lemmatizer.__call__
+tag method
p Lemmatize a string.
+aside-code("Example").
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
lemmas = lemmatizer(u'ducks', u'NOUN')
assert lemmas == [u'duck']
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to lemmatize, e.g. the token text.
+row
+cell #[code univ_pos]
+cell unicode / int
+cell The token's universal part-of-speech tag.
+row
+cell #[code morphology]
+cell dict / #[code None]
+cell
| Morphological features following the
| #[+a("http://universaldependencies.org/") Universal Dependencies]
| scheme.
+row("foot")
+cell returns
+cell list
+cell The available lemmas for the string.
+h(2, "lookup") Lemmatizer.lookup
+tag method
+tag-new(2)
p
| Look up a lemma in the lookup table, if available. If no lemma is found,
| the original string is returned. Languages can provide a
| #[+a("/usage/adding-languages#lemmatizer") lookup table] via the
| #[code lemma_lookup] variable, set on the individual #[code Language]
| class.
+aside-code("Example").
lookup = {u'going': u'go'}
lemmatizer = Lemmatizer(lookup=lookup)
assert lemmatizer.lookup(u'going') == u'go'
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to look up.
+row("foot")
+cell returns
+cell unicode
+cell The lemma if the string was found, otherwise the original string.
+h(2, "is_base_form") Lemmatizer.is_base_form
+tag method
p
| Check whether we're dealing with an uninflected paradigm, so we can
| avoid lemmatization entirely.
+aside-code("Example").
pos = 'verb'
morph = {'VerbForm': 'inf'}
is_base_form = lemmatizer.is_base_form(pos, morph)
assert is_base_form == True
+table(["Name", "Type", "Description"])
+row
+cell #[code univ_pos]
+cell unicode / int
+cell The token's universal part-of-speech tag.
+row
+cell #[code morphology]
+cell dict
+cell The token's morphological features.
+row("foot")
+cell returns
+cell bool
+cell
| Whether the token's part-of-speech tag and morphological features
| describe a base form.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code index]
+cell dict / #[code None]
+cell Inventory of lemmas in the language.
+row
+cell #[code exc]
+cell dict / #[code None]
+cell Mapping of string forms to lemmas that bypass the #[code rules].
+row
+cell #[code rules]
+cell dict / #[code None]
+cell List of suffix rewrite rules.
+row
+cell #[code lookup_table]
+tag-new(2)
+cell dict / #[code None]
+cell The lemma lookup table, if available.