From 3944c1d6e7e6a62824b4074545c59f183ad4479a Mon Sep 17 00:00:00 2001
From: ines <ines@ines.io>
Date: Tue, 24 Oct 2017 15:51:05 +0200
Subject: [PATCH] Document lemmatizer

---
 website/api/_data.json      |   4 +-
 website/api/lemmatizer.jade | 157 +++++++++++++++++++++++++++++++++++-
 2 files changed, 159 insertions(+), 2 deletions(-)

diff --git a/website/api/_data.json b/website/api/_data.json
index d85b103dc..e9324e7e3 100644
--- a/website/api/_data.json
+++ b/website/api/_data.json
@@ -160,7 +160,9 @@
 
     "lemmatizer": {
         "title": "Lemmatizer",
-        "tag": "class"
+        "teaser": "Assign the base forms of words.",
+        "tag": "class",
+        "source": "spacy/lemmatizer.py"
     },
 
     "tagger": {
diff --git a/website/api/lemmatizer.jade b/website/api/lemmatizer.jade
index 9699395b1..eb061f10a 100644
--- a/website/api/lemmatizer.jade
+++ b/website/api/lemmatizer.jade
@@ -2,4 +2,159 @@
 
 include ../_includes/_mixins
 
-+under-construction
+p
+    |  The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix
+    |  rules and lookup tables.
+
++h(2, "init") Lemmatizer.__init__
+    +tag method
+
+p Create a #[code Lemmatizer].
+
++aside-code("Example").
+    from spacy.lemmatizer import Lemmatizer
+    lemmatizer = Lemmatizer()
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code index]
+        +cell dict / #[code None]
+        +cell Inventory of lemmas in the language.
+
+    +row
+        +cell #[code exceptions]
+        +cell dict / #[code None]
+        +cell Mapping of string forms to lemmas that bypass the #[code rules].
+
+    +row
+        +cell #[code rules]
+        +cell dict / #[code None]
+        +cell Dictionary of suffix rewrite rules.
+
+    +row
+        +cell #[code lookup]
+        +cell dict / #[code None]
+        +cell Lookup table mapping strings to their lemmas.
+
+    +row("foot")
+        +cell returns
+        +cell #[code Lemmatizer]
+        +cell The newly created object.
+
++h(2, "call") Lemmatizer.__call__
+    +tag method
+
+p Lemmatize a string.
+
++aside-code("Example").
+    from spacy.lemmatizer import Lemmatizer
+    from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
+    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
+    lemmas = lemmatizer(u'ducks', u'NOUN')
+    assert lemmas == [u'duck']
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to lemmatize, e.g. the token text.
+
+    +row
+        +cell #[code univ_pos]
+        +cell unicode / int
+        +cell The token's universal part-of-speech tag.
+
+    +row
+        +cell #[code morphology]
+        +cell dict / #[code None]
+        +cell
+            |  Morphological features following the
+            |  #[+a("http://universaldependencies.org/") Universal Dependencies]
+            |  scheme.
+
+    +row("foot")
+        +cell returns
+        +cell list
+        +cell The available lemmas for the string.
+
++h(2, "lookup") Lemmatizer.lookup
+    +tag method
+    +tag-new(2)
+
+p
+    |  Look up a lemma in the lookup table, if available. If no lemma is found,
+    |  the original string is returned. Languages can provide a
+    |  #[+a("/usage/adding-languages#lemmatizer") lookup table] via the
+    |  #[code lemma_lookup] variable, set on the individual #[code Language]
+    |  class.
+
++aside-code("Example").
+    lookup = {u'going': u'go'}
+    lemmatizer = Lemmatizer(lookup=lookup)
+    assert lemmatizer.lookup(u'going') == u'go'
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to look up.
+
+    +row("foot")
+        +cell returns
+        +cell unicode
+        +cell The lemma if the string was found, otherwise the original string.
+
++h(2, "is_base_form") Lemmatizer.is_base_form
+    +tag method
+
+p
+    |  Check whether we're dealing with an uninflected paradigm, so we can
+    |  avoid lemmatization entirely.
+
++aside-code("Example").
+    pos = 'verb'
+    morph = {'VerbForm': 'inf'}
+    is_base_form = lemmatizer.is_base_form(pos, morph)
+    assert is_base_form == True
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code univ_pos]
+        +cell unicode / int
+        +cell The token's universal part-of-speech tag.
+
+    +row
+        +cell #[code morphology]
+        +cell dict
+        +cell The token's morphological features.
+
+    +row("foot")
+        +cell returns
+        +cell bool
+        +cell
+            |  Whether the token's part-of-speech tag and morphological features
+            |  describe a base form.
+
++h(2, "attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code index]
+        +cell dict / #[code None]
+        +cell Inventory of lemmas in the language.
+
+    +row
+        +cell #[code exc]
+        +cell dict / #[code None]
+        +cell Mapping of string forms to lemmas that bypass the #[code rules].
+
+    +row
+        +cell #[code rules]
+        +cell dict / #[code None]
+        +cell Dictionary of suffix rewrite rules.
+
+    +row
+        +cell #[code lookup_table]
+            +tag-new(2)
+        +cell dict / #[code None]
+        +cell The lemma lookup table, if available.