From 0095d5322b6b5f19cba6f5f35d6abd77a14cf69a Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 13 May 2017 18:54:10 +0200
Subject: [PATCH] Update adding languages docs

---
 website/docs/usage/adding-languages.jade | 57 ++++++++++++++++++++----
 1 file changed, 49 insertions(+), 8 deletions(-)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index 12bddb72f..ba49d1019 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -105,35 +105,35 @@ p
 +table(["File name", "Variables", "Description"])
 
     +row
-        +cell #[+src(gh()) stop_words.py]
+        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/stop_words.py")) stop_words.py]
         +cell #[code STOP_WORDS] (set)
         +cell
             | List of most common words. Matching tokens will return #[code True]
             | for #[code is_stop].
 
     +row
-        +cell #[+src(gh()) tokenizer_exceptions.py]
+        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/tokenizer_exceptions.py")) tokenizer_exceptions.py]
         +cell #[code TOKENIZER_EXCEPTIONS] (dict), #[code TOKEN_MATCH] (regex)
         +cell
             | Special-case rules for the tokenizer, for example, contractions
             | and abbreviations containing punctuation.
 
     +row
-        +cell #[+src(gh()) punctuation.py]
+        +cell #[+src(gh("spaCy", "spacy/lang/punctuation.py")) punctuation.py]
         +cell
             | #[code TOKENIZER_PREFIXES], #[code TOKENIZER_SUFFIXES],
             | #[code TOKENIZER_INFIXES] (dicts)
         +cell Regular expressions for splitting tokens, e.g. on punctuation.
 
     +row
-        +cell #[+src(gh()) lex_attrs.py]
+        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/lex_attrs.py")) lex_attrs.py]
         +cell #[code LEX_ATTRS] (dict)
         +cell
             | Functions for setting lexical attributes on tokens, e.g.
             | #[code is_punct] or #[code like_num].
 
     +row
-        +cell #[+src(gh()) tag_map.py]
+        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/tag_map.py")) tag_map.py]
         +cell #[code TAG_MAP] (dict)
         +cell
             | Dictionary mapping strings in your tag set to
@@ -143,10 +143,10 @@ p
     +row
         +cell #[+src(gh()) morph_rules.py]
         +cell #[code MORPH_RULES] (dict)
-        +cell
+        +cell Exception rules for morphological analysis of irregular words.
 
     +row
-        +cell #[+src(gh()) lemmatizer.py]
+        +cell #[+src(gh("spacy-dev-resources", "templates/new_language/lemmatizer.py")) lemmatizer.py]
         +cell #[code LOOKUP] (dict)
         +cell
             | Lookup-based lemmatization table. If more lemmatizer data is
@@ -189,7 +189,7 @@ p
     | newlines, and added as a multiline string.
 
 +aside("What does spaCy consider a stop word?")
-    | There's no particularly principal logic behind what words should be
+    | There's no particularly principled logic behind what words should be
     | added to the stop list. Make a list that you think might be useful
     | to people and is likely to be unsurprising. As a rule of thumb, words
     | that are very rare are unlikely to be useful stop words.
@@ -363,6 +363,47 @@ p
 
 +h(3, "lemmatizer") Lemmatizer
 
+p
+    | As of v2.0, spaCy supports simple lookup-based lemmatization. This is
+    | usually the quickest and easiest way to get started. The data is stored
+    | in a dictionary mapping a string to its lemma. To determine a token's
+    | lemma, spaCy simply looks it up in the table. Here's an example from
+    | the Spanish language data:
+
++code("lang/es/lemmatizer.py (excerpt)").
+    LOOKUP = {
+        "aba": "abar",
+        "ababa": "abar",
+        "ababais": "abar",
+        "ababan": "abar",
+        "ababanes": "ababán",
+        "ababas": "abar",
+        "ababoles": "ababol",
+        "ababábites": "ababábite"
+    }
+
++aside("Where can I find lemmatizer data?")
+
+p
+    | To add a lookup lemmatizer to your language, import the #[code LOOKUP]
+    | table and #[code Lemmatizer], and create a new classmethod:
+
+
++code("__init__.py (excerpt)").
+    # other imports here, plus lookup table and lookup lemmatizer
+    from .lemmatizer import LOOKUP
+    from ...lemmatizerlookup import Lemmatizer
+
+    class Xxxxx(Language):
+        lang = 'xx'
+
+        class Defaults(Language.Defaults):
+            # other language defaults here
+
+            @classmethod
+            def create_lemmatizer(cls, nlp=None):
+                return Lemmatizer(LOOKUP)
+
 +h(3, "tag-map") Tag map
 
 p
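
For intuition, here is a minimal standalone sketch of the lookup strategy the new docs section describes, using a few of the Spanish entries as sample data and a hypothetical lemmatize() helper rather than spaCy's own Lemmatizer class:

    # Standalone sketch of lookup-based lemmatization (hypothetical helper,
    # not spaCy's Lemmatizer API): the lemma is read from a table, and
    # strings without an entry fall back to themselves.
    LOOKUP = {
        "ababa": "abar",
        "ababoles": "ababol",
    }

    def lemmatize(string, lookup=LOOKUP):
        return lookup.get(string, string)

    assert lemmatize("ababa") == "abar"
    assert lemmatize("ababoles") == "ababol"
    assert lemmatize("gatos") == "gatos"  # no table entry, returned unchanged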