From e9816daa6a00d3d252595007316f5b32798a33e5 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:16:33 +0200
Subject: [PATCH 1/5] Add details on syntax iterators

---
 website/docs/usage/adding-languages.jade | 35 ++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index cbde248cc..12ae0c50e 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -42,6 +42,7 @@ p
         +item #[+a("#tokenizer-exceptions") Tokenizer exceptions]
         +item #[+a("#norm-exceptions") Norm exceptions]
         +item #[+a("#lex-attrs") Lexical attributes]
+        +item #[+a("#syntax-iterators") Syntax iterators]
         +item #[+a("#lemmatizer") Lemmatizer]
         +item #[+a("#tag-map") Tag map]
         +item #[+a("#morph-rules") Morph rules]
@@ -104,6 +105,13 @@ p
         +cell dict
         +cell Attribute ID mapped to function.
 
+    +row
+        +cell #[code SYNTAX_ITERATORS]
+        +cell dict
+        +cell
+            | Iterator ID mapped to function. Currently only supports
+            | #[code 'noun_chunks'].
+
     +row
         +cell #[code LOOKUP]
         +cell dict
@@ -449,6 +457,33 @@ p
     | #[code lex_attr_getters.update(LEX_ATTRS)], only the new custom functions
     | are overwritten.
 
++h(3, "syntax-iterators") Syntax iterators
+
+p
+    | Syntax iterators are functions that compute views of a #[code Doc]
+    | object based on its syntax. At the moment, this data is only used for
+    | extracting
+    | #[+a("/docs/usage/dependency-parse#noun-chunks") noun chunks], which
+    | are available as the #[+api("doc#noun_chunks") #[code Doc.noun_chunks]]
+    | property. Because base noun phrases work differently across languages,
+    | the rules to compute them are part of the individual language's data. If
+    | a language does not include a noun chunks iterator, the property won't
+    | be available. For examples, see the existing syntax iterators:
+
++aside-code("Noun chunks example").
+    doc = nlp(u'A phrase with another phrase occurs.')
+    chunks = list(doc.noun_chunks)
+    assert chunks[0].text == "A phrase"
+    assert chunks[1].text == "another phrase"
+
++table(["Language", "Source"])
+    for lang, lang_id in {en: "English", de: "German", es: "Spanish"}
+        +row
+            +cell=lang
+            +cell
+                +src(gh("spaCy", "spacy/lang/" + lang_id + "/syntax_iterators.py"))
+                    | lang/#{lang_id}/syntax_iterators.py
+
 +h(3, "lemmatizer") Lemmatizer
 
 p

From 47d066b2933e43376087995a8ed20bc436ac820d Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:17:54 +0200
Subject: [PATCH 2/5] Add under construction

---
 website/docs/usage/adding-languages.jade | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index 12ae0c50e..fac75dca4 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -639,6 +639,8 @@ p
 
 +h(2, "vocabulary") Building the vocabulary
 
++under-construction
+
 p
     | spaCy expects that common words will be cached in a
     | #[+api("vocab") #[code Vocab]] instance. The vocabulary caches lexical
@@ -732,6 +734,8 @@ p
 
 +h(3, "word-vectors") Training the word vectors
 
++under-construction
+
 p
     | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related
     | algorithms let you train useful word similarity models from unlabelled
@@ -766,6 +770,8 @@ p
 
 +h(2, "train-tagger-parser") Training the tagger and parser
 
++under-construction
+
 p
     | You can now train the model using a corpus for your language annotated
     | with #[+a("http://universaldependencies.org/") Universal Dependencies].
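Note on the syntax iterators documented in patch 1: the table links to the real per-language source files, but as a rough, hypothetical sketch of what a lang/xx/syntax_iterators.py module could contain, something like the following would work. The dependency labels and chunking logic here are illustrative only, not spaCy's actual English rules, and the relative import assumes the file lives inside the spacy package like the other language data files.

    # Hypothetical lang/xx/syntax_iterators.py -- a minimal noun chunks iterator
    from ..symbols import NOUN, PROPN, PRON

    def noun_chunks(obj):
        """Yield (start, end, label) tuples for base noun phrases."""
        # dependency labels a noun must carry to head a chunk (illustrative set)
        labels = ['nsubj', 'dobj', 'iobj', 'pobj', 'attr', 'ROOT']
        doc = obj.doc                     # works for both Doc and Span objects
        np_deps = [doc.vocab.strings[label] for label in labels]
        np_label = doc.vocab.strings['NP']
        seen = set()                      # token indices already inside a chunk
        for word in obj:
            if word.pos not in (NOUN, PROPN, PRON):
                continue
            if word.i in seen or word.dep not in np_deps:
                continue
            # the chunk runs from the leftmost token of the word's subtree
            # up to and including the word itself
            yield word.left_edge.i, word.i + 1, np_label
            seen.update(range(word.left_edge.i, word.i + 1))

    SYNTAX_ITERATORS = {
        'noun_chunks': noun_chunks
    }

The language's defaults would then reference this dict (analogous to how the other language data files are wired up), which is what makes Doc.noun_chunks available for that language; see the linked lang/en, lang/de and lang/es sources for the real implementations.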
From a857b2b511e54795a04a5a02834dcea0a3e70309 Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:21:37 +0200
Subject: [PATCH 3/5] Update norms example

---
 website/docs/usage/adding-languages.jade | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index fac75dca4..5052eb2b7 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -350,8 +350,9 @@ p
     | word exists, norms should always be in lowercase.
 
 +aside-code("Accessing norms").
-    doc = nlp(u"I can't")
-    assert [t.norm_ for t in doc] == ['i', 'can', 'not']
+    doc = nlp(u"I'm gonna")
+    norms = [token.norm_ for token in doc]
+    assert norms == ['i', 'am', 'going', 'to']
 
 p
     | spaCy usually tries to normalise words with different spellings to a single,

From f8e93b6d0a346e9a53dac2e70e5f1712d40d6e1e Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:24:29 +0200
Subject: [PATCH 4/5] Update norms example

---
 website/docs/usage/adding-languages.jade | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index 5052eb2b7..cc90db505 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -349,10 +349,12 @@ p
     | a token's norm equals its lowercase text. If the lowercase spelling of a
     | word exists, norms should always be in lowercase.
 
-+aside-code("Accessing norms").
++aside-code("Norms vs. lemmas").
     doc = nlp(u"I'm gonna")
     norms = [token.norm_ for token in doc]
+    lemmas = [token.lemma_ for token in doc]
     assert norms == ['i', 'am', 'going', 'to']
+    assert lemmas == ['i', 'be', 'go', 'to']
 
 p
     | spaCy usually tries to normalise words with different spellings to a single,

From 505d43b832cb64028b043461c621b24fa6c188af Mon Sep 17 00:00:00 2001
From: ines
Date: Sun, 4 Jun 2017 23:33:26 +0200
Subject: [PATCH 5/5] Update norms example

---
 website/docs/usage/adding-languages.jade | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade
index cc90db505..a0b77ad17 100644
--- a/website/docs/usage/adding-languages.jade
+++ b/website/docs/usage/adding-languages.jade
@@ -350,11 +350,11 @@ p
     | word exists, norms should always be in lowercase.
 
 +aside-code("Norms vs. lemmas").
-    doc = nlp(u"I'm gonna")
+    doc = nlp(u"I'm gonna realise")
     norms = [token.norm_ for token in doc]
     lemmas = [token.lemma_ for token in doc]
-    assert norms == ['i', 'am', 'going', 'to']
-    assert lemmas == ['i', 'be', 'go', 'to']
+    assert norms == ['i', 'am', 'going', 'to', 'realize']
+    assert lemmas == ['i', 'be', 'go', 'to', 'realise']
 
 p
     | spaCy usually tries to normalise words with different spellings to a single,
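Note on the norms example updated in patches 3-5: the norms asserted in the aside come from the language data rather than from a statistical model. A hypothetical sketch of the two kinds of entries that could produce them is shown below; the exact entries shipped in spaCy's English data may differ.

    # Illustrative entries only -- not copied from spaCy's English data
    from spacy.symbols import ORTH, NORM

    # tokenizer_exceptions.py: "gonna" is split into two tokens, each carrying
    # its own norm, which is why the example yields 'going' and 'to'
    TOKENIZER_EXCEPTIONS = {
        "gonna": [
            {ORTH: "gon", NORM: "going"},
            {ORTH: "na", NORM: "to"}
        ]
    }

    # norm_exceptions.py: spelling variants mapped to a single norm, which is
    # why "realise" is normalised to 'realize' while its lemma keeps the
    # original spelling
    NORM_EXCEPTIONS = {
        "realise": "realize",
        "colour": "color"
    }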