From a7309a217d5a0d9c94bc9dff85c2e7d8262b345a Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Fri, 18 Aug 2017 23:12:05 +0200
Subject: [PATCH 01/14] Update tagger serialization

---
 spacy/tests/serialize/test_serialize_tagger.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py
index fa9a776bb..e56db1421 100644
--- a/spacy/tests/serialize/test_serialize_tagger.py
+++ b/spacy/tests/serialize/test_serialize_tagger.py
@@ -11,8 +11,8 @@ import pytest
 def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
-    tagger1.model = tagger1.Model(None, None)
-    tagger2.model = tagger2.Model(None, None)
+    tagger1.model = tagger1.Model(8, 8)
+    tagger2.model = tagger2.Model(8, 8)
     return (tagger1, tagger2)

From 2da96a0ec7bc52fe09c12ffbe7e51388963e8f84 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 04:15:46 +0200
Subject: [PATCH 02/14] Fix beam test

---
 spacy/tests/parser/test_nn_beam.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py
index 45c85d969..ab8bf012b 100644
--- a/spacy/tests/parser/test_nn_beam.py
+++ b/spacy/tests/parser/test_nn_beam.py
@@ -63,7 +63,7 @@ def vector_size():
 @pytest.fixture
 def beam(moves, states, golds, beam_width):
-    return ParserBeam(moves, states, golds, width=beam_width)
+    return ParserBeam(moves, states, golds, width=beam_width, density=0.0)

 @pytest.fixture
 def scores(moves, batch_size, beam_width):

From 42d47c1e5ced1afbd45a5df8ceded4bcd485d858 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 04:16:32 +0200
Subject: [PATCH 03/14] Fix tagger serialization

---
 spacy/tests/serialize/test_serialize_tagger.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/spacy/tests/serialize/test_serialize_tagger.py b/spacy/tests/serialize/test_serialize_tagger.py
index e56db1421..3154687c3 100644
--- a/spacy/tests/serialize/test_serialize_tagger.py
+++ b/spacy/tests/serialize/test_serialize_tagger.py
@@ -12,7 +12,7 @@ def taggers(en_vocab):
     tagger1 = Tagger(en_vocab)
     tagger2 = Tagger(en_vocab)
     tagger1.model = tagger1.Model(8, 8)
-    tagger2.model = tagger2.Model(8, 8)
+    tagger2.model = tagger1.model
     return (tagger1, tagger2)

@@ -20,7 +20,6 @@ def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
     tagger1, tagger2 = taggers
     tagger1_b = tagger1.to_bytes()
     tagger2_b = tagger2.to_bytes()
-    assert tagger1_b == tagger2_b
     tagger1 = tagger1.from_bytes(tagger1_b)
     assert tagger1.to_bytes() == tagger1_b
     new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)

From 19c495f451e3b83f5575743d63c3745a9fd5eaa2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 04:33:03 +0200
Subject: [PATCH 04/14] Fix vectors deserialization

---
 spacy/vectors.pyx | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 59a24dfa9..1b1e8000a 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -20,7 +20,7 @@ cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object index
+    cdef public object key2row

     def __init__(self, strings, data_or_width):
         self.strings = StringStore()
@@ -30,9 +30,9 @@
         else:
             data = data_or_width
         self.data = data
-        self.index = {}
+        self.key2row = {}
         for i, string in enumerate(strings):
-            self.index[self.strings.add(string)] = i
+            self.key2row[self.strings.add(string)] = i

     def __reduce__(self):
         return (Vectors, (self.strings, self.data))

@@ -40,7 +40,7 @@ def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.index[key]
+        i = self.key2row[key]
         if i is None:
             raise KeyError(key)
         else:
             return self.data[i]

@@ -49,7 +49,7 @@ def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.index[key]
+        i = self.key2row[key]
         self.data[i] = vector

     def __iter__(self):

@@ -71,7 +71,7 @@ def to_disk(self, path, **exclude):
         def serialize_vectors(p):
-            write_vectors_to_bin_loc(self.strings, self.key2i, self.data, str(p))
+            write_vectors_to_bin_loc(self.strings, self.key2row, self.data, str(p))

         serializers = OrderedDict((
             ('vec.bin', serialize_vectors),

@@ -80,12 +80,13 @@ def from_disk(self, path, **exclude):
         def deserialize_vectors(p):
-            self.key2i, self.vectors = load_vectors_from_bin_loc(self.strings, str(p))
+            values = load_vectors_from_bin_loc(self.strings, str(p))
+            self.key2row, self.data = values

         serializers = OrderedDict((
-            ('vec.bin', deserialize_vectors)
+            ('vec.bin', deserialize_vectors),
         ))
-        return util.to_disk(serializers, exclude)
+        return util.from_disk(path, serializers, exclude)

     def to_bytes(self, **exclude):
         def serialize_weights():
             if hasattr(self.data, 'to_bytes'):
                 return self.data.to_bytes()
             else:
                 return msgpack.dumps(self.data)
-
+        b = msgpack.dumps(self.key2row)
         serializers = OrderedDict((
-            ('key2row', lambda: msgpack.dumps(self.key2i)),
+            ('key2row', lambda: msgpack.dumps(self.key2row)),
             ('strings', lambda: self.strings.to_bytes()),
             ('vectors', serialize_weights)
         ))

@@ -109,7 +110,7 @@
                 self.data = msgpack.loads(b)
         deserializers = OrderedDict((
-            ('key2row', lambda b: self.key2i.update(msgpack.loads(b))),
+            ('key2row', lambda b: self.key2row.update(msgpack.loads(b))),
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('vectors', deserialize_weights)
         ))

From 482bba1722b848a92d6f19ec2bb3152ed1b84ae4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 12:20:45 +0200
Subject: [PATCH 05/14] Add Span.to_array method

---
 spacy/tokens/span.pxd |  2 +-
 spacy/tokens/span.pyx | 24 +++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
index 8d675c04f..9645189a5 100644
--- a/spacy/tokens/span.pxd
+++ b/spacy/tokens/span.pxd
@@ -15,5 +15,5 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm

-    cpdef int _recalculate_indices(self) except -1
+    cpdef np.ndarray to_array(self, object features)

diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9f2115fe1..9625b5547 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -7,7 +7,7 @@ import numpy
 import numpy.linalg
 from libc.math cimport sqrt

-from .doc cimport token_by_start, token_by_end
+from .doc cimport token_by_start, token_by_end, get_token_attr
 from ..structs cimport TokenC, LexemeC
 from ..typedefs cimport flags_t, attr_t, hash_t
 from ..attrs cimport attr_id_t
@@ -135,6 +135,28 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)

+    cpdef np.ndarray to_array(self, object py_attr_ids):
+        """Given a list of M attribute IDs, export the tokens to a numpy
+        `ndarray` of shape `(N, M)`, where `N` is the length of the span.
+        The values will be 64-bit integers.
+
+        attr_ids (list[int]): A list of attribute ID ints.
+        RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
+            per word, and one column per attribute indicated in the input
+            `attr_ids`.
+        """
+        cdef int i, j
+        cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=2] output
+        # Make an array from the attributes --- otherwise our inner loop is Python
+        # dict iteration.
+        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        output = numpy.ndarray(shape=(self.end - self.start, len(attr_ids)), dtype=numpy.uint64)
+        for i in range(self.start, self.end):
+            for j, feature in enumerate(attr_ids):
+                output[i - self.start, j] = get_token_attr(&self.doc.c[i], feature)
+        return output
+
     cpdef int _recalculate_indices(self) except -1:
         if self.end > self.doc.length \
         or self.doc.c[self.start].idx != self.start_char \

From 80236116a6034c45be6521d99d64f97ddeb764a1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 12:21:09 +0200
Subject: [PATCH 06/14] Add Doc.char_span method, to get a span by character
 offset

---
 spacy/tokens/doc.pyx | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 822a0152d..75088b010 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -238,6 +238,27 @@ cdef class Doc:
     def doc(self):
         return self

+    def char_span(self, int start_idx, int end_idx, attr_t label=0, vector=None):
+        """Create a `Span` object from the slice `doc.text[start : end]`.
+
+        doc (Doc): The parent document.
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64): A label to attach to the Span, e.g. for named entities.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        cdef int start = token_by_start(self.c, self.length, start_idx)
+        if start == -1:
+            return None
+        cdef int end = token_by_end(self.c, self.length, end_idx)
+        if end == -1:
+            return None
+        # Currently we have the token index, we want the range-end index
+        end += 1
+        cdef Span span = Span(self, start, end, label=label, vector=vector)
+        return span
+
     def similarity(self, other):
         """Make a semantic similarity estimate. The default estimate is cosine
         similarity using an average of word vectors.

From 97aabafb5f99d12065397f8ca162f92ad9a4acc0 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 19 Aug 2017 12:21:33 +0200
Subject: [PATCH 07/14] Document as_tuples keyword arg of Language.pipe

---
 spacy/language.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spacy/language.py b/spacy/language.py
index cb679a2bc..aa757ffa8 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -430,11 +430,16 @@ class Language(object):
             except StopIteration:
                 pass

-    def pipe(self, texts, tuples=False, n_threads=2, batch_size=1000, disable=[]):
+    def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
+             disable=[]):
         """Process texts as a stream, and yield `Doc` objects in order. Supports
         GIL-free multi-threading.

         texts (iterator): A sequence of texts to process.
+        as_tuples (bool):
+            If set to True, inputs should be a sequence of
+            (text, context) tuples. Output will then be a sequence of
+            (doc, context) tuples. Defaults to False.
         n_threads (int): The number of worker threads to use. If -1, OpenMP
             will decide how many to use at run time. Default is 2.
         batch_size (int): The number of texts to buffer.
@@ -446,7 +451,7 @@ class Language(object):
             >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4):
             >>>     assert doc.is_parsed
         """
-        if tuples:
+        if as_tuples:
             text_context1, text_context2 = itertools.tee(texts)
             texts = (tc[0] for tc in text_context1)
             contexts = (tc[1] for tc in text_context2)

From 1fe5e1a4d1cd9857e2ad945800c1c5c6850c853c Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 19 Aug 2017 12:22:26 +0200
Subject: [PATCH 08/14] Add language example sentences (see #1107)

da, de, en, es, fr, he, it, nb, pl, pt, sv
---
 spacy/lang/da/examples.py | 18 ++++++++++++++++++
 spacy/lang/de/examples.py | 22 ++++++++++++++++++++++
 spacy/lang/en/examples.py | 22 ++++++++++++++++++++++
 spacy/lang/es/examples.py | 22 ++++++++++++++++++++++
 spacy/lang/fr/examples.py | 26 ++++++++++++++++++++++++++
 spacy/lang/he/examples.py | 28 ++++++++++++++++++++++++++++
 spacy/lang/it/examples.py | 18 ++++++++++++++++++
 spacy/lang/nb/examples.py | 18 ++++++++++++++++++
 spacy/lang/pl/examples.py | 20 ++++++++++++++++++++
 spacy/lang/pt/examples.py | 18 ++++++++++++++++++
 spacy/lang/sv/examples.py | 18 ++++++++++++++++++
 11 files changed, 230 insertions(+)
 create mode 100644 spacy/lang/da/examples.py
 create mode 100644 spacy/lang/de/examples.py
 create mode 100644 spacy/lang/en/examples.py
 create mode 100644 spacy/lang/es/examples.py
 create mode 100644 spacy/lang/fr/examples.py
 create mode 100644 spacy/lang/he/examples.py
 create mode 100644 spacy/lang/it/examples.py
 create mode 100644 spacy/lang/nb/examples.py
 create mode 100644 spacy/lang/pl/examples.py
 create mode 100644 spacy/lang/pt/examples.py
 create mode 100644 spacy/lang/sv/examples.py

diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py
new file mode 100644
index 000000000..549f71fb5
--- /dev/null
+++ b/spacy/lang/da/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.da.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple overvejer at købe et britisk startup for 1 milliard dollar",
+    "Selvkørende biler flytter forsikringsansvaret over på producenterne",
+    "San Francisco overvejer at forbyde leverandørrobotter på fortov",
+    "London er en stor by i Storbritannien"
+]

diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py
new file mode 100644
index 000000000..49ac0e14b
--- /dev/null
+++ b/spacy/lang/de/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+    "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+    "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+    "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+    "San Francisco erwägt Verbot von Lieferrobotern",
+    "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+    "Wo bist du?",
+    "Was ist die Hauptstadt von Deutschland?"
+]

diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py
new file mode 100644
index 000000000..b92d4a65c
--- /dev/null
+++ b/spacy/lang/en/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.en.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple is looking at buying U.K. startup for $1 billion",
+    "Autonomous cars shift insurance liability toward manufacturers",
+    "San Francisco considers banning sidewalk delivery robots",
+    "London is a big city in the United Kingdom.",
+    "Where are you?",
+    "Who is the president of France?",
+    "What is the capital of the United States?",
+    "When was Barack Obama born?"
+]

diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
new file mode 100644
index 000000000..61fe8c9be
--- /dev/null
+++ b/spacy/lang/es/examples.py
@@ -0,0 +1,22 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.es.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
+    "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
+    "San Francisco analiza prohibir los robots delivery",
+    "Londres es una gran ciudad del Reino Unido",
+    "El gato come pescado",
+    "Veo al hombre con el telescopio",
+    "La araña come moscas",
+    "El pingüino incuba en su nido"
+]

diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py
new file mode 100644
index 000000000..08409ea61
--- /dev/null
+++ b/spacy/lang/fr/examples.py
@@ -0,0 +1,26 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.fr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple cherche à acheter une startup anglaise pour 1 milliard de dollars",
+    "Les voitures autonomes voient leurs assurances décalées vers les constructeurs",
+    "San Francisco envisage d'interdire les robots coursiers",
+    "Londres est une grande ville du Royaume-Uni",
+    "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+    "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+    "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+    "Nouvelles attaques de Trump contre le maire de Londres",
+    "Où es-tu ?",
+    "Qui est le président de la France ?",
+    "Où est la capitale des Etats-Unis ?",
+    "Quand est né Barack Obama ?"
+]

diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py
new file mode 100644
index 000000000..f99f4814b
--- /dev/null
+++ b/spacy/lang/he/examples.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.he.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    'סין מקימה קרן של 440 מיליון דולר להשקעה בהייטק בישראל',
+    'רה"מ הודיע כי יחרים טקס בחסותו',
+    'הכנסת צפויה לאשר איכון אוטומטי של שיחות למוקד 100',
+    'תוכנית לאומית תהפוך את ישראל למעצמה דיגיטלית',
+    'סע לשלום, המפתחות בפנים.',
+    'מלצר, פעמיים טורקי!',
+    'ואהבת לרעך כמוך.',
+    'היום נעשה משהו בלתי נשכח.',
+    'איפה הילד?',
+    'מיהו נשיא צרפת?',
+    'מהי בירת ארצות הברית?',
+    "איך קוראים בעברית לצ'ופצ'יק של הקומקום?",
+    'מה הייתה הדקה?',
+    'מי אומר שלום ראשון, זה שעולה או זה שיורד?'
+]

diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py
new file mode 100644
index 000000000..d35b9f834
--- /dev/null
+++ b/spacy/lang/it/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.it.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
+    "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
+    "San Francisco prevede di bandire i robot di consegna porta a porta",
+    "Londra è una grande città del Regno Unito."
+]

diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py
new file mode 100644
index 000000000..0dc5c8144
--- /dev/null
+++ b/spacy/lang/nb/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.nb.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
+    "Selvkjørende biler flytter forsikringsansvaret over på produsentene",
+    "San Francisco vurderer å forby robotbud på fortauene",
+    "London er en stor by i Storbritannia."
+]

diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py
new file mode 100644
index 000000000..af6c72af0
--- /dev/null
+++ b/spacy/lang/pl/examples.py
@@ -0,0 +1,20 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pl.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Poczuł przyjemną woń mocnej kawy.",
+    "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
+    "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
+    "Nowy abonament pod lupą Komisji Europejskiej",
+    "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
+    "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”."
+]

diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py
new file mode 100644
index 000000000..239929215
--- /dev/null
+++ b/spacy/lang/pt/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.pt.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares",
+    "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.",
+    "São Francisco considera banir os robôs de entrega que andam pelas calçadas",
+    "Londres é a maior cidade do Reino Unido"
+]

diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py
new file mode 100644
index 000000000..be279c4bd
--- /dev/null
+++ b/spacy/lang/sv/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.sv.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "Apple överväger att köpa brittisk startup för 1 miljard dollar.",
+    "Självkörande bilar förskjuter försäkringsansvar mot tillverkare.",
+    "San Fransisco överväger förbud mot leveransrobotar på trottoarer.",
+    "London är en storstad i Storbritannien."
+]

From 4731d5022021dd22b4e58c72725755608cc6aee2 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 19 Aug 2017 12:44:23 +0200
Subject: [PATCH 09/14] Add break utility for long nowrap items (e.g. code)

---
 website/assets/css/_base/_utilities.sass | 4 ++++
 website/docs/api/doc.jade                | 6 +++---
 website/docs/api/lexeme.jade             | 2 +-
 website/docs/api/span.jade               | 4 ++--
 website/docs/api/token.jade              | 2 +-
 5 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass
index 2c40858a8..46c3e84d9 100644
--- a/website/assets/css/_base/_utilities.sass
+++ b/website/assets/css/_base/_utilities.sass
@@ -112,6 +112,10 @@
 .u-nowrap
     white-space: nowrap

+.u-break.u-break
+    word-wrap: break-word
+    white-space: initial
+
 .u-no-border
     border: none

diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade
index 929985144..212f823ba 100644
--- a/website/docs/api/doc.jade
+++ b/website/docs/api/doc.jade
@@ -216,7 +216,7 @@ p
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell
             |  The exported attributes as a 2D numpy array, with one row per
             |  token and one column per attribute.
@@ -245,7 +245,7 @@ p
     +row
         +cell #[code array]
-        +cell #[code numpy.ndarray[ndim=2, dtype='int32']]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
         +cell The attribute values to load.

     +footrow
@@ -509,7 +509,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the document's semantics.

 +h(2, "vector_norm") Doc.vector_norm

diff --git a/website/docs/api/lexeme.jade b/website/docs/api/lexeme.jade
index a0487be9b..6e3f68493 100644
--- a/website/docs/api/lexeme.jade
+++ b/website/docs/api/lexeme.jade
@@ -129,7 +129,7 @@ p A real-valued meaning representation.
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the lexeme's semantics.

 +h(2, "vector_norm") Lexeme.vector_norm

diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade
index 542336714..5b480b280 100644
--- a/website/docs/api/span.jade
+++ b/website/docs/api/span.jade
@@ -37,7 +37,7 @@ p Create a Span object from the #[code slice doc[start : end]].
     +row
         +cell #[code vector]
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A meaning representation of the span.

     +footrow
@@ -270,7 +270,7 @@ p
 +table(["Name", "Type", "Description"])
     +footrow
         +cell returns
-        +cell #[code numpy.ndarray[ndim=1, dtype='float32']]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
         +cell A 1D numpy array representing the span's semantics.
+h(2, "vector_norm") Span.vector_norm diff --git a/website/docs/api/token.jade b/website/docs/api/token.jade index 87387e09d..db445d09b 100644 --- a/website/docs/api/token.jade +++ b/website/docs/api/token.jade @@ -250,7 +250,7 @@ p A real-valued meaning representation. +table(["Name", "Type", "Description"]) +footrow +cell returns - +cell #[code numpy.ndarray[ndim=1, dtype='float32']] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] +cell A 1D numpy array representing the token's semantics. +h(2, "vector_norm") Span.vector_norm From 6a37c93311ca4dd446eb36dc6ca4fec6a8f4922e Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:44:33 +0200 Subject: [PATCH 10/14] Update argument type --- website/docs/api/doc.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 212f823ba..5c065e775 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -211,7 +211,7 @@ p +table(["Name", "Type", "Description"]) +row +cell #[code attr_ids] - +cell ints + +cell list +cell A list of attribute ID ints. +footrow From d53cbf369fff63d20861eee003f671f4a9b013fc Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:44:50 +0200 Subject: [PATCH 11/14] Document as_tuples kwarg on Language.pipe() --- website/docs/api/language.jade | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 9c26f506c..69665ee9d 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -111,6 +111,14 @@ p +cell - +cell A sequence of unicode objects. + +row + +cell #[code as_tuples] + +cell bool + +cell + | If set to #[code True], inputs should be a sequence of + | #[code (text, context)] tuples. Output will then be a sequence of + | #[code (doc, context)] tuples. Defaults to #[code False]. + +row +cell #[code n_threads] +cell int From 404d3067b89a71145b85af37f4f0233eb9cc5689 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:45:00 +0200 Subject: [PATCH 12/14] Document new Doc.char_span() method --- website/docs/api/doc.jade | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/website/docs/api/doc.jade b/website/docs/api/doc.jade index 5c065e775..fcba091b8 100644 --- a/website/docs/api/doc.jade +++ b/website/docs/api/doc.jade @@ -140,6 +140,44 @@ p Get the number of tokens in the document. +cell int +cell The number of tokens in the document. ++h(2, "char_span") Doc.char_span + +tag method + +tag-new(2) + +p Create a #[code Span] object from the slice #[code doc.text[start : end]]. + ++aside-code("Example"). + doc = nlp(u'I like New York') + label = doc.vocab.strings['GPE'] + span = doc.char_span(7, 15, label=label) + assert span.text == 'New York' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code start] + +cell int + +cell The index of the first character of the span. + + +row + +cell #[code end] + +cell int + +cell The index of the first character after the span. + + +row + +cell #[code label] + +cell uint64 + +cell A label to attach to the Span, e.g. for named entities. + + +row + +cell #[code vector] + +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] + +cell A meaning representation of the span. + + +footrow + +cell returns + +cell #[code Span] + +cell The newly constructed object. 
+ +h(2, "similarity") Doc.similarity +tag method +tag-model("vectors") From 471eed4126f21830b01a8f8c1554602a9b3c77af Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:45:16 +0200 Subject: [PATCH 13/14] Add example to Span.merge() --- website/docs/api/span.jade | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index 5b480b280..f15958e1c 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -150,6 +150,13 @@ p p Retokenize the document, such that the span is merged into a single token. ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + span = doc[2:3] + span.merge() + assert len(doc) == 6 + assert doc[2].text == 'New York' + +table(["Name", "Type", "Description"]) +row +cell #[code **attributes] From 5cb0200e63c8bfd9454f5a6548e06464452f2a5f Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 19 Aug 2017 12:45:28 +0200 Subject: [PATCH 14/14] Document new Span.to_array() method --- website/docs/api/span.jade | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/website/docs/api/span.jade b/website/docs/api/span.jade index f15958e1c..2ca2d3ea9 100644 --- a/website/docs/api/span.jade +++ b/website/docs/api/span.jade @@ -145,6 +145,35 @@ p +cell float +cell A scalar similarity score. Higher is more similar. ++h(2, "to_array") Span.to_array + +tag method + +tag-new(2) + +p + | Given a list of #[code M] attribute IDs, export the tokens to a numpy + | #[code ndarray] of shape #[code (N, M)], where #[code N] is the length of + | the document. The values will be 32-bit integers. + ++aside-code("Example"). + from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA + doc = nlp(u'I like New York in Autumn.') + span = doc[2:3] + # All strings mapped to integers, for easy export to numpy + np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code attr_ids] + +cell list + +cell A list of attribute ID ints. + + +footrow + +cell returns + +cell #[code.u-break numpy.ndarray[long, ndim=2]] + +cell + | A feature matrix, with one row per word, and one column per + | attribute indicated in the input #[code attr_ids]. + +h(2, "merge") Span.merge +tag method