From 1976fb157f13df28817f334adb427ffccca1df6b Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 21:49:57 +0100 Subject: [PATCH 01/21] Update licenses --- website/models/_data.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/models/_data.json b/website/models/_data.json index cb971e20c..05a96a24f 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -76,7 +76,8 @@ "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/", "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/", - "GPL": "http://www.gnu.de/documents/gpl.en.html" + "GPL": "https://www.gnu.org/licenses/gpl.html", + "LGPL": "https://www.gnu.org/licenses/lgpl.html" }, "MODEL_BENCHMARKS": { From 2fa53b39d52149dc412d051bea88daf998fa47d9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:01:06 +0100 Subject: [PATCH 02/21] Add dev dependency --- website/package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/package.json b/website/package.json index f0a57da4f..fe22e6787 100644 --- a/website/package.json +++ b/website/package.json @@ -9,7 +9,8 @@ "babel-cli": "^6.14.0", "harp": "^0.24.0", "rollup": "^0.50.0", - "uglify-js": "^2.7.3" + "uglify-js": "^2.7.3", + "broken-link-checker": "^0.7.6" }, "dependencies": {}, "scripts": { From 408f450ce019ca6a62dde9d5c539f4c468750b4c Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:01:12 +0100 Subject: [PATCH 03/21] Tidy up --- website/assets/css/_base/_layout.sass | 1 - 1 file changed, 1 deletion(-) diff --git a/website/assets/css/_base/_layout.sass b/website/assets/css/_base/_layout.sass index 1b725fdbf..64fc3808a 100644 --- a/website/assets/css/_base/_layout.sass +++ b/website/assets/css/_base/_layout.sass @@ -12,7 +12,6 @@ body animation: fadeIn 0.25s ease background: $color-back color: $color-front - //scroll-behavior: smooth //- Paragraphs From 3af281a3346315c6320c4c81efe58c495cf5c602 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:02:00 +0100 Subject: [PATCH 04/21] Update test model name --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ee4093db3..2d1b03514 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -18,7 +18,7 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id', 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] _models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], - 'fr': ['fr_depvec_web_lg'], + 'fr': ['fr_core_news_sm'], 'xx': ['xx_ent_web_md']} From 819e30a26ef65c070676d903e6bcc52fdb04cba1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:02:45 +0100 Subject: [PATCH 05/21] Tidy up tokenizer exceptions --- spacy/lang/bn/tokenizer_exceptions.py | 2 +- spacy/lang/da/tokenizer_exceptions.py | 3 +-- spacy/lang/de/tokenizer_exceptions.py | 2 +- spacy/lang/en/tokenizer_exceptions.py | 4 ++-- spacy/lang/es/tokenizer_exceptions.py | 2 +- spacy/lang/fi/tokenizer_exceptions.py | 2 +- spacy/lang/fr/tokenizer_exceptions.py | 2 +- spacy/lang/nb/tokenizer_exceptions.py | 2 +- spacy/lang/pl/tokenizer_exceptions.py | 2 +- spacy/lang/sv/tokenizer_exceptions.py | 2 +- spacy/lang/tokenizer_exceptions.py | 2 +- 11 files changed, 12 insertions(+), 13 deletions(-) diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py index 5c6de139b..dc1181335 100644 --- a/spacy/lang/bn/tokenizer_exceptions.py +++ 
b/spacy/lang/bn/tokenizer_exceptions.py @@ -20,7 +20,7 @@ for exc_data in [ {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index e8edf36b8..c67c038bf 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -8,7 +8,6 @@ _exc = {} for exc_data in [ {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, - {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"}, {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"}, {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"}, @@ -21,7 +20,7 @@ for exc_data in [ {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"}, {ORTH: "Nov.", LEMMA: "november", NORM: "november"}, {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.", diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 0b23a1001..cb16fb06c 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -164,7 +164,7 @@ for exc_data in [ {ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "zzgl.", LEMMA: "zuzüglich"}, {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 0e5bbc7f6..a76b5fb2b 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -276,7 +276,7 @@ for exc_data in [ exc_data_apos = dict(exc_data) exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] for data in [exc_data, exc_data_apos]: - _exc[data[ORTH]] = [dict(data)] + _exc[data[ORTH]] = [data] # Times @@ -440,7 +440,7 @@ for exc_data in [ {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"}, {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"}, {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index cb62f008f..d4131ddf6 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -26,7 +26,7 @@ for exc_data in [ {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] # Times diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index 33e223575..88859fefb 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -73,7 +73,7 @@ for exc_data in [ {ORTH: "ts.", LEMMA: "toisin sanoen"}, {ORTH: "vm.", LEMMA: "viimeksi mainittu"}, {ORTH: "srk.", LEMMA: "seurakunta"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 442b367dd..9994686ac 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -54,7 +54,7 @@ for exc_data in 
[ {LEMMA: "degrés", ORTH: "d°"}, {LEMMA: "saint", ORTH: "St."}, {LEMMA: "sainte", ORTH: "Ste."}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in FR_BASE_EXCEPTIONS + ["etc."]: diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 1529315ca..764866732 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -11,7 +11,7 @@ for exc_data in [ {ORTH: "jan.", LEMMA: "januar"}, {ORTH: "feb.", LEMMA: "februar"}, {ORTH: "jul.", LEMMA: "juli"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index fb87ae8a6..6098c2bb6 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -13,7 +13,7 @@ for exc_data in [ {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV}, {ORTH: "tj.", LEMMA: "to jest", POS: ADV}, {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]: - _exc[exc_data[ORTH]] = [dict(exc_data)], + _exc[exc_data[ORTH]] = [exc_data] for orth in [ "w.", "r."]: diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index 0575c3892..64aedf8af 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -68,7 +68,7 @@ for exc_data in [ {ORTH: "Sön.", LEMMA: "Söndag"}, {ORTH: "sthlm", LEMMA: "Stockholm"}, {ORTH: "gbg", LEMMA: "Göteborg"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 73ad88d08..89e1b1476 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -68,7 +68,7 @@ for exc_data in [ {ORTH: "\\n", POS: SPACE}, {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"}, {ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]: - BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)] + BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data] for orth in [ From 18c859500b4797677a2adde351f2047b20b161cc Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:02:51 +0100 Subject: [PATCH 06/21] Add missing imports --- spacy/lang/pl/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index 6098c2bb6..269634671 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import ORTH, LEMMA, POS +from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN _exc = {} From c6fea3e5f61ec02f7df1ddcc44e7cb4ce61c7d6b Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:04:28 +0100 Subject: [PATCH 07/21] Add Romanian and Croatian skeletons (experimental) Add language data templates to make it easier for others to contribute to the language support --- spacy/lang/hr/__init__.py | 27 ++ spacy/lang/hr/stop_words.py | 187 +++++++++++ spacy/lang/ro/__init__.py | 28 ++ spacy/lang/ro/stop_words.py | 442 ++++++++++++++++++++++++++ spacy/lang/ro/tokenizer_exceptions.py | 17 + website/models/_data.json | 2 + 6 files changed, 703 insertions(+) create mode 100644 spacy/lang/hr/__init__.py create mode 100644 spacy/lang/hr/stop_words.py create mode 100644 spacy/lang/ro/__init__.py create mode 100644 spacy/lang/ro/stop_words.py create mode 100644 spacy/lang/ro/tokenizer_exceptions.py diff --git 
a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py new file mode 100644 index 000000000..61b7f38ea --- /dev/null +++ b/spacy/lang/hr/__init__.py @@ -0,0 +1,27 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class CroatianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'hr' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + stop_words = STOP_WORDS + + +class Croatian(Language): + lang = 'hr' + Defaults = CroatianDefaults + + +__all__ = ['Croatian'] + diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py new file mode 100644 index 000000000..bf91229a0 --- /dev/null +++ b/spacy/lang/hr/stop_words.py @@ -0,0 +1,187 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-hr + +STOP_WORDS = set(""" +a +ako +ali +bi +bih +bila +bili +bilo +bio +bismo +biste +biti +bumo +da +do +duž +ga +hoće +hoćemo +hoćete +hoćeš +hoću +i +iako +ih +ili +iz +ja +je +jedna +jedne +jedno +jer +jesam +jesi +jesmo +jest +jeste +jesu +jim +joj +još +ju +kada +kako +kao +koja +koje +koji +kojima +koju +kroz +li +me +mene +meni +mi +mimo +moj +moja +moje +mu +na +nad +nakon +nam +nama +nas +naš +naša +naše +našeg +ne +nego +neka +neki +nekog +neku +nema +netko +neće +nećemo +nećete +nećeš +neću +nešto +ni +nije +nikoga +nikoje +nikoju +nisam +nisi +nismo +niste +nisu +njega +njegov +njegova +njegovo +njemu +njezin +njezina +njezino +njih +njihov +njihova +njihovo +njim +njima +njoj +nju +no +o +od +odmah +on +ona +oni +ono +ova +pa +pak +po +pod +pored +prije +s +sa +sam +samo +se +sebe +sebi +si +smo +ste +su +sve +svi +svog +svoj +svoja +svoje +svom +ta +tada +taj +tako +te +tebe +tebi +ti +to +toj +tome +tu +tvoj +tvoja +tvoje +u +uz +vam +vama +vas +vaš +vaša +vaše +već +vi +vrlo +za +zar +će +ćemo +ćete +ćeš +ću +što +""".split()) diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py new file mode 100644 index 000000000..e66fad691 --- /dev/null +++ b/spacy/lang/ro/__init__.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class RomanianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ro' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + + +class Romanian(Language): + lang = 'ro' + Defaults = RomanianDefaults + + +__all__ = ['Romanian'] + diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py new file mode 100644 index 000000000..ffaaea7c1 --- /dev/null +++ b/spacy/lang/ro/stop_words.py @@ -0,0 +1,442 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# Source: 
https://github.com/stopwords-iso/stopwords-ro + +STOP_WORDS = set(""" +a +abia +acea +aceasta +această +aceea +aceeasi +acei +aceia +acel +acela +acelasi +acele +acelea +acest +acesta +aceste +acestea +acestei +acestia +acestui +aceşti +aceştia +acolo +acord +acum +adica +ai +aia +aibă +aici +aiurea +al +ala +alaturi +ale +alea +alt +alta +altceva +altcineva +alte +altfel +alti +altii +altul +am +anume +apoi +ar +are +as +asa +asemenea +asta +astazi +astea +astfel +astăzi +asupra +atare +atat +atata +atatea +atatia +ati +atit +atita +atitea +atitia +atunci +au +avea +avem +aveţi +avut +azi +aş +aşadar +aţi +b +ba +bine +bucur +bună +c +ca +cam +cand +capat +care +careia +carora +caruia +cat +catre +caut +ce +cea +ceea +cei +ceilalti +cel +cele +celor +ceva +chiar +ci +cinci +cind +cine +cineva +cit +cita +cite +citeva +citi +citiva +conform +contra +cu +cui +cum +cumva +curând +curînd +când +cât +câte +câtva +câţi +cînd +cît +cîte +cîtva +cîţi +că +căci +cărei +căror +cărui +către +d +da +daca +dacă +dar +dat +datorită +dată +dau +de +deasupra +deci +decit +degraba +deja +deoarece +departe +desi +despre +deşi +din +dinaintea +dintr +dintr- +dintre +doar +doi +doilea +două +drept +dupa +după +dă +e +ea +ei +el +ele +era +eram +este +eu +exact +eşti +f +face +fara +fata +fel +fi +fie +fiecare +fii +fim +fiu +fiţi +foarte +fost +frumos +fără +g +geaba +graţie +h +halbă +i +ia +iar +ieri +ii +il +imi +in +inainte +inapoi +inca +incit +insa +intr +intre +isi +iti +j +k +l +la +le +li +lor +lui +lângă +lîngă +m +ma +mai +mare +mea +mei +mele +mereu +meu +mi +mie +mine +mod +mult +multa +multe +multi +multă +mulţi +mulţumesc +mâine +mîine +mă +n +ne +nevoie +ni +nici +niciodata +nicăieri +nimeni +nimeri +nimic +niste +nişte +noastre +noastră +noi +noroc +nostri +nostru +nou +noua +nouă +noştri +nu +numai +o +opt +or +ori +oricare +orice +oricine +oricum +oricând +oricât +oricînd +oricît +oriunde +p +pai +parca +patra +patru +patrulea +pe +pentru +peste +pic +pina +plus +poate +pot +prea +prima +primul +prin +printr- +putini +puţin +puţina +puţină +până +pînă +r +rog +s +sa +sa-mi +sa-ti +sai +sale +sau +se +si +sint +sintem +spate +spre +sub +sunt +suntem +sunteţi +sus +sută +sînt +sîntem +sînteţi +să +săi +său +t +ta +tale +te +ti +timp +tine +toata +toate +toată +tocmai +tot +toti +totul +totusi +totuşi +toţi +trei +treia +treilea +tu +tuturor +tăi +tău +u +ul +ului +un +una +unde +undeva +unei +uneia +unele +uneori +unii +unor +unora +unu +unui +unuia +unul +v +va +vi +voastre +voastră +voi +vom +vor +vostru +vouă +voştri +vreme +vreo +vreun +vă +x +z +zece +zero +zi +zice +îi +îl +îmi +împotriva +în +înainte +înaintea +încotro +încât +încît +între +întrucât +întrucît +îţi +ăla +ălea +ăsta +ăstea +ăştia +şapte +şase +şi +ştiu +ţi +ţie +""".split()) diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py new file mode 100644 index 000000000..42ccd6a93 --- /dev/null +++ b/spacy/lang/ro/tokenizer_exceptions.py @@ -0,0 +1,17 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH + + +_exc = {} + + +# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations +for orth in [ + "1-a", "1-ul", "10-a", "10-lea", "2-a", "3-a", "3-lea", "6-lea", + "d-voastră", "dvs.", "Rom.", "str."]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/website/models/_data.json b/website/models/_data.json index 05a96a24f..dbbd125d9 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -100,6 
+100,8 @@ "da": "Danish", "hu": "Hungarian", "pl": "Polish", + "ro": "Romanian", + "hr": "Croatian", "he": "Hebrew", "ga": "Irish", "bn": "Bengali", From 391fce09d9938aaa442e2d51f462ab744c545076 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:04:40 +0100 Subject: [PATCH 08/21] Update licenses --- website/models/_data.json | 1 + 1 file changed, 1 insertion(+) diff --git a/website/models/_data.json b/website/models/_data.json index dbbd125d9..62f21dd6f 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -74,6 +74,7 @@ "MODEL_LICENSES": { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/", + "CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/", "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/", "GPL": "https://www.gnu.org/licenses/gpl.html", From 15cbc61a6e97162b93dadd680f7a2a5c2d7c7be0 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 2 Nov 2017 16:13:18 +0100 Subject: [PATCH 09/21] Adjust rendering of large numbers 1234 -> 1.2k 12345 -> 12.3k 123456 -> 123k 1234567 -> 1.2m --- website/assets/js/util.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/website/assets/js/util.js b/website/assets/js/util.js index 65d05774c..90e0b5994 100644 --- a/website/assets/js/util.js +++ b/website/assets/js/util.js @@ -59,11 +59,12 @@ export const convertNumber = (num = 0, separator = ',') => * @param {number|string} num - The number to convert. * @param {number} fixed - Number of decimals. */ -export const abbrNumber = (num = 0, fixed = 2) => { +export const abbrNumber = (num = 0, fixed = 1) => { const suffixes = ['', 'k', 'm', 'b', 't']; if (num === null || num === 0) return 0; const b = num.toPrecision(2).split('e'); const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3); - const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1); + const n = (k < 1) ? num : num / Math.pow(10, k * 3); + const c = (k >= 1 && n >= 100 ) ? Math.round(n) : n.toFixed(fixed); return (c < 0 ? 
c : Math.abs(c)) + suffixes[k]; } From 31e349a62c06069fd611bd96da88918fe2484180 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 2 Nov 2017 16:13:38 +0100 Subject: [PATCH 10/21] Update model families --- website/models/_data.json | 14 ++++++++------ website/models/nl.jade | 6 ++++++ 2 files changed, 14 insertions(+), 6 deletions(-) create mode 100644 website/models/nl.jade diff --git a/website/models/_data.json b/website/models/_data.json index 62f21dd6f..71c2bf7d0 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -12,6 +12,7 @@ "Portuguese": "pt", "French": "fr", "Italian": "it", + "Dutch": "nl", "Multi-Language": "xx" } }, @@ -39,12 +40,13 @@ }, "MODELS": { - "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"], - "de": ["de_dep_news_sm"], - "es": ["es_core_web_sm"], - "pt": [], - "fr": [], - "it": [], + "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"], + "de": ["de_core_news_sm", "de_core_news_md"], + "es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"], + "pt": ["pt_core_news_sm"], + "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"], + "it": ["it_core_news_sm"], + "nl": ["nl_core_news_sm"], "xx": ["xx_ent_wiki_sm"] }, diff --git a/website/models/nl.jade b/website/models/nl.jade new file mode 100644 index 000000000..081b4a712 --- /dev/null +++ b/website/models/nl.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > MODELS > NL + +include ../_includes/_mixins + +//- This is a placeholder. The page is rendered via the template at +//- /_includes/_page-model.jade. From 9baab241b487d93066c3f68e63acec22b013633f Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 2 Nov 2017 16:32:24 +0100 Subject: [PATCH 11/21] Add skeleton language data for Turkish --- spacy/lang/tr/__init__.py | 28 ++ spacy/lang/tr/stop_words.py | 512 ++++++++++++++++++++++++++ spacy/lang/tr/tokenizer_exceptions.py | 27 ++ website/models/_data.json | 1 + 4 files changed, 568 insertions(+) create mode 100644 spacy/lang/tr/__init__.py create mode 100644 spacy/lang/tr/stop_words.py create mode 100644 spacy/lang/tr/tokenizer_exceptions.py diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py new file mode 100644 index 000000000..d1cd04f42 --- /dev/null +++ b/spacy/lang/tr/__init__.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class TurkishDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'tr' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + + +class Turkish(Language): + lang = 'tr' + Defaults = TurkishDefaults + + +__all__ = ['Turkish'] + diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py new file mode 100644 index 000000000..aaed02a3e --- /dev/null +++ b/spacy/lang/tr/stop_words.py @@ -0,0 +1,512 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-tr + +STOP_WORDS = set(""" +acaba +acep +adamakıllı +adeta +ait +altmýþ +altmış +altý +altı +ama +amma +anca +ancak +arada +artýk +aslında +aynen +ayrıca +az 
+açıkça +açıkçası +bana +bari +bazen +bazý +bazı +başkası +baţka +belki +ben +benden +beni +benim +beri +beriki +beþ +beş +beţ +bilcümle +bile +bin +binaen +binaenaleyh +bir +biraz +birazdan +birbiri +birden +birdenbire +biri +birice +birileri +birisi +birkaç +birkaçı +birkez +birlikte +birçok +birçoğu +birþey +birþeyi +birşey +birşeyi +birţey +bitevi +biteviye +bittabi +biz +bizatihi +bizce +bizcileyin +bizden +bize +bizi +bizim +bizimki +bizzat +boşuna +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +buracıkta +burada +buradan +burası +böyle +böylece +böylecene +böylelikle +böylemesine +böylesine +büsbütün +bütün +cuk +cümlesi +da +daha +dahi +dahil +dahilen +daima +dair +dayanarak +de +defa +dek +demin +demincek +deminden +denli +derakap +derhal +derken +deđil +değil +değin +diye +diđer +diğer +diğeri +doksan +dokuz +dolayı +dolayısıyla +doğru +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +elbet +elbette +elli +emme +en +enikonu +epey +epeyce +epeyi +esasen +esnasında +etmesi +etraflı +etraflıca +etti +ettiği +ettiğini +evleviyetle +evvel +evvela +evvelce +evvelden +evvelemirde +evveli +eđer +eğer +fakat +filanca +gah +gayet +gayetle +gayri +gayrı +gelgelelim +gene +gerek +gerçi +geçende +geçenlerde +gibi +gibilerden +gibisinden +gine +göre +gırla +hakeza +halbuki +halen +halihazırda +haliyle +handiyse +hangi +hangisi +hani +hariç +hasebiyle +hasılı +hatta +hele +hem +henüz +hep +hepsi +her +herhangi +herkes +herkesin +hiç +hiçbir +hiçbiri +hoş +hulasaten +iken +iki +ila +ile +ilen +ilgili +ilk +illa +illaki +imdi +indinde +inen +insermi +ise +ister +itibaren +itibariyle +itibarıyla +iyi +iyice +iyicene +için +iş +işte +iţte +kadar +kaffesi +kah +kala +kanýmca +karşın +katrilyon +kaynak +kaçı +kelli +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kere +kez +keza +kezalik +keşke +keţke +ki +kim +kimden +kime +kimi +kimisi +kimse +kimsecik +kimsecikler +külliyen +kýrk +kýsaca +kırk +kısaca +lakin +leh +lütfen +maada +madem +mademki +mamafih +mebni +međer +meğer +meğerki +meğerse +milyar +milyon +mu +mü +mý +mı +nasýl +nasıl +nasılsa +nazaran +naşi +ne +neden +nedeniyle +nedenle +nedense +nerde +nerden +nerdeyse +nere +nerede +nereden +neredeyse +neresi +nereye +netekim +neye +neyi +neyse +nice +nihayet +nihayetinde +nitekim +niye +niçin +o +olan +olarak +oldu +olduklarını +oldukça +olduğu +olduğunu +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +onca +onculayın +onda +ondan +onlar +onlardan +onlari +onlarýn +onları +onların +onu +onun +oracık +oracıkta +orada +oradan +oranca +oranla +oraya +otuz +oysa +oysaki +pek +pekala +peki +pekçe +peyderpey +rağmen +sadece +sahi +sahiden +sana +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +sonra +sonradan +sonraları +sonunda +tabii +tam +tamam +tamamen +tamamıyla +tarafından +tek +trilyon +tüm +var +vardı +vasıtasıyla +ve +velev +velhasıl +velhasılıkelam +veya +veyahut +ya +yahut +yakinen +yakında +yakından +yakınlarda +yalnız +yalnızca +yani +yapacak +yapmak +yaptı +yaptıkları +yaptığı +yaptığını +yapılan +yapılması +yapıyor +yedi +yeniden +yenilerde +yerine +yetmiþ +yetmiş +yetmiţ +yine +yirmi +yok +yoksa +yoluyla +yüz +yüzünden +zarfında +zaten +zati +zira +çabuk +çabukça +çeşitli +çok +çokları +çoklarınca +çokluk +çoklukla +çokça +çoğu +çoğun +çoğunca +çoğunlukla +çünkü +öbür +öbürkü +öbürü +önce +önceden +önceleri +öncelikle +öteki +ötekisi +öyle +öylece +öylelikle +öylemesine +öz +üzere +üç +þey +þeyden +þeyi +þeyler 
+þu +þuna +þunda +þundan +þunu +şayet +şey +şeyden +şeyi +şeyler +şu +şuna +şuncacık +şunda +şundan +şunlar +şunları +şunu +şunun +şura +şuracık +şuracıkta +şurası +şöyle +ţayet +ţimdi +ţu +ţöyle +""".split()) diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py new file mode 100644 index 000000000..c945c0058 --- /dev/null +++ b/spacy/lang/tr/tokenizer_exceptions.py @@ -0,0 +1,27 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, NORM + + +# These exceptions are mostly for example purposes – hoping that Turkish +# speakers can contribute in the future! Source of copy-pasted examples: +# https://en.wiktionary.org/wiki/Category:Turkish_language + +_exc = { + "sağol": [ + {ORTH: "sağ"}, + {ORTH: "ol", NORM: "olun"}] +} + + +for exc_data in [ + {ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"}]: + _exc[exc_data[ORTH]] = [exc_data] + + +for orth in ["Dr."]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/website/models/_data.json b/website/models/_data.json index 71c2bf7d0..d64c94074 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -105,6 +105,7 @@ "pl": "Polish", "ro": "Romanian", "hr": "Croatian", + "tr": "Turkish", "he": "Hebrew", "ga": "Irish", "bn": "Bengali", From 43512c68b25e6ff3fc52c46e09fda999511d622d Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 2 Nov 2017 20:04:13 +0100 Subject: [PATCH 12/21] Fix vector details in model overview --- website/_includes/_page_models.jade | 2 ++ website/assets/js/models.js | 32 +++++++++++++------ website/models/_data.json | 1 + website/models/comparison.jade | 2 ++ website/usage/_spacy-101/_word-vectors.jade | 6 ++-- .../usage/_vectors-similarity/_basics.jade | 9 ++---- 6 files changed, 34 insertions(+), 18 deletions(-) diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade index 1cab930fb..c7742fa38 100644 --- a/website/_includes/_page_models.jade +++ b/website/_includes/_page_models.jade @@ -40,6 +40,8 @@ for id in CURRENT_MODELS each label in ["Pipeline", "Vectors", "Sources", "Author", "License"] - var field = label.toLowerCase() + if field == "vectors" + - field = "vecs" +row +cell.u-nowrap +label=label diff --git a/website/assets/js/models.js b/website/assets/js/models.js index 2d371ee1f..f5757c8cb 100644 --- a/website/assets/js/models.js +++ b/website/assets/js/models.js @@ -20,21 +20,33 @@ const CHART_FONTS = { * @property {function} vectors - Format vector data (entries and dimensions). * @property {function} version - Format model version number. */ -export const formats = { +const formats = { author: (author, url) => url ? `${author}` : author, license: (license, url) => url ? `${license}` : license, sources: sources => (sources instanceof Array) ? sources.join(', ') : sources, pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `${p}`).join(', ') : '-', - vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a', + vectors: vec => formatVectors(vec), version: version => `v${version}` }; +/** + * Format word vectors data depending on contents. + * @property {Object} data - The vectors object from the model's meta.json. 
+ */ +const formatVectors = data => { + if (!data) return 'n/a'; + if (Object.values(data).every(n => n == 0)) return 'context vectors only'; + const { keys, vectors: vecs, width } = data; + return `${abbrNumber(keys)} keys, ${abbrNumber(vecs)} unique vectors (${width} dimensions)`; +} + + /** * Find the latest version of a model in a compatibility table. * @param {string} model - The model name. * @param {Object} compat - Compatibility table, keyed by spaCy version. */ -export const getLatestVersion = (model, compat = {}) => { +const getLatestVersion = (model, compat = {}) => { for (let [spacy_v, models] of Object.entries(compat)) { if (models[model]) return models[model][0]; } @@ -90,7 +102,7 @@ export class ModelLoader { const tpl = new Templater(modelId); tpl.get('table').removeAttribute('data-loading'); tpl.get('error').style.display = 'block'; - for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) { + for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) { tpl.get(key).parentElement.parentElement.style.display = 'none'; } } @@ -120,8 +132,8 @@ export class ModelLoader { if (author) tpl.fill('author', formats.author(author, url), true); if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true); if (sources) tpl.fill('sources', formats.sources(sources)); - if (vectors) tpl.fill('vectors', formats.vectors(vectors)); - else tpl.get('vectors').parentElement.parentElement.style.display = 'none'; + if (vectors) tpl.fill('vecs', formats.vectors(vectors)); + else tpl.get('vecs').parentElement.parentElement.style.display = 'none'; if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true); else tpl.get('pipeline').parentElement.parentElement.style.display = 'none'; } @@ -223,8 +235,9 @@ export class ModelComparer { const version = getLatestVersion(name, this.compat); const modelName = `${name}-${version}`; return new Promise((resolve, reject) => { + if (!version) reject(); // resolve immediately if model already loaded, e.g. in this.models - if (this.models[name]) resolve(this.models[name]); + else if (this.models[name]) resolve(this.models[name]); else fetch(`${this.url}/meta/${modelName}.json`) .then(res => handleResponse(res)) .then(json => json.ok ? 
resolve(this.saveModel(name, json)) : reject()) @@ -306,12 +319,13 @@ export class ModelComparer { this.tpl.fill(`size${i}`, size); this.tpl.fill(`desc${i}`, description || 'n/a'); this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true); - this.tpl.fill(`vectors${i}`, formats.vectors(vectors)); + this.tpl.fill(`vecs${i}`, formats.vectors(vectors)); this.tpl.fill(`sources${i}`, formats.sources(sources)); this.tpl.fill(`author${i}`, formats.author(author, url), true); this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true); // check if model accuracy or speed includes one of the pre-set keys - for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) { + const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v))); + for (let key of allKeys) { if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2)) else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key]))) else this.tpl.fill(`${key}${i}`, 'n/a') diff --git a/website/models/_data.json b/website/models/_data.json index d64c94074..8507a3fa1 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -68,6 +68,7 @@ "gpu": "words per second on GPU", "pipeline": "Processing pipeline components in order", "sources": "Sources of training data", + "vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.", "benchmark_parser": "Parser accuracy", "benchmark_ner": "NER accuracy", "benchmark_speed": "Speed" diff --git a/website/models/comparison.jade b/website/models/comparison.jade index 881a9aff4..b0ab61efe 100644 --- a/website/models/comparison.jade +++ b/website/models/comparison.jade @@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none") for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"] - var field = label.toLowerCase() + if field == "vectors" + - field = "vecs" +row +cell.u-nowrap +label=label diff --git a/website/usage/_spacy-101/_word-vectors.jade b/website/usage/_spacy-101/_word-vectors.jade index bb9add8a6..c38360014 100644 --- a/website/usage/_spacy-101/_word-vectors.jade +++ b/website/usage/_spacy-101/_word-vectors.jade @@ -4,9 +4,9 @@ p | Similarity is determined by comparing #[strong word vectors] or "word | embeddings", multi-dimensional meaning representations of a word. Word | vectors can be generated using an algorithm like - | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's - | #[+a("/models") default models] come with - | #[strong 300-dimensional vectors] that look like this: + | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium + | #[code md] and large #[code lg] #[+a("/models") models] come with + | #[strong multi-dimensional vectors] that look like this: +code("banana.vector", false, false, 250). array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade index 07ad6bcd4..734495c6e 100644 --- a/website/usage/_vectors-similarity/_basics.jade +++ b/website/usage/_vectors-similarity/_basics.jade @@ -4,12 +4,9 @@ | Dense, real valued vectors representing distributional similarity | information are now a cornerstone of practical NLP. The most common way | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] - | family of algorithms. 
The default - | #[+a("/models/en") English model] installs - | 300-dimensional vectors trained on the - | #[+a("http://commoncrawl.org") Common Crawl] corpus. - | If you need to train a word2vec model, we recommend the implementation in - | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. + | family of algorithms. If you need to train a word2vec model, we recommend + | the implementation in the Python library + | #[+a("https://radimrehurek.com/gensim/") Gensim]. include ../_spacy-101/_similarity include ../_spacy-101/_word-vectors From a22f96c3f19388b5369bf592aac1b056855009f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 00:48:54 +0100 Subject: [PATCH 13/21] Add test for backpropagating padding --- spacy/tests/test_misc.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 762ea4c08..5c69dae3e 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -6,6 +6,7 @@ from .. import util from ..displacy import parse_deps, parse_ents from ..tokens import Span from .util import get_doc +from .._ml import PrecomputableAffine from pathlib import Path import pytest @@ -59,3 +60,19 @@ def test_displacy_parse_deps(en_vocab): assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] + + +def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): + model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP) + assert model.W.shape == (nF, nO, nP, nI) + tensor = model.ops.allocate((10, nI)) + Y, get_dX = model.begin_update(tensor) + assert Y.shape == (tensor.shape[0]+1, nF, nO, nP) + assert model.d_pad.shape == (1, nF, nO, nP) + dY = model.ops.allocate((15, nF, nO, nP)) + ids = model.ops.allocate((15, nF)) + ids[1,2] = -1 + dY[1,2] = 1 + assert model.d_pad[0, 2, 0, 0] == 0. + model._backprop_padding(dY, ids) + assert model.d_pad[0, 2, 0, 0] == 1. From 260e6ee3fbc829b04b269f9960e6cb676d0c33ff Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 00:49:11 +0100 Subject: [PATCH 14/21] Improve efficiency of backprop of padding variable --- spacy/_ml.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 8c98567fc..4829631f4 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -150,10 +150,8 @@ class PrecomputableAffine(Model): def _backprop_padding(self, dY, ids): # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0 - for i in range(ids.shape[0]): - for j in range(ids.shape[1]): - if ids[i,j] < 0: - self.d_pad[0,j] += dY[i, j] + d_pad = dY * (ids.reshape((ids.shape[0], self.nF, 1, 1)) < 0.) + self.d_pad += d_pad.sum(axis=0) return dY, ids @staticmethod From 6771780d3f78ab3463fa7255516334059ed2721d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 01:54:34 +0100 Subject: [PATCH 15/21] Fix backprop of padding variable --- spacy/_ml.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 4829631f4..0f9202603 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -150,7 +150,9 @@ class PrecomputableAffine(Model): def _backprop_padding(self, dY, ids): # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0 - d_pad = dY * (ids.reshape((ids.shape[0], self.nF, 1, 1)) < 0.) + mask = ids < 0. 
+ mask = mask.sum(axis=1) + d_pad = dY * mask.reshape((ids.shape[0], 1, 1)) self.d_pad += d_pad.sum(axis=0) return dY, ids From c2bbf076a462326409cb6491752323985332422f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 01:54:54 +0100 Subject: [PATCH 16/21] Add document length cap for training --- spacy/cli/train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index f489ba7bf..6697ed6c0 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -85,6 +85,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, batch_sizes = util.compounding(util.env_opt('batch_from', 1), util.env_opt('batch_to', 16), util.env_opt('batch_compound', 1.001)) + max_doc_len = util.env_opt('max_doc_len', 5000) corpus = GoldCorpus(train_path, dev_path, limit=n_sents) n_train_words = corpus.count_train() @@ -108,6 +109,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in minibatch(train_docs, size=batch_sizes): + batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len] + if not batch: + continue docs, golds = zip(*batch) nlp.update(docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses) From d0f88af5b620f25c17c369e7c0bd5ee1b79359fc Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 11:29:04 +0100 Subject: [PATCH 17/21] Hide error earlier --- website/assets/js/models.js | 1 + 1 file changed, 1 insertion(+) diff --git a/website/assets/js/models.js b/website/assets/js/models.js index f5757c8cb..134a0e66c 100644 --- a/website/assets/js/models.js +++ b/website/assets/js/models.js @@ -198,6 +198,7 @@ export class ModelComparer { this.fonts = CHART_FONTS; this.defaultModels = defaultModels; this.tpl.get('result').style.display = 'block'; + this.tpl.get('error').style.display = 'none'; this.fetchCompat() .then(compat => this.init(compat)) .catch(this.showError.bind(this)) From a62b0727d8236ced40a9b98b18914de97fcf9e22 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 11:29:21 +0100 Subject: [PATCH 18/21] Tidy up and always use bundle in built site for now Just to be safe --- website/_includes/_scripts.jade | 76 ++++++++++++++++----------------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade index 05a468076..0be2e2e98 100644 --- a/website/_includes/_scripts.jade +++ b/website/_includes/_scripts.jade @@ -13,7 +13,6 @@ script(src="/assets/js/vendor/prism.min.js") if SECTION == "models" script(src="/assets/js/vendor/chart.min.js") - script(src="/assets/js/models.js?v#{V_JS}" type="module") script if quickstart @@ -24,15 +23,15 @@ script | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); - -if IS_PAGE - script + if IS_PAGE | ((window.gitter = {}).chat = {}).options = { | useStyles: false, | activationElement: '.js-gitter-button', | targetElement: '.js-gitter', | room: '!{SOCIAL.gitter}' | }; + +if IS_PAGE script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) @@ -48,39 +47,36 @@ if IS_PAGE - ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");" - ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," 
+ JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");" -//- Browsers with JS module support. - Will be ignored otherwise. - -script(type="module") - | import ProgressBar from '/assets/js/progress.js'; - !=ProgressBar - if changelog - | import Changelog from '/assets/js/changelog.js'; - !=Changelog - if IS_PAGE - | import NavHighlighter from '/assets/js/nav-highlighter.js'; - !=NavHighlighter - | import GitHubEmbed from '/assets/js/github-embed.js'; - !=GitHubEmbed - if HAS_MODELS - | import { ModelLoader } from '/assets/js/models.js'; - !=ModelLoader - if compare_models - | import { ModelComparer } from '/assets/js/models.js'; - !=ModelComparer - -//- Browsers with no JS module support. - Won't be fetched or interpreted otherwise. - -script(nomodule src="/assets/js/rollup.js") -script(nomodule) - !=ProgressBar - if changelog - !=Changelog - if IS_PAGE - !=NavHighlighter - !=GitHubEmbed - if HAS_MODELS - !=ModeLoader - if compare_models - !=ModelComparer +if environment == "deploy" + //- DEPLOY: use compiled rollup.js and instantiate classes directly + script(src="/assets/js/rollup.js") + script + !=ProgressBar + if changelog + !=Changelog + if IS_PAGE + !=NavHighlighter + !=GitHubEmbed + if HAS_MODELS + !=ModelLoader + if compare_models + !=ModelComparer +else + //- DEVELOPMENT: Use ES6 modules + script(type="module") + | import ProgressBar from '/assets/js/progress.js'; + !=ProgressBar + if changelog + | import Changelog from '/assets/js/changelog.js'; + !=Changelog + if IS_PAGE + | import NavHighlighter from '/assets/js/nav-highlighter.js'; + !=NavHighlighter + | import GitHubEmbed from '/assets/js/github-embed.js'; + !=GitHubEmbed + if HAS_MODELS + | import { ModelLoader } from '/assets/js/models.js'; + !=ModelLoader + if compare_models + | import { ModelComparer } from '/assets/js/models.js'; + !=ModelComparer From 1e163746871c94db7eeef5e5213538883e98820e Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 11:29:34 +0100 Subject: [PATCH 19/21] Update models list to reflect spaCy v2.0.0a18 --- website/models/_data.json | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/website/models/_data.json b/website/models/_data.json index 8507a3fa1..c63101ad0 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -40,13 +40,10 @@ }, "MODELS": { - "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"], - "de": ["de_core_news_sm", "de_core_news_md"], - "es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"], - "pt": ["pt_core_news_sm"], - "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"], + "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"], + "de": ["de_core_news_sm"], + "es": ["es_core_news_sm", "es_core_news_md"], "it": ["it_core_news_sm"], "xx": ["xx_ent_wiki_sm"] }, From c740277f9fb7687baa2a8d6a794d9f59b97ca6fb Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Fri, 3 Nov 2017 16:30:44 +0530 Subject: [PATCH 20/21] Minor typo [ nad => and ] --- website/usage/_adding-languages/_language-data.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/usage/_adding-languages/_language-data.jade b/website/usage/_adding-languages/_language-data.jade index dc86b7a03..f0b346886 100644 --- a/website/usage/_adding-languages/_language-data.jade +++ b/website/usage/_adding-languages/_language-data.jade @@ -218,7 +218,7 @@ p | If an exception consists of more than one token, the #[code ORTH] values | combined always 
need to #[strong match the original string]. The way the | original string is split up can be pretty arbitrary sometimes – for - | example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to"). + | example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to"). | Because of how the tokenizer works, it's currently not possible to split | single-letter strings into multiple tokens. From 2aaf5315f31e146985bffccf753f06610b305c35 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Fri, 3 Nov 2017 16:56:58 +0530 Subject: [PATCH 21/21] Filled the details of the contribution license --- .github/CONTRIBUTOR_AGREEMENT.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index f34603065..919fb81fc 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [ ] I am signing on behalf of myself as an individual and no other person + * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions. ## Contributor Details -| Field | Entry | -|------------------------------- | -------------------- | -| Name | | -| Company name (if applicable) | | -| Title or role (if applicable) | | -| Date | | -| GitHub username | | -| Website (optional) | | +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Abhinav Sharma | +| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. | +| Title or role (if applicable) | Machine Learning Engineer | +| Date | 3 November 2017 | +| GitHub username | abhi18av | +| Website (optional) | https://abhi18av.github.io/ |