diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md
index f34603065..919fb81fc 100644
--- a/.github/CONTRIBUTOR_AGREEMENT.md
+++ b/.github/CONTRIBUTOR_AGREEMENT.md
@@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:
- * [ ] I am signing on behalf of myself as an individual and no other person
+ * [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
@@ -96,11 +96,11 @@ mark both statements:
## Contributor Details
-| Field | Entry |
-|------------------------------- | -------------------- |
-| Name | |
-| Company name (if applicable) | |
-| Title or role (if applicable) | |
-| Date | |
-| GitHub username | |
-| Website (optional) | |
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Abhinav Sharma |
+| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. |
+| Title or role (if applicable) | Machine Learning Engineer |
+| Date                          | 3 November 2017 |
+| GitHub username | abhi18av |
+| Website (optional) | https://abhi18av.github.io/ |
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 0b82bbe67..273971b96 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -150,10 +150,10 @@ class PrecomputableAffine(Model):
def _backprop_padding(self, dY, ids):
# (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0
- for i in range(ids.shape[0]):
- for j in range(ids.shape[1]):
- if ids[i,j] < 0:
- self.d_pad[0,j] += dY[i, j]
+        mask = ids < 0.
+        mask = mask.reshape((ids.shape[0], ids.shape[1], 1, 1))
+        d_pad = dY * mask
+        self.d_pad += d_pad.sum(axis=0)
return dY, ids
@staticmethod
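
Note on the hunk above: the vectorised replacement accumulates the padding gradient with a boolean mask instead of the removed double loop. A minimal standalone NumPy sketch of the equivalence (illustrative only, not part of the patch; shapes follow the comment in the hunk, and all names here are made up for the example):

    import numpy as np

    nN, nF, nO, nP = 6, 3, 4, 2            # rows, features, outputs, pieces
    rng = np.random.RandomState(0)
    dY = rng.uniform(-1., 1., (nN, nF, nO, nP))
    ids = rng.randint(-1, 5, (nN, nF)).astype("f")

    # Reference: the removed double loop
    d_pad_loop = np.zeros((1, nF, nO, nP))
    for i in range(ids.shape[0]):
        for j in range(ids.shape[1]):
            if ids[i, j] < 0:
                d_pad_loop[0, j] += dY[i, j]

    # Vectorised: zero out the (i, j) slabs whose id is non-negative, sum over rows
    mask = (ids < 0.).reshape((ids.shape[0], ids.shape[1], 1, 1))
    d_pad_vec = (dY * mask).sum(axis=0, keepdims=True)

    assert np.allclose(d_pad_loop, d_pad_vec)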
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index f489ba7bf..6697ed6c0 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -85,6 +85,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
batch_sizes = util.compounding(util.env_opt('batch_from', 1),
util.env_opt('batch_to', 16),
util.env_opt('batch_compound', 1.001))
+ max_doc_len = util.env_opt('max_doc_len', 5000)
corpus = GoldCorpus(train_path, dev_path, limit=n_sents)
n_train_words = corpus.count_train()
@@ -108,6 +109,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
losses = {}
for batch in minibatch(train_docs, size=batch_sizes):
+ batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len]
+ if not batch:
+ continue
docs, golds = zip(*batch)
nlp.update(docs, golds, sgd=optimizer,
drop=next(dropout_rates), losses=losses)
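
Note on the train.py hunk above: the new max_doc_len option filters each minibatch, dropping any (doc, gold) pair whose Doc exceeds the length cap and skipping the batch entirely when nothing survives, so a single very long document no longer blows up memory during an update. A tiny standalone sketch of that filtering step (illustrative only; plain lists stand in for (Doc, GoldParse) pairs, and the 5000 default mirrors the value used above):

    def filter_long_docs(batch, max_doc_len=5000):
        """Keep only (doc, gold) pairs whose doc is shorter than max_doc_len."""
        return [(d, g) for (d, g) in batch if len(d) < max_doc_len]

    batch = [(["tok"] * 120, "gold-a"), (["tok"] * 6000, "gold-b")]
    batch = filter_long_docs(batch)
    assert len(batch) == 1        # the 6000-token "document" was dropped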
diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py
index 5c6de139b..dc1181335 100644
--- a/spacy/lang/bn/tokenizer_exceptions.py
+++ b/spacy/lang/bn/tokenizer_exceptions.py
@@ -20,7 +20,7 @@ for exc_data in [
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py
index e8edf36b8..c67c038bf 100644
--- a/spacy/lang/da/tokenizer_exceptions.py
+++ b/spacy/lang/da/tokenizer_exceptions.py
@@ -8,7 +8,6 @@ _exc = {}
for exc_data in [
{ORTH: "Kbh.", LEMMA: "København", NORM: "København"},
-
{ORTH: "Jan.", LEMMA: "januar", NORM: "januar"},
{ORTH: "Feb.", LEMMA: "februar", NORM: "februar"},
{ORTH: "Mar.", LEMMA: "marts", NORM: "marts"},
@@ -21,7 +20,7 @@ for exc_data in [
{ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"},
{ORTH: "Nov.", LEMMA: "november", NORM: "november"},
{ORTH: "Dec.", LEMMA: "december", NORM: "december"}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
for orth in [
"A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.",
diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py
index 0b23a1001..cb16fb06c 100644
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@@ -164,7 +164,7 @@ for exc_data in [
{ORTH: "z.b.", LEMMA: "zum Beispiel"},
{ORTH: "zzgl.", LEMMA: "zuzüglich"},
{ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
for orth in [
diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py
index 0e5bbc7f6..a76b5fb2b 100644
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@@ -276,7 +276,7 @@ for exc_data in [
exc_data_apos = dict(exc_data)
exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH]
for data in [exc_data, exc_data_apos]:
- _exc[data[ORTH]] = [dict(data)]
+ _exc[data[ORTH]] = [data]
# Times
@@ -440,7 +440,7 @@ for exc_data in [
{ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"},
{ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"},
{ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
for orth in [
diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py
index cb62f008f..d4131ddf6 100644
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@@ -26,7 +26,7 @@ for exc_data in [
{ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
{ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
{ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
# Times
diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py
index 33e223575..88859fefb 100644
--- a/spacy/lang/fi/tokenizer_exceptions.py
+++ b/spacy/lang/fi/tokenizer_exceptions.py
@@ -73,7 +73,7 @@ for exc_data in [
{ORTH: "ts.", LEMMA: "toisin sanoen"},
{ORTH: "vm.", LEMMA: "viimeksi mainittu"},
{ORTH: "srk.", LEMMA: "seurakunta"}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py
index 442b367dd..9994686ac 100644
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@@ -54,7 +54,7 @@ for exc_data in [
{LEMMA: "degrés", ORTH: "d°"},
{LEMMA: "saint", ORTH: "St."},
{LEMMA: "sainte", ORTH: "Ste."}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
for orth in FR_BASE_EXCEPTIONS + ["etc."]:
diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py
new file mode 100644
index 000000000..61b7f38ea
--- /dev/null
+++ b/spacy/lang/hr/__init__.py
@@ -0,0 +1,27 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .stop_words import STOP_WORDS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
+
+
+class CroatianDefaults(Language.Defaults):
+ lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+ lex_attr_getters[LANG] = lambda text: 'hr'
+ lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
+ tokenizer_exceptions = update_exc(BASE_EXCEPTIONS)
+ stop_words = STOP_WORDS
+
+
+class Croatian(Language):
+ lang = 'hr'
+ Defaults = CroatianDefaults
+
+
+__all__ = ['Croatian']
+
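
Note on the new hr subpackage above: once spacy/lang/hr/__init__.py is importable, the blank Croatian pipeline can be constructed directly and already carries the stop-word list. A short usage sketch (illustrative only; no trained model is involved, just the rule-based tokenizer and language data):

    from spacy.lang.hr import Croatian

    nlp = Croatian()                        # blank pipeline: tokenizer + language defaults
    doc = nlp("Ovo je jedna rečenica.")
    print([t.text for t in doc])
    print("je" in nlp.Defaults.stop_words)  # True; "je" comes from stop_words.py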
diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py
new file mode 100644
index 000000000..bf91229a0
--- /dev/null
+++ b/spacy/lang/hr/stop_words.py
@@ -0,0 +1,187 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# Source: https://github.com/stopwords-iso/stopwords-hr
+
+STOP_WORDS = set("""
+a
+ako
+ali
+bi
+bih
+bila
+bili
+bilo
+bio
+bismo
+biste
+biti
+bumo
+da
+do
+duž
+ga
+hoće
+hoćemo
+hoćete
+hoćeš
+hoću
+i
+iako
+ih
+ili
+iz
+ja
+je
+jedna
+jedne
+jedno
+jer
+jesam
+jesi
+jesmo
+jest
+jeste
+jesu
+jim
+joj
+još
+ju
+kada
+kako
+kao
+koja
+koje
+koji
+kojima
+koju
+kroz
+li
+me
+mene
+meni
+mi
+mimo
+moj
+moja
+moje
+mu
+na
+nad
+nakon
+nam
+nama
+nas
+naš
+naša
+naše
+našeg
+ne
+nego
+neka
+neki
+nekog
+neku
+nema
+netko
+neće
+nećemo
+nećete
+nećeš
+neću
+nešto
+ni
+nije
+nikoga
+nikoje
+nikoju
+nisam
+nisi
+nismo
+niste
+nisu
+njega
+njegov
+njegova
+njegovo
+njemu
+njezin
+njezina
+njezino
+njih
+njihov
+njihova
+njihovo
+njim
+njima
+njoj
+nju
+no
+o
+od
+odmah
+on
+ona
+oni
+ono
+ova
+pa
+pak
+po
+pod
+pored
+prije
+s
+sa
+sam
+samo
+se
+sebe
+sebi
+si
+smo
+ste
+su
+sve
+svi
+svog
+svoj
+svoja
+svoje
+svom
+ta
+tada
+taj
+tako
+te
+tebe
+tebi
+ti
+to
+toj
+tome
+tu
+tvoj
+tvoja
+tvoje
+u
+uz
+vam
+vama
+vas
+vaš
+vaša
+vaše
+već
+vi
+vrlo
+za
+zar
+će
+ćemo
+ćete
+ćeš
+ću
+što
+""".split())
diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py
index 1529315ca..764866732 100644
--- a/spacy/lang/nb/tokenizer_exceptions.py
+++ b/spacy/lang/nb/tokenizer_exceptions.py
@@ -11,7 +11,7 @@ for exc_data in [
{ORTH: "jan.", LEMMA: "januar"},
{ORTH: "feb.", LEMMA: "februar"},
{ORTH: "jul.", LEMMA: "juli"}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
for orth in [
diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py
index fb87ae8a6..269634671 100644
--- a/spacy/lang/pl/tokenizer_exceptions.py
+++ b/spacy/lang/pl/tokenizer_exceptions.py
@@ -1,7 +1,7 @@
# encoding: utf8
from __future__ import unicode_literals
-from ..symbols import ORTH, LEMMA, POS
+from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN
_exc = {}
@@ -13,7 +13,7 @@ for exc_data in [
{ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV},
{ORTH: "tj.", LEMMA: "to jest", POS: ADV},
{ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)],
+ _exc[exc_data[ORTH]] = [exc_data]
for orth in [
"w.", "r."]:
diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py
new file mode 100644
index 000000000..e66fad691
--- /dev/null
+++ b/spacy/lang/ro/__init__.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
+
+
+class RomanianDefaults(Language.Defaults):
+ lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+ lex_attr_getters[LANG] = lambda text: 'ro'
+ lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
+ tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+ stop_words = STOP_WORDS
+
+
+class Romanian(Language):
+ lang = 'ro'
+ Defaults = RomanianDefaults
+
+
+__all__ = ['Romanian']
+
diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py
new file mode 100644
index 000000000..ffaaea7c1
--- /dev/null
+++ b/spacy/lang/ro/stop_words.py
@@ -0,0 +1,442 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# Source: https://github.com/stopwords-iso/stopwords-ro
+
+STOP_WORDS = set("""
+a
+abia
+acea
+aceasta
+această
+aceea
+aceeasi
+acei
+aceia
+acel
+acela
+acelasi
+acele
+acelea
+acest
+acesta
+aceste
+acestea
+acestei
+acestia
+acestui
+aceşti
+aceştia
+acolo
+acord
+acum
+adica
+ai
+aia
+aibă
+aici
+aiurea
+al
+ala
+alaturi
+ale
+alea
+alt
+alta
+altceva
+altcineva
+alte
+altfel
+alti
+altii
+altul
+am
+anume
+apoi
+ar
+are
+as
+asa
+asemenea
+asta
+astazi
+astea
+astfel
+astăzi
+asupra
+atare
+atat
+atata
+atatea
+atatia
+ati
+atit
+atita
+atitea
+atitia
+atunci
+au
+avea
+avem
+aveţi
+avut
+azi
+aş
+aşadar
+aţi
+b
+ba
+bine
+bucur
+bună
+c
+ca
+cam
+cand
+capat
+care
+careia
+carora
+caruia
+cat
+catre
+caut
+ce
+cea
+ceea
+cei
+ceilalti
+cel
+cele
+celor
+ceva
+chiar
+ci
+cinci
+cind
+cine
+cineva
+cit
+cita
+cite
+citeva
+citi
+citiva
+conform
+contra
+cu
+cui
+cum
+cumva
+curând
+curînd
+când
+cât
+câte
+câtva
+câţi
+cînd
+cît
+cîte
+cîtva
+cîţi
+că
+căci
+cărei
+căror
+cărui
+către
+d
+da
+daca
+dacă
+dar
+dat
+datorită
+dată
+dau
+de
+deasupra
+deci
+decit
+degraba
+deja
+deoarece
+departe
+desi
+despre
+deşi
+din
+dinaintea
+dintr
+dintr-
+dintre
+doar
+doi
+doilea
+două
+drept
+dupa
+după
+dă
+e
+ea
+ei
+el
+ele
+era
+eram
+este
+eu
+exact
+eşti
+f
+face
+fara
+fata
+fel
+fi
+fie
+fiecare
+fii
+fim
+fiu
+fiţi
+foarte
+fost
+frumos
+fără
+g
+geaba
+graţie
+h
+halbă
+i
+ia
+iar
+ieri
+ii
+il
+imi
+in
+inainte
+inapoi
+inca
+incit
+insa
+intr
+intre
+isi
+iti
+j
+k
+l
+la
+le
+li
+lor
+lui
+lângă
+lîngă
+m
+ma
+mai
+mare
+mea
+mei
+mele
+mereu
+meu
+mi
+mie
+mine
+mod
+mult
+multa
+multe
+multi
+multă
+mulţi
+mulţumesc
+mâine
+mîine
+mă
+n
+ne
+nevoie
+ni
+nici
+niciodata
+nicăieri
+nimeni
+nimeri
+nimic
+niste
+nişte
+noastre
+noastră
+noi
+noroc
+nostri
+nostru
+nou
+noua
+nouă
+noştri
+nu
+numai
+o
+opt
+or
+ori
+oricare
+orice
+oricine
+oricum
+oricând
+oricât
+oricînd
+oricît
+oriunde
+p
+pai
+parca
+patra
+patru
+patrulea
+pe
+pentru
+peste
+pic
+pina
+plus
+poate
+pot
+prea
+prima
+primul
+prin
+printr-
+putini
+puţin
+puţina
+puţină
+până
+pînă
+r
+rog
+s
+sa
+sa-mi
+sa-ti
+sai
+sale
+sau
+se
+si
+sint
+sintem
+spate
+spre
+sub
+sunt
+suntem
+sunteţi
+sus
+sută
+sînt
+sîntem
+sînteţi
+să
+săi
+său
+t
+ta
+tale
+te
+ti
+timp
+tine
+toata
+toate
+toată
+tocmai
+tot
+toti
+totul
+totusi
+totuşi
+toţi
+trei
+treia
+treilea
+tu
+tuturor
+tăi
+tău
+u
+ul
+ului
+un
+una
+unde
+undeva
+unei
+uneia
+unele
+uneori
+unii
+unor
+unora
+unu
+unui
+unuia
+unul
+v
+va
+vi
+voastre
+voastră
+voi
+vom
+vor
+vostru
+vouă
+voştri
+vreme
+vreo
+vreun
+vă
+x
+z
+zece
+zero
+zi
+zice
+îi
+îl
+îmi
+împotriva
+în
+înainte
+înaintea
+încotro
+încât
+încît
+între
+întrucât
+întrucît
+îţi
+ăla
+ălea
+ăsta
+ăstea
+ăştia
+şapte
+şase
+şi
+ştiu
+ţi
+ţie
+""".split())
diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py
new file mode 100644
index 000000000..42ccd6a93
--- /dev/null
+++ b/spacy/lang/ro/tokenizer_exceptions.py
@@ -0,0 +1,17 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH
+
+
+_exc = {}
+
+
+# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations
+for orth in [
+ "1-a", "1-ul", "10-a", "10-lea", "2-a", "3-a", "3-lea", "6-lea",
+ "d-voastră", "dvs.", "Rom.", "str."]:
+ _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py
index 0575c3892..64aedf8af 100644
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@@ -68,7 +68,7 @@ for exc_data in [
{ORTH: "Sön.", LEMMA: "Söndag"},
{ORTH: "sthlm", LEMMA: "Stockholm"},
{ORTH: "gbg", LEMMA: "Göteborg"}]:
- _exc[exc_data[ORTH]] = [dict(exc_data)]
+ _exc[exc_data[ORTH]] = [exc_data]
for orth in [
diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py
index 73ad88d08..89e1b1476 100644
--- a/spacy/lang/tokenizer_exceptions.py
+++ b/spacy/lang/tokenizer_exceptions.py
@@ -68,7 +68,7 @@ for exc_data in [
{ORTH: "\\n", POS: SPACE},
{ORTH: "\u2014", POS: PUNCT, LEMMA: "--"},
{ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]:
- BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)]
+ BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data]
for orth in [
diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py
new file mode 100644
index 000000000..d1cd04f42
--- /dev/null
+++ b/spacy/lang/tr/__init__.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .stop_words import STOP_WORDS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
+
+
+class TurkishDefaults(Language.Defaults):
+ lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+ lex_attr_getters[LANG] = lambda text: 'tr'
+ lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS)
+ tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+ stop_words = STOP_WORDS
+
+
+class Turkish(Language):
+ lang = 'tr'
+ Defaults = TurkishDefaults
+
+
+__all__ = ['Turkish']
+
diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py
new file mode 100644
index 000000000..aaed02a3e
--- /dev/null
+++ b/spacy/lang/tr/stop_words.py
@@ -0,0 +1,512 @@
+# encoding: utf8
+from __future__ import unicode_literals
+
+
+# Source: https://github.com/stopwords-iso/stopwords-tr
+
+STOP_WORDS = set("""
+acaba
+acep
+adamakıllı
+adeta
+ait
+altmýþ
+altmış
+altý
+altı
+ama
+amma
+anca
+ancak
+arada
+artýk
+aslında
+aynen
+ayrıca
+az
+açıkça
+açıkçası
+bana
+bari
+bazen
+bazý
+bazı
+başkası
+baţka
+belki
+ben
+benden
+beni
+benim
+beri
+beriki
+beþ
+beş
+beţ
+bilcümle
+bile
+bin
+binaen
+binaenaleyh
+bir
+biraz
+birazdan
+birbiri
+birden
+birdenbire
+biri
+birice
+birileri
+birisi
+birkaç
+birkaçı
+birkez
+birlikte
+birçok
+birçoğu
+birþey
+birþeyi
+birşey
+birşeyi
+birţey
+bitevi
+biteviye
+bittabi
+biz
+bizatihi
+bizce
+bizcileyin
+bizden
+bize
+bizi
+bizim
+bizimki
+bizzat
+boşuna
+bu
+buna
+bunda
+bundan
+bunlar
+bunları
+bunların
+bunu
+bunun
+buracıkta
+burada
+buradan
+burası
+böyle
+böylece
+böylecene
+böylelikle
+böylemesine
+böylesine
+büsbütün
+bütün
+cuk
+cümlesi
+da
+daha
+dahi
+dahil
+dahilen
+daima
+dair
+dayanarak
+de
+defa
+dek
+demin
+demincek
+deminden
+denli
+derakap
+derhal
+derken
+deđil
+değil
+değin
+diye
+diđer
+diğer
+diğeri
+doksan
+dokuz
+dolayı
+dolayısıyla
+doğru
+dört
+edecek
+eden
+ederek
+edilecek
+ediliyor
+edilmesi
+ediyor
+elbet
+elbette
+elli
+emme
+en
+enikonu
+epey
+epeyce
+epeyi
+esasen
+esnasında
+etmesi
+etraflı
+etraflıca
+etti
+ettiği
+ettiğini
+evleviyetle
+evvel
+evvela
+evvelce
+evvelden
+evvelemirde
+evveli
+eđer
+eğer
+fakat
+filanca
+gah
+gayet
+gayetle
+gayri
+gayrı
+gelgelelim
+gene
+gerek
+gerçi
+geçende
+geçenlerde
+gibi
+gibilerden
+gibisinden
+gine
+göre
+gırla
+hakeza
+halbuki
+halen
+halihazırda
+haliyle
+handiyse
+hangi
+hangisi
+hani
+hariç
+hasebiyle
+hasılı
+hatta
+hele
+hem
+henüz
+hep
+hepsi
+her
+herhangi
+herkes
+herkesin
+hiç
+hiçbir
+hiçbiri
+hoş
+hulasaten
+iken
+iki
+ila
+ile
+ilen
+ilgili
+ilk
+illa
+illaki
+imdi
+indinde
+inen
+insermi
+ise
+ister
+itibaren
+itibariyle
+itibarıyla
+iyi
+iyice
+iyicene
+için
+iş
+işte
+iţte
+kadar
+kaffesi
+kah
+kala
+kanýmca
+karşın
+katrilyon
+kaynak
+kaçı
+kelli
+kendi
+kendilerine
+kendini
+kendisi
+kendisine
+kendisini
+kere
+kez
+keza
+kezalik
+keşke
+keţke
+ki
+kim
+kimden
+kime
+kimi
+kimisi
+kimse
+kimsecik
+kimsecikler
+külliyen
+kýrk
+kýsaca
+kırk
+kısaca
+lakin
+leh
+lütfen
+maada
+madem
+mademki
+mamafih
+mebni
+međer
+meğer
+meğerki
+meğerse
+milyar
+milyon
+mu
+mü
+mý
+mı
+nasýl
+nasıl
+nasılsa
+nazaran
+naşi
+ne
+neden
+nedeniyle
+nedenle
+nedense
+nerde
+nerden
+nerdeyse
+nere
+nerede
+nereden
+neredeyse
+neresi
+nereye
+netekim
+neye
+neyi
+neyse
+nice
+nihayet
+nihayetinde
+nitekim
+niye
+niçin
+o
+olan
+olarak
+oldu
+olduklarını
+oldukça
+olduğu
+olduğunu
+olmadı
+olmadığı
+olmak
+olması
+olmayan
+olmaz
+olsa
+olsun
+olup
+olur
+olursa
+oluyor
+on
+ona
+onca
+onculayın
+onda
+ondan
+onlar
+onlardan
+onlari
+onlarýn
+onları
+onların
+onu
+onun
+oracık
+oracıkta
+orada
+oradan
+oranca
+oranla
+oraya
+otuz
+oysa
+oysaki
+pek
+pekala
+peki
+pekçe
+peyderpey
+rağmen
+sadece
+sahi
+sahiden
+sana
+sanki
+sekiz
+seksen
+sen
+senden
+seni
+senin
+siz
+sizden
+sizi
+sizin
+sonra
+sonradan
+sonraları
+sonunda
+tabii
+tam
+tamam
+tamamen
+tamamıyla
+tarafından
+tek
+trilyon
+tüm
+var
+vardı
+vasıtasıyla
+ve
+velev
+velhasıl
+velhasılıkelam
+veya
+veyahut
+ya
+yahut
+yakinen
+yakında
+yakından
+yakınlarda
+yalnız
+yalnızca
+yani
+yapacak
+yapmak
+yaptı
+yaptıkları
+yaptığı
+yaptığını
+yapılan
+yapılması
+yapıyor
+yedi
+yeniden
+yenilerde
+yerine
+yetmiþ
+yetmiş
+yetmiţ
+yine
+yirmi
+yok
+yoksa
+yoluyla
+yüz
+yüzünden
+zarfında
+zaten
+zati
+zira
+çabuk
+çabukça
+çeşitli
+çok
+çokları
+çoklarınca
+çokluk
+çoklukla
+çokça
+çoğu
+çoğun
+çoğunca
+çoğunlukla
+çünkü
+öbür
+öbürkü
+öbürü
+önce
+önceden
+önceleri
+öncelikle
+öteki
+ötekisi
+öyle
+öylece
+öylelikle
+öylemesine
+öz
+üzere
+üç
+þey
+þeyden
+þeyi
+þeyler
+þu
+þuna
+þunda
+þundan
+þunu
+şayet
+şey
+şeyden
+şeyi
+şeyler
+şu
+şuna
+şuncacık
+şunda
+şundan
+şunlar
+şunları
+şunu
+şunun
+şura
+şuracık
+şuracıkta
+şurası
+şöyle
+ţayet
+ţimdi
+ţu
+ţöyle
+""".split())
diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py
new file mode 100644
index 000000000..c945c0058
--- /dev/null
+++ b/spacy/lang/tr/tokenizer_exceptions.py
@@ -0,0 +1,27 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH, NORM
+
+
+# These exceptions are mostly for example purposes – hoping that Turkish
+# speakers can contribute in the future! Source of copy-pasted examples:
+# https://en.wiktionary.org/wiki/Category:Turkish_language
+
+_exc = {
+ "sağol": [
+ {ORTH: "sağ"},
+ {ORTH: "ol", NORM: "olun"}]
+}
+
+
+for exc_data in [
+ {ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"}]:
+ _exc[exc_data[ORTH]] = [exc_data]
+
+
+for orth in ["Dr."]:
+ _exc[orth] = [{ORTH: orth}]
+
+
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index ee4093db3..2d1b03514 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -18,7 +18,7 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id',
'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx']
_models = {'en': ['en_core_web_sm'],
'de': ['de_core_news_md'],
- 'fr': ['fr_depvec_web_lg'],
+ 'fr': ['fr_core_news_sm'],
'xx': ['xx_ent_web_md']}
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index 762ea4c08..5c69dae3e 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -6,6 +6,7 @@ from .. import util
from ..displacy import parse_deps, parse_ents
from ..tokens import Span
from .util import get_doc
+from .._ml import PrecomputableAffine
from pathlib import Path
import pytest
@@ -59,3 +60,19 @@ def test_displacy_parse_deps(en_vocab):
assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'},
{'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'},
{'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}]
+
+
+def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
+ model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
+ assert model.W.shape == (nF, nO, nP, nI)
+ tensor = model.ops.allocate((10, nI))
+ Y, get_dX = model.begin_update(tensor)
+ assert Y.shape == (tensor.shape[0]+1, nF, nO, nP)
+ assert model.d_pad.shape == (1, nF, nO, nP)
+ dY = model.ops.allocate((15, nF, nO, nP))
+ ids = model.ops.allocate((15, nF))
+ ids[1,2] = -1
+ dY[1,2] = 1
+ assert model.d_pad[0, 2, 0, 0] == 0.
+ model._backprop_padding(dY, ids)
+ assert model.d_pad[0, 2, 0, 0] == 1.
diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade
index 1cab930fb..c7742fa38 100644
--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@@ -40,6 +40,8 @@ for id in CURRENT_MODELS
each label in ["Pipeline", "Vectors", "Sources", "Author", "License"]
- var field = label.toLowerCase()
+ if field == "vectors"
+ - field = "vecs"
+row
+cell.u-nowrap
+label=label
diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade
index 05a468076..0be2e2e98 100644
--- a/website/_includes/_scripts.jade
+++ b/website/_includes/_scripts.jade
@@ -13,7 +13,6 @@ script(src="/assets/js/vendor/prism.min.js")
if SECTION == "models"
script(src="/assets/js/vendor/chart.min.js")
- script(src="/assets/js/models.js?v#{V_JS}" type="module")
script
if quickstart
@@ -24,15 +23,15 @@ script
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
-
-if IS_PAGE
- script
+ if IS_PAGE
| ((window.gitter = {}).chat = {}).options = {
| useStyles: false,
| activationElement: '.js-gitter-button',
| targetElement: '.js-gitter',
| room: '!{SOCIAL.gitter}'
| };
+
+if IS_PAGE
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
@@ -48,39 +47,36 @@ if IS_PAGE
- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
- ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"
-//- Browsers with JS module support.
- Will be ignored otherwise.
-
-script(type="module")
- | import ProgressBar from '/assets/js/progress.js';
- !=ProgressBar
- if changelog
- | import Changelog from '/assets/js/changelog.js';
- !=Changelog
- if IS_PAGE
- | import NavHighlighter from '/assets/js/nav-highlighter.js';
- !=NavHighlighter
- | import GitHubEmbed from '/assets/js/github-embed.js';
- !=GitHubEmbed
- if HAS_MODELS
- | import { ModelLoader } from '/assets/js/models.js';
- !=ModelLoader
- if compare_models
- | import { ModelComparer } from '/assets/js/models.js';
- !=ModelComparer
-
-//- Browsers with no JS module support.
- Won't be fetched or interpreted otherwise.
-
-script(nomodule src="/assets/js/rollup.js")
-script(nomodule)
- !=ProgressBar
- if changelog
- !=Changelog
- if IS_PAGE
- !=NavHighlighter
- !=GitHubEmbed
- if HAS_MODELS
- !=ModeLoader
- if compare_models
- !=ModelComparer
+if environment == "deploy"
+ //- DEPLOY: use compiled rollup.js and instantiate classes directly
+ script(src="/assets/js/rollup.js")
+ script
+ !=ProgressBar
+ if changelog
+ !=Changelog
+ if IS_PAGE
+ !=NavHighlighter
+ !=GitHubEmbed
+ if HAS_MODELS
+            !=ModelLoader
+ if compare_models
+ !=ModelComparer
+else
+ //- DEVELOPMENT: Use ES6 modules
+ script(type="module")
+ | import ProgressBar from '/assets/js/progress.js';
+ !=ProgressBar
+ if changelog
+ | import Changelog from '/assets/js/changelog.js';
+ !=Changelog
+ if IS_PAGE
+ | import NavHighlighter from '/assets/js/nav-highlighter.js';
+ !=NavHighlighter
+ | import GitHubEmbed from '/assets/js/github-embed.js';
+ !=GitHubEmbed
+ if HAS_MODELS
+ | import { ModelLoader } from '/assets/js/models.js';
+ !=ModelLoader
+ if compare_models
+ | import { ModelComparer } from '/assets/js/models.js';
+ !=ModelComparer
diff --git a/website/assets/css/_base/_layout.sass b/website/assets/css/_base/_layout.sass
index 1b725fdbf..64fc3808a 100644
--- a/website/assets/css/_base/_layout.sass
+++ b/website/assets/css/_base/_layout.sass
@@ -12,7 +12,6 @@ body
animation: fadeIn 0.25s ease
background: $color-back
color: $color-front
- //scroll-behavior: smooth
//- Paragraphs
diff --git a/website/assets/js/models.js b/website/assets/js/models.js
index 2d371ee1f..134a0e66c 100644
--- a/website/assets/js/models.js
+++ b/website/assets/js/models.js
@@ -20,21 +20,33 @@ const CHART_FONTS = {
* @property {function} vectors - Format vector data (entries and dimensions).
* @property {function} version - Format model version number.
*/
-export const formats = {
+const formats = {
    author: (author, url) => url ? `<a href="${url}">${author}</a>` : author,
    license: (license, url) => url ? `<a href="${url}">${license}</a>` : license,
sources: sources => (sources instanceof Array) ? sources.join(', ') : sources,
    pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `<code>${p}</code>`).join(', ') : '-',
- vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a',
+ vectors: vec => formatVectors(vec),
    version: version => `<code>v${version}</code>`
};
+/**
+ * Format word vectors data depending on contents.
+ * @property {Object} data - The vectors object from the model's meta.json.
+ */
+const formatVectors = data => {
+ if (!data) return 'n/a';
+ if (Object.values(data).every(n => n == 0)) return 'context vectors only';
+ const { keys, vectors: vecs, width } = data;
+ return `${abbrNumber(keys)} keys, ${abbrNumber(vecs)} unique vectors (${width} dimensions)`;
+}
+
+
/**
* Find the latest version of a model in a compatibility table.
* @param {string} model - The model name.
* @param {Object} compat - Compatibility table, keyed by spaCy version.
*/
-export const getLatestVersion = (model, compat = {}) => {
+const getLatestVersion = (model, compat = {}) => {
for (let [spacy_v, models] of Object.entries(compat)) {
if (models[model]) return models[model][0];
}
@@ -90,7 +102,7 @@ export class ModelLoader {
const tpl = new Templater(modelId);
tpl.get('table').removeAttribute('data-loading');
tpl.get('error').style.display = 'block';
- for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) {
+ for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) {
tpl.get(key).parentElement.parentElement.style.display = 'none';
}
}
@@ -120,8 +132,8 @@ export class ModelLoader {
if (author) tpl.fill('author', formats.author(author, url), true);
if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true);
if (sources) tpl.fill('sources', formats.sources(sources));
- if (vectors) tpl.fill('vectors', formats.vectors(vectors));
- else tpl.get('vectors').parentElement.parentElement.style.display = 'none';
+ if (vectors) tpl.fill('vecs', formats.vectors(vectors));
+ else tpl.get('vecs').parentElement.parentElement.style.display = 'none';
if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true);
else tpl.get('pipeline').parentElement.parentElement.style.display = 'none';
}
@@ -186,6 +198,7 @@ export class ModelComparer {
this.fonts = CHART_FONTS;
this.defaultModels = defaultModels;
this.tpl.get('result').style.display = 'block';
+ this.tpl.get('error').style.display = 'none';
this.fetchCompat()
.then(compat => this.init(compat))
.catch(this.showError.bind(this))
@@ -223,8 +236,9 @@ export class ModelComparer {
const version = getLatestVersion(name, this.compat);
const modelName = `${name}-${version}`;
return new Promise((resolve, reject) => {
+ if (!version) reject();
// resolve immediately if model already loaded, e.g. in this.models
- if (this.models[name]) resolve(this.models[name]);
+ else if (this.models[name]) resolve(this.models[name]);
else fetch(`${this.url}/meta/${modelName}.json`)
.then(res => handleResponse(res))
.then(json => json.ok ? resolve(this.saveModel(name, json)) : reject())
@@ -306,12 +320,13 @@ export class ModelComparer {
this.tpl.fill(`size${i}`, size);
this.tpl.fill(`desc${i}`, description || 'n/a');
this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true);
- this.tpl.fill(`vectors${i}`, formats.vectors(vectors));
+ this.tpl.fill(`vecs${i}`, formats.vectors(vectors));
this.tpl.fill(`sources${i}`, formats.sources(sources));
this.tpl.fill(`author${i}`, formats.author(author, url), true);
this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true);
// check if model accuracy or speed includes one of the pre-set keys
- for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) {
+ const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v)));
+ for (let key of allKeys) {
if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2))
else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key])))
else this.tpl.fill(`${key}${i}`, 'n/a')
diff --git a/website/assets/js/util.js b/website/assets/js/util.js
index 65d05774c..90e0b5994 100644
--- a/website/assets/js/util.js
+++ b/website/assets/js/util.js
@@ -59,11 +59,12 @@ export const convertNumber = (num = 0, separator = ',') =>
* @param {number|string} num - The number to convert.
* @param {number} fixed - Number of decimals.
*/
-export const abbrNumber = (num = 0, fixed = 2) => {
+export const abbrNumber = (num = 0, fixed = 1) => {
const suffixes = ['', 'k', 'm', 'b', 't'];
if (num === null || num === 0) return 0;
const b = num.toPrecision(2).split('e');
const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3);
- const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1);
+ const n = (k < 1) ? num : num / Math.pow(10, k * 3);
+    const c = (k >= 1 && n >= 100) ? Math.round(n) : n.toFixed(fixed);
return (c < 0 ? c : Math.abs(c)) + suffixes[k];
}
diff --git a/website/models/_data.json b/website/models/_data.json
index cb971e20c..c63101ad0 100644
--- a/website/models/_data.json
+++ b/website/models/_data.json
@@ -12,6 +12,7 @@
"Portuguese": "pt",
"French": "fr",
"Italian": "it",
+ "Dutch": "nl",
"Multi-Language": "xx"
}
},
@@ -40,11 +41,9 @@
"MODELS": {
"en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
- "de": ["de_dep_news_sm"],
- "es": ["es_core_web_sm"],
- "pt": [],
- "fr": [],
- "it": [],
+ "de": ["de_core_news_sm"],
+ "es": ["es_core_news_sm", "es_core_news_md"],
+ "it": ["it_core_news_sm"],
"xx": ["xx_ent_wiki_sm"]
},
@@ -66,6 +65,7 @@
"gpu": "words per second on GPU",
"pipeline": "Processing pipeline components in order",
"sources": "Sources of training data",
+ "vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.",
"benchmark_parser": "Parser accuracy",
"benchmark_ner": "NER accuracy",
"benchmark_speed": "Speed"
@@ -74,9 +74,11 @@
"MODEL_LICENSES": {
"CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/",
"CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/",
+ "CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/",
"CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/",
"CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/",
- "GPL": "http://www.gnu.de/documents/gpl.en.html"
+ "GPL": "https://www.gnu.org/licenses/gpl.html",
+ "LGPL": "https://www.gnu.org/licenses/lgpl.html"
},
"MODEL_BENCHMARKS": {
@@ -99,6 +101,9 @@
"da": "Danish",
"hu": "Hungarian",
"pl": "Polish",
+ "ro": "Romanian",
+ "hr": "Croatian",
+ "tr": "Turkish",
"he": "Hebrew",
"ga": "Irish",
"bn": "Bengali",
diff --git a/website/models/comparison.jade b/website/models/comparison.jade
index 881a9aff4..b0ab61efe 100644
--- a/website/models/comparison.jade
+++ b/website/models/comparison.jade
@@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none")
for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"]
- var field = label.toLowerCase()
+ if field == "vectors"
+ - field = "vecs"
+row
+cell.u-nowrap
+label=label
diff --git a/website/models/nl.jade b/website/models/nl.jade
new file mode 100644
index 000000000..081b4a712
--- /dev/null
+++ b/website/models/nl.jade
@@ -0,0 +1,6 @@
+//- 💫 DOCS > MODELS > NL
+
+include ../_includes/_mixins
+
+//- This is a placeholder. The page is rendered via the template at
+//- /_includes/_page_models.jade.
diff --git a/website/package.json b/website/package.json
index f0a57da4f..fe22e6787 100644
--- a/website/package.json
+++ b/website/package.json
@@ -9,7 +9,8 @@
"babel-cli": "^6.14.0",
"harp": "^0.24.0",
"rollup": "^0.50.0",
- "uglify-js": "^2.7.3"
+ "uglify-js": "^2.7.3",
+ "broken-link-checker": "^0.7.6"
},
"dependencies": {},
"scripts": {
diff --git a/website/usage/_adding-languages/_language-data.jade b/website/usage/_adding-languages/_language-data.jade
index dc86b7a03..f0b346886 100644
--- a/website/usage/_adding-languages/_language-data.jade
+++ b/website/usage/_adding-languages/_language-data.jade
@@ -218,7 +218,7 @@ p
| If an exception consists of more than one token, the #[code ORTH] values
| combined always need to #[strong match the original string]. The way the
| original string is split up can be pretty arbitrary sometimes – for
- | example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
+ | example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
| Because of how the tokenizer works, it's currently not possible to split
| single-letter strings into multiple tokens.
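
Note on the docs paragraph above: for a multi-token exception, the ORTH values concatenated must reproduce the original string exactly, e.g. "gon" + "na" == "gonna". A short sketch of what that looks like through the tokenizer API (illustrative only; the English data already ships a similar exception, so adding it again is purely for demonstration):

    from spacy.lang.en import English
    from spacy.symbols import ORTH, LEMMA

    nlp = English()
    nlp.tokenizer.add_special_case("gonna", [
        {ORTH: "gon", LEMMA: "go"},   # "gon" + "na" must match "gonna" exactly
        {ORTH: "na", LEMMA: "to"},
    ])
    doc = nlp("I'm gonna leave")
    print([t.text for t in doc])      # "gonna" comes out as "gon", "na"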
diff --git a/website/usage/_spacy-101/_word-vectors.jade b/website/usage/_spacy-101/_word-vectors.jade
index bb9add8a6..c38360014 100644
--- a/website/usage/_spacy-101/_word-vectors.jade
+++ b/website/usage/_spacy-101/_word-vectors.jade
@@ -4,9 +4,9 @@ p
| Similarity is determined by comparing #[strong word vectors] or "word
| embeddings", multi-dimensional meaning representations of a word. Word
| vectors can be generated using an algorithm like
- | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's
- | #[+a("/models") default models] come with
- | #[strong 300-dimensional vectors] that look like this:
+ | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium
+ | #[code md] and large #[code lg] #[+a("/models") models] come with
+ | #[strong multi-dimensional vectors] that look like this:
+code("banana.vector", false, false, 250).
array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01,
diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade
index 07ad6bcd4..734495c6e 100644
--- a/website/usage/_vectors-similarity/_basics.jade
+++ b/website/usage/_vectors-similarity/_basics.jade
@@ -4,12 +4,9 @@
| Dense, real valued vectors representing distributional similarity
| information are now a cornerstone of practical NLP. The most common way
| to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]
- | family of algorithms. The default
- | #[+a("/models/en") English model] installs
- | 300-dimensional vectors trained on the
- | #[+a("http://commoncrawl.org") Common Crawl] corpus.
- | If you need to train a word2vec model, we recommend the implementation in
- | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim].
+ | family of algorithms. If you need to train a word2vec model, we recommend
+ | the implementation in the Python library
+ | #[+a("https://radimrehurek.com/gensim/") Gensim].
include ../_spacy-101/_similarity
include ../_spacy-101/_word-vectors