From 1976fb157f13df28817f334adb427ffccca1df6b Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 21:49:57 +0100 Subject: [PATCH 01/21] Update licenses --- website/models/_data.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/models/_data.json b/website/models/_data.json index cb971e20c..05a96a24f 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -76,7 +76,8 @@ "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/", "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/", - "GPL": "http://www.gnu.de/documents/gpl.en.html" + "GPL": "https://www.gnu.org/licenses/gpl.html", + "LGPL": "https://www.gnu.org/licenses/lgpl.html" }, "MODEL_BENCHMARKS": { From 2fa53b39d52149dc412d051bea88daf998fa47d9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:01:06 +0100 Subject: [PATCH 02/21] Add dev dependency --- website/package.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/package.json b/website/package.json index f0a57da4f..fe22e6787 100644 --- a/website/package.json +++ b/website/package.json @@ -9,7 +9,8 @@ "babel-cli": "^6.14.0", "harp": "^0.24.0", "rollup": "^0.50.0", - "uglify-js": "^2.7.3" + "uglify-js": "^2.7.3", + "broken-link-checker": "^0.7.6" }, "dependencies": {}, "scripts": { From 408f450ce019ca6a62dde9d5c539f4c468750b4c Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:01:12 +0100 Subject: [PATCH 03/21] Tidy up --- website/assets/css/_base/_layout.sass | 1 - 1 file changed, 1 deletion(-) diff --git a/website/assets/css/_base/_layout.sass b/website/assets/css/_base/_layout.sass index 1b725fdbf..64fc3808a 100644 --- a/website/assets/css/_base/_layout.sass +++ b/website/assets/css/_base/_layout.sass @@ -12,7 +12,6 @@ body animation: fadeIn 0.25s ease background: $color-back color: $color-front - //scroll-behavior: smooth //- Paragraphs From 3af281a3346315c6320c4c81efe58c495cf5c602 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:02:00 +0100 Subject: [PATCH 04/21] Update test model name --- spacy/tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index ee4093db3..2d1b03514 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -18,7 +18,7 @@ _languages = ['bn', 'da', 'de', 'en', 'es', 'fi', 'fr', 'ga', 'he', 'hu', 'id', 'it', 'nb', 'nl', 'pl', 'pt', 'sv', 'xx'] _models = {'en': ['en_core_web_sm'], 'de': ['de_core_news_md'], - 'fr': ['fr_depvec_web_lg'], + 'fr': ['fr_core_news_sm'], 'xx': ['xx_ent_web_md']} From 819e30a26ef65c070676d903e6bcc52fdb04cba1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:02:45 +0100 Subject: [PATCH 05/21] Tidy up tokenizer exceptions --- spacy/lang/bn/tokenizer_exceptions.py | 2 +- spacy/lang/da/tokenizer_exceptions.py | 3 +-- spacy/lang/de/tokenizer_exceptions.py | 2 +- spacy/lang/en/tokenizer_exceptions.py | 4 ++-- spacy/lang/es/tokenizer_exceptions.py | 2 +- spacy/lang/fi/tokenizer_exceptions.py | 2 +- spacy/lang/fr/tokenizer_exceptions.py | 2 +- spacy/lang/nb/tokenizer_exceptions.py | 2 +- spacy/lang/pl/tokenizer_exceptions.py | 2 +- spacy/lang/sv/tokenizer_exceptions.py | 2 +- spacy/lang/tokenizer_exceptions.py | 2 +- 11 files changed, 12 insertions(+), 13 deletions(-) diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py index 5c6de139b..dc1181335 100644 --- a/spacy/lang/bn/tokenizer_exceptions.py +++ 
b/spacy/lang/bn/tokenizer_exceptions.py @@ -20,7 +20,7 @@ for exc_data in [ {ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"}, {ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"}, {ORTH: "মি.লি.", LEMMA: "মিলিলিটার"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index e8edf36b8..c67c038bf 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -8,7 +8,6 @@ _exc = {} for exc_data in [ {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, - {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"}, {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"}, {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"}, @@ -21,7 +20,7 @@ for exc_data in [ {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"}, {ORTH: "Nov.", LEMMA: "november", NORM: "november"}, {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.", diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 0b23a1001..cb16fb06c 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -164,7 +164,7 @@ for exc_data in [ {ORTH: "z.b.", LEMMA: "zum Beispiel"}, {ORTH: "zzgl.", LEMMA: "zuzüglich"}, {ORTH: "österr.", LEMMA: "österreichisch", NORM: "österreichisch"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index 0e5bbc7f6..a76b5fb2b 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -276,7 +276,7 @@ for exc_data in [ exc_data_apos = dict(exc_data) exc_data_apos[ORTH] = "'" + exc_data_apos[ORTH] for data in [exc_data, exc_data_apos]: - _exc[data[ORTH]] = [dict(data)] + _exc[data[ORTH]] = [data] # Times @@ -440,7 +440,7 @@ for exc_data in [ {ORTH: "Va.", LEMMA: "Virginia", NORM: "Virginia"}, {ORTH: "Wash.", LEMMA: "Washington", NORM: "Washington"}, {ORTH: "Wis.", LEMMA: "Wisconsin", NORM: "Wisconsin"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index cb62f008f..d4131ddf6 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -26,7 +26,7 @@ for exc_data in [ {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"}, {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}, {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] # Times diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index 33e223575..88859fefb 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -73,7 +73,7 @@ for exc_data in [ {ORTH: "ts.", LEMMA: "toisin sanoen"}, {ORTH: "vm.", LEMMA: "viimeksi mainittu"}, {ORTH: "srk.", LEMMA: "seurakunta"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 442b367dd..9994686ac 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -54,7 +54,7 @@ for exc_data in 
[ {LEMMA: "degrés", ORTH: "d°"}, {LEMMA: "saint", ORTH: "St."}, {LEMMA: "sainte", ORTH: "Ste."}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in FR_BASE_EXCEPTIONS + ["etc."]: diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 1529315ca..764866732 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -11,7 +11,7 @@ for exc_data in [ {ORTH: "jan.", LEMMA: "januar"}, {ORTH: "feb.", LEMMA: "februar"}, {ORTH: "jul.", LEMMA: "juli"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index fb87ae8a6..6098c2bb6 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -13,7 +13,7 @@ for exc_data in [ {ORTH: "tzn.", LEMMA: "to znaczy", POS: ADV}, {ORTH: "tj.", LEMMA: "to jest", POS: ADV}, {ORTH: "tzw.", LEMMA: "tak zwany", POS: ADJ}]: - _exc[exc_data[ORTH]] = [dict(exc_data)], + _exc[exc_data[ORTH]] = [exc_data] for orth in [ "w.", "r."]: diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index 0575c3892..64aedf8af 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -68,7 +68,7 @@ for exc_data in [ {ORTH: "Sön.", LEMMA: "Söndag"}, {ORTH: "sthlm", LEMMA: "Stockholm"}, {ORTH: "gbg", LEMMA: "Göteborg"}]: - _exc[exc_data[ORTH]] = [dict(exc_data)] + _exc[exc_data[ORTH]] = [exc_data] for orth in [ diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 73ad88d08..89e1b1476 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -68,7 +68,7 @@ for exc_data in [ {ORTH: "\\n", POS: SPACE}, {ORTH: "\u2014", POS: PUNCT, LEMMA: "--"}, {ORTH: "\u00a0", POS: SPACE, LEMMA: " "}]: - BASE_EXCEPTIONS[exc_data[ORTH]] = [dict(exc_data)] + BASE_EXCEPTIONS[exc_data[ORTH]] = [exc_data] for orth in [ From 18c859500b4797677a2adde351f2047b20b161cc Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:02:51 +0100 Subject: [PATCH 06/21] Add missing imports --- spacy/lang/pl/tokenizer_exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index 6098c2bb6..269634671 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals -from ..symbols import ORTH, LEMMA, POS +from ...symbols import ORTH, LEMMA, POS, ADV, ADJ, NOUN _exc = {} From c6fea3e5f61ec02f7df1ddcc44e7cb4ce61c7d6b Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:04:28 +0100 Subject: [PATCH 07/21] Add Romanian and Croatian skeletons (experimental) Add language data templates to make it easier for others to contribute to the language support --- spacy/lang/hr/__init__.py | 27 ++ spacy/lang/hr/stop_words.py | 187 +++++++++++ spacy/lang/ro/__init__.py | 28 ++ spacy/lang/ro/stop_words.py | 442 ++++++++++++++++++++++++++ spacy/lang/ro/tokenizer_exceptions.py | 17 + website/models/_data.json | 2 + 6 files changed, 703 insertions(+) create mode 100644 spacy/lang/hr/__init__.py create mode 100644 spacy/lang/hr/stop_words.py create mode 100644 spacy/lang/ro/__init__.py create mode 100644 spacy/lang/ro/stop_words.py create mode 100644 spacy/lang/ro/tokenizer_exceptions.py diff --git 
a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py new file mode 100644 index 000000000..61b7f38ea --- /dev/null +++ b/spacy/lang/hr/__init__.py @@ -0,0 +1,27 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class CroatianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'hr' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS) + stop_words = STOP_WORDS + + +class Croatian(Language): + lang = 'hr' + Defaults = CroatianDefaults + + +__all__ = ['Croatian'] + diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py new file mode 100644 index 000000000..bf91229a0 --- /dev/null +++ b/spacy/lang/hr/stop_words.py @@ -0,0 +1,187 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-hr + +STOP_WORDS = set(""" +a +ako +ali +bi +bih +bila +bili +bilo +bio +bismo +biste +biti +bumo +da +do +duž +ga +hoće +hoćemo +hoćete +hoćeš +hoću +i +iako +ih +ili +iz +ja +je +jedna +jedne +jedno +jer +jesam +jesi +jesmo +jest +jeste +jesu +jim +joj +još +ju +kada +kako +kao +koja +koje +koji +kojima +koju +kroz +li +me +mene +meni +mi +mimo +moj +moja +moje +mu +na +nad +nakon +nam +nama +nas +naš +naša +naše +našeg +ne +nego +neka +neki +nekog +neku +nema +netko +neće +nećemo +nećete +nećeš +neću +nešto +ni +nije +nikoga +nikoje +nikoju +nisam +nisi +nismo +niste +nisu +njega +njegov +njegova +njegovo +njemu +njezin +njezina +njezino +njih +njihov +njihova +njihovo +njim +njima +njoj +nju +no +o +od +odmah +on +ona +oni +ono +ova +pa +pak +po +pod +pored +prije +s +sa +sam +samo +se +sebe +sebi +si +smo +ste +su +sve +svi +svog +svoj +svoja +svoje +svom +ta +tada +taj +tako +te +tebe +tebi +ti +to +toj +tome +tu +tvoj +tvoja +tvoje +u +uz +vam +vama +vas +vaš +vaša +vaše +već +vi +vrlo +za +zar +će +ćemo +ćete +ćeš +ću +što +""".split()) diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py new file mode 100644 index 000000000..e66fad691 --- /dev/null +++ b/spacy/lang/ro/__init__.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class RomanianDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'ro' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + + +class Romanian(Language): + lang = 'ro' + Defaults = RomanianDefaults + + +__all__ = ['Romanian'] + diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py new file mode 100644 index 000000000..ffaaea7c1 --- /dev/null +++ b/spacy/lang/ro/stop_words.py @@ -0,0 +1,442 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# Source: 
https://github.com/stopwords-iso/stopwords-ro + +STOP_WORDS = set(""" +a +abia +acea +aceasta +această +aceea +aceeasi +acei +aceia +acel +acela +acelasi +acele +acelea +acest +acesta +aceste +acestea +acestei +acestia +acestui +aceşti +aceştia +acolo +acord +acum +adica +ai +aia +aibă +aici +aiurea +al +ala +alaturi +ale +alea +alt +alta +altceva +altcineva +alte +altfel +alti +altii +altul +am +anume +apoi +ar +are +as +asa +asemenea +asta +astazi +astea +astfel +astăzi +asupra +atare +atat +atata +atatea +atatia +ati +atit +atita +atitea +atitia +atunci +au +avea +avem +aveţi +avut +azi +aş +aşadar +aţi +b +ba +bine +bucur +bună +c +ca +cam +cand +capat +care +careia +carora +caruia +cat +catre +caut +ce +cea +ceea +cei +ceilalti +cel +cele +celor +ceva +chiar +ci +cinci +cind +cine +cineva +cit +cita +cite +citeva +citi +citiva +conform +contra +cu +cui +cum +cumva +curând +curînd +când +cât +câte +câtva +câţi +cînd +cît +cîte +cîtva +cîţi +că +căci +cărei +căror +cărui +către +d +da +daca +dacă +dar +dat +datorită +dată +dau +de +deasupra +deci +decit +degraba +deja +deoarece +departe +desi +despre +deşi +din +dinaintea +dintr +dintr- +dintre +doar +doi +doilea +două +drept +dupa +după +dă +e +ea +ei +el +ele +era +eram +este +eu +exact +eşti +f +face +fara +fata +fel +fi +fie +fiecare +fii +fim +fiu +fiţi +foarte +fost +frumos +fără +g +geaba +graţie +h +halbă +i +ia +iar +ieri +ii +il +imi +in +inainte +inapoi +inca +incit +insa +intr +intre +isi +iti +j +k +l +la +le +li +lor +lui +lângă +lîngă +m +ma +mai +mare +mea +mei +mele +mereu +meu +mi +mie +mine +mod +mult +multa +multe +multi +multă +mulţi +mulţumesc +mâine +mîine +mă +n +ne +nevoie +ni +nici +niciodata +nicăieri +nimeni +nimeri +nimic +niste +nişte +noastre +noastră +noi +noroc +nostri +nostru +nou +noua +nouă +noştri +nu +numai +o +opt +or +ori +oricare +orice +oricine +oricum +oricând +oricât +oricînd +oricît +oriunde +p +pai +parca +patra +patru +patrulea +pe +pentru +peste +pic +pina +plus +poate +pot +prea +prima +primul +prin +printr- +putini +puţin +puţina +puţină +până +pînă +r +rog +s +sa +sa-mi +sa-ti +sai +sale +sau +se +si +sint +sintem +spate +spre +sub +sunt +suntem +sunteţi +sus +sută +sînt +sîntem +sînteţi +să +săi +său +t +ta +tale +te +ti +timp +tine +toata +toate +toată +tocmai +tot +toti +totul +totusi +totuşi +toţi +trei +treia +treilea +tu +tuturor +tăi +tău +u +ul +ului +un +una +unde +undeva +unei +uneia +unele +uneori +unii +unor +unora +unu +unui +unuia +unul +v +va +vi +voastre +voastră +voi +vom +vor +vostru +vouă +voştri +vreme +vreo +vreun +vă +x +z +zece +zero +zi +zice +îi +îl +îmi +împotriva +în +înainte +înaintea +încotro +încât +încît +între +întrucât +întrucît +îţi +ăla +ălea +ăsta +ăstea +ăştia +şapte +şase +şi +ştiu +ţi +ţie +""".split()) diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py new file mode 100644 index 000000000..42ccd6a93 --- /dev/null +++ b/spacy/lang/ro/tokenizer_exceptions.py @@ -0,0 +1,17 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH + + +_exc = {} + + +# Source: https://en.wiktionary.org/wiki/Category:Romanian_abbreviations +for orth in [ + "1-a", "1-ul", "10-a", "10-lea", "2-a", "3-a", "3-lea", "6-lea", + "d-voastră", "dvs.", "Rom.", "str."]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/website/models/_data.json b/website/models/_data.json index 05a96a24f..dbbd125d9 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -100,6 
+100,8 @@ "da": "Danish", "hu": "Hungarian", "pl": "Polish", + "ro": "Romanian", + "hr": "Croatian", "he": "Hebrew", "ga": "Irish", "bn": "Bengali", From 391fce09d9938aaa442e2d51f462ab744c545076 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 1 Nov 2017 23:04:40 +0100 Subject: [PATCH 08/21] Update licenses --- website/models/_data.json | 1 + 1 file changed, 1 insertion(+) diff --git a/website/models/_data.json b/website/models/_data.json index dbbd125d9..62f21dd6f 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -74,6 +74,7 @@ "MODEL_LICENSES": { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-SA 3.0": "https://creativecommons.org/licenses/by-sa/3.0/", + "CC BY-SA 4.0": "https://creativecommons.org/licenses/by-sa/4.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/", "CC BY-NC 3.0": "https://creativecommons.org/licenses/by-nc/3.0/", "GPL": "https://www.gnu.org/licenses/gpl.html", From 15cbc61a6e97162b93dadd680f7a2a5c2d7c7be0 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 2 Nov 2017 16:13:18 +0100 Subject: [PATCH 09/21] Adjust rendering of large numbers 1234 -> 1.2k 12345 -> 12.3k 123456 -> 123k 1234567 -> 1.2m --- website/assets/js/util.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/website/assets/js/util.js b/website/assets/js/util.js index 65d05774c..90e0b5994 100644 --- a/website/assets/js/util.js +++ b/website/assets/js/util.js @@ -59,11 +59,12 @@ export const convertNumber = (num = 0, separator = ',') => * @param {number|string} num - The number to convert. * @param {number} fixed - Number of decimals. */ -export const abbrNumber = (num = 0, fixed = 2) => { +export const abbrNumber = (num = 0, fixed = 1) => { const suffixes = ['', 'k', 'm', 'b', 't']; if (num === null || num === 0) return 0; const b = num.toPrecision(2).split('e'); const k = (b.length === 1) ? 0 : Math.floor(Math.min(b[1].slice(1), 14) / 3); - const c = (k < 1) ? num.toFixed(fixed) : (num / Math.pow(10, k * 3)).toFixed(fixed + 1); + const n = (k < 1) ? num : num / Math.pow(10, k * 3); + const c = (k >= 1 && n >= 100 ) ? Math.round(n) : n.toFixed(fixed); return (c < 0 ? 
c : Math.abs(c)) + suffixes[k]; } From 31e349a62c06069fd611bd96da88918fe2484180 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 2 Nov 2017 16:13:38 +0100 Subject: [PATCH 10/21] Update model families --- website/models/_data.json | 14 ++++++++------ website/models/nl.jade | 6 ++++++ 2 files changed, 14 insertions(+), 6 deletions(-) create mode 100644 website/models/nl.jade diff --git a/website/models/_data.json b/website/models/_data.json index 62f21dd6f..71c2bf7d0 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -12,6 +12,7 @@ "Portuguese": "pt", "French": "fr", "Italian": "it", + "Dutch": "nl", "Multi-Language": "xx" } }, @@ -39,12 +40,13 @@ }, "MODELS": { - "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"], - "de": ["de_dep_news_sm"], - "es": ["es_core_web_sm"], - "pt": [], - "fr": [], - "it": [], + "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"], + "de": ["de_core_news_sm", "de_core_news_md"], + "es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"], + "pt": ["pt_core_news_sm"], + "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"], + "it": ["it_core_news_sm"], + "nl": ["nl_core_news_sm"], "xx": ["xx_ent_wiki_sm"] }, diff --git a/website/models/nl.jade b/website/models/nl.jade new file mode 100644 index 000000000..081b4a712 --- /dev/null +++ b/website/models/nl.jade @@ -0,0 +1,6 @@ +//- 💫 DOCS > MODELS > NL + +include ../_includes/_mixins + +//- This is a placeholder. The page is rendered via the template at +//- /_includes/_page-model.jade. From 9baab241b487d93066c3f68e63acec22b013633f Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 2 Nov 2017 16:32:24 +0100 Subject: [PATCH 11/21] Add skeleton language data for Turkish --- spacy/lang/tr/__init__.py | 28 ++ spacy/lang/tr/stop_words.py | 512 ++++++++++++++++++++++++++ spacy/lang/tr/tokenizer_exceptions.py | 27 ++ website/models/_data.json | 1 + 4 files changed, 568 insertions(+) create mode 100644 spacy/lang/tr/__init__.py create mode 100644 spacy/lang/tr/stop_words.py create mode 100644 spacy/lang/tr/tokenizer_exceptions.py diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py new file mode 100644 index 000000000..d1cd04f42 --- /dev/null +++ b/spacy/lang/tr/__init__.py @@ -0,0 +1,28 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS +from .stop_words import STOP_WORDS + +from ..tokenizer_exceptions import BASE_EXCEPTIONS +from ..norm_exceptions import BASE_NORMS +from ...language import Language +from ...attrs import LANG, NORM +from ...util import update_exc, add_lookups + + +class TurkishDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'tr' + lex_attr_getters[NORM] = add_lookups(Language.Defaults.lex_attr_getters[NORM], BASE_NORMS) + tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) + stop_words = STOP_WORDS + + +class Turkish(Language): + lang = 'tr' + Defaults = TurkishDefaults + + +__all__ = ['Turkish'] + diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py new file mode 100644 index 000000000..aaed02a3e --- /dev/null +++ b/spacy/lang/tr/stop_words.py @@ -0,0 +1,512 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# Source: https://github.com/stopwords-iso/stopwords-tr + +STOP_WORDS = set(""" +acaba +acep +adamakıllı +adeta +ait +altmýþ +altmış +altý +altı +ama +amma +anca +ancak +arada +artýk +aslında +aynen +ayrıca +az 
+açıkça +açıkçası +bana +bari +bazen +bazý +bazı +başkası +baţka +belki +ben +benden +beni +benim +beri +beriki +beþ +beş +beţ +bilcümle +bile +bin +binaen +binaenaleyh +bir +biraz +birazdan +birbiri +birden +birdenbire +biri +birice +birileri +birisi +birkaç +birkaçı +birkez +birlikte +birçok +birçoğu +birþey +birþeyi +birşey +birşeyi +birţey +bitevi +biteviye +bittabi +biz +bizatihi +bizce +bizcileyin +bizden +bize +bizi +bizim +bizimki +bizzat +boşuna +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +buracıkta +burada +buradan +burası +böyle +böylece +böylecene +böylelikle +böylemesine +böylesine +büsbütün +bütün +cuk +cümlesi +da +daha +dahi +dahil +dahilen +daima +dair +dayanarak +de +defa +dek +demin +demincek +deminden +denli +derakap +derhal +derken +deđil +değil +değin +diye +diđer +diğer +diğeri +doksan +dokuz +dolayı +dolayısıyla +doğru +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +elbet +elbette +elli +emme +en +enikonu +epey +epeyce +epeyi +esasen +esnasında +etmesi +etraflı +etraflıca +etti +ettiği +ettiğini +evleviyetle +evvel +evvela +evvelce +evvelden +evvelemirde +evveli +eđer +eğer +fakat +filanca +gah +gayet +gayetle +gayri +gayrı +gelgelelim +gene +gerek +gerçi +geçende +geçenlerde +gibi +gibilerden +gibisinden +gine +göre +gırla +hakeza +halbuki +halen +halihazırda +haliyle +handiyse +hangi +hangisi +hani +hariç +hasebiyle +hasılı +hatta +hele +hem +henüz +hep +hepsi +her +herhangi +herkes +herkesin +hiç +hiçbir +hiçbiri +hoş +hulasaten +iken +iki +ila +ile +ilen +ilgili +ilk +illa +illaki +imdi +indinde +inen +insermi +ise +ister +itibaren +itibariyle +itibarıyla +iyi +iyice +iyicene +için +iş +işte +iţte +kadar +kaffesi +kah +kala +kanýmca +karşın +katrilyon +kaynak +kaçı +kelli +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kere +kez +keza +kezalik +keşke +keţke +ki +kim +kimden +kime +kimi +kimisi +kimse +kimsecik +kimsecikler +külliyen +kýrk +kýsaca +kırk +kısaca +lakin +leh +lütfen +maada +madem +mademki +mamafih +mebni +međer +meğer +meğerki +meğerse +milyar +milyon +mu +mü +mý +mı +nasýl +nasıl +nasılsa +nazaran +naşi +ne +neden +nedeniyle +nedenle +nedense +nerde +nerden +nerdeyse +nere +nerede +nereden +neredeyse +neresi +nereye +netekim +neye +neyi +neyse +nice +nihayet +nihayetinde +nitekim +niye +niçin +o +olan +olarak +oldu +olduklarını +oldukça +olduğu +olduğunu +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +onca +onculayın +onda +ondan +onlar +onlardan +onlari +onlarýn +onları +onların +onu +onun +oracık +oracıkta +orada +oradan +oranca +oranla +oraya +otuz +oysa +oysaki +pek +pekala +peki +pekçe +peyderpey +rağmen +sadece +sahi +sahiden +sana +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +sonra +sonradan +sonraları +sonunda +tabii +tam +tamam +tamamen +tamamıyla +tarafından +tek +trilyon +tüm +var +vardı +vasıtasıyla +ve +velev +velhasıl +velhasılıkelam +veya +veyahut +ya +yahut +yakinen +yakında +yakından +yakınlarda +yalnız +yalnızca +yani +yapacak +yapmak +yaptı +yaptıkları +yaptığı +yaptığını +yapılan +yapılması +yapıyor +yedi +yeniden +yenilerde +yerine +yetmiþ +yetmiş +yetmiţ +yine +yirmi +yok +yoksa +yoluyla +yüz +yüzünden +zarfında +zaten +zati +zira +çabuk +çabukça +çeşitli +çok +çokları +çoklarınca +çokluk +çoklukla +çokça +çoğu +çoğun +çoğunca +çoğunlukla +çünkü +öbür +öbürkü +öbürü +önce +önceden +önceleri +öncelikle +öteki +ötekisi +öyle +öylece +öylelikle +öylemesine +öz +üzere +üç +þey +þeyden +þeyi +þeyler 
+þu +þuna +þunda +þundan +þunu +şayet +şey +şeyden +şeyi +şeyler +şu +şuna +şuncacık +şunda +şundan +şunlar +şunları +şunu +şunun +şura +şuracık +şuracıkta +şurası +şöyle +ţayet +ţimdi +ţu +ţöyle +""".split()) diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py new file mode 100644 index 000000000..c945c0058 --- /dev/null +++ b/spacy/lang/tr/tokenizer_exceptions.py @@ -0,0 +1,27 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import ORTH, NORM + + +# These exceptions are mostly for example purposes – hoping that Turkish +# speakers can contribute in the future! Source of copy-pasted examples: +# https://en.wiktionary.org/wiki/Category:Turkish_language + +_exc = { + "sağol": [ + {ORTH: "sağ"}, + {ORTH: "ol", NORM: "olun"}] +} + + +for exc_data in [ + {ORTH: "A.B.D.", NORM: "Amerika Birleşik Devletleri"}]: + _exc[exc_data[ORTH]] = [exc_data] + + +for orth in ["Dr."]: + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = _exc diff --git a/website/models/_data.json b/website/models/_data.json index 71c2bf7d0..d64c94074 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -105,6 +105,7 @@ "pl": "Polish", "ro": "Romanian", "hr": "Croatian", + "tr": "Turkish", "he": "Hebrew", "ga": "Irish", "bn": "Bengali", From 43512c68b25e6ff3fc52c46e09fda999511d622d Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 2 Nov 2017 20:04:13 +0100 Subject: [PATCH 12/21] Fix vector details in model overview --- website/_includes/_page_models.jade | 2 ++ website/assets/js/models.js | 32 +++++++++++++------ website/models/_data.json | 1 + website/models/comparison.jade | 2 ++ website/usage/_spacy-101/_word-vectors.jade | 6 ++-- .../usage/_vectors-similarity/_basics.jade | 9 ++---- 6 files changed, 34 insertions(+), 18 deletions(-) diff --git a/website/_includes/_page_models.jade b/website/_includes/_page_models.jade index 1cab930fb..c7742fa38 100644 --- a/website/_includes/_page_models.jade +++ b/website/_includes/_page_models.jade @@ -40,6 +40,8 @@ for id in CURRENT_MODELS each label in ["Pipeline", "Vectors", "Sources", "Author", "License"] - var field = label.toLowerCase() + if field == "vectors" + - field = "vecs" +row +cell.u-nowrap +label=label diff --git a/website/assets/js/models.js b/website/assets/js/models.js index 2d371ee1f..f5757c8cb 100644 --- a/website/assets/js/models.js +++ b/website/assets/js/models.js @@ -20,21 +20,33 @@ const CHART_FONTS = { * @property {function} vectors - Format vector data (entries and dimensions). * @property {function} version - Format model version number. */ -export const formats = { +const formats = { author: (author, url) => url ? `${author}` : author, license: (license, url) => url ? `${license}` : license, sources: sources => (sources instanceof Array) ? sources.join(', ') : sources, pipeline: pipes => (pipes && pipes.length) ? pipes.map(p => `${p}`).join(', ') : '-', - vectors: vec => vec ? `${abbrNumber(vec.keys)} keys, ${abbrNumber(vec.vectors)} unique vectors (${vec.width} dimensions)` : 'n/a', + vectors: vec => formatVectors(vec), version: version => `v${version}` }; +/** + * Format word vectors data depending on contents. + * @property {Object} data - The vectors object from the model's meta.json. 
+ */ +const formatVectors = data => { + if (!data) return 'n/a'; + if (Object.values(data).every(n => n == 0)) return 'context vectors only'; + const { keys, vectors: vecs, width } = data; + return `${abbrNumber(keys)} keys, ${abbrNumber(vecs)} unique vectors (${width} dimensions)`; +} + + /** * Find the latest version of a model in a compatibility table. * @param {string} model - The model name. * @param {Object} compat - Compatibility table, keyed by spaCy version. */ -export const getLatestVersion = (model, compat = {}) => { +const getLatestVersion = (model, compat = {}) => { for (let [spacy_v, models] of Object.entries(compat)) { if (models[model]) return models[model][0]; } @@ -90,7 +102,7 @@ export class ModelLoader { const tpl = new Templater(modelId); tpl.get('table').removeAttribute('data-loading'); tpl.get('error').style.display = 'block'; - for (let key of ['sources', 'pipeline', 'vectors', 'author', 'license']) { + for (let key of ['sources', 'pipeline', 'vecs', 'author', 'license']) { tpl.get(key).parentElement.parentElement.style.display = 'none'; } } @@ -120,8 +132,8 @@ export class ModelLoader { if (author) tpl.fill('author', formats.author(author, url), true); if (license) tpl.fill('license', formats.license(license, this.licenses[license]), true); if (sources) tpl.fill('sources', formats.sources(sources)); - if (vectors) tpl.fill('vectors', formats.vectors(vectors)); - else tpl.get('vectors').parentElement.parentElement.style.display = 'none'; + if (vectors) tpl.fill('vecs', formats.vectors(vectors)); + else tpl.get('vecs').parentElement.parentElement.style.display = 'none'; if (pipeline && pipeline.length) tpl.fill('pipeline', formats.pipeline(pipeline), true); else tpl.get('pipeline').parentElement.parentElement.style.display = 'none'; } @@ -223,8 +235,9 @@ export class ModelComparer { const version = getLatestVersion(name, this.compat); const modelName = `${name}-${version}`; return new Promise((resolve, reject) => { + if (!version) reject(); // resolve immediately if model already loaded, e.g. in this.models - if (this.models[name]) resolve(this.models[name]); + else if (this.models[name]) resolve(this.models[name]); else fetch(`${this.url}/meta/${modelName}.json`) .then(res => handleResponse(res)) .then(json => json.ok ? 
resolve(this.saveModel(name, json)) : reject()) @@ -306,12 +319,13 @@ export class ModelComparer { this.tpl.fill(`size${i}`, size); this.tpl.fill(`desc${i}`, description || 'n/a'); this.tpl.fill(`pipeline${i}`, formats.pipeline(pipeline), true); - this.tpl.fill(`vectors${i}`, formats.vectors(vectors)); + this.tpl.fill(`vecs${i}`, formats.vectors(vectors)); this.tpl.fill(`sources${i}`, formats.sources(sources)); this.tpl.fill(`author${i}`, formats.author(author, url), true); this.tpl.fill(`license${i}`, formats.license(license, this.licenses[license]), true); // check if model accuracy or speed includes one of the pre-set keys - for (let key of [...metaKeys, ...Object.keys(this.benchKeys.speed)]) { + const allKeys = [].concat(...Object.entries(this.benchKeys).map(([_, v]) => Object.keys(v))); + for (let key of allKeys) { if (accuracy[key]) this.tpl.fill(`${key}${i}`, accuracy[key].toFixed(2)) else if (speed[key]) this.tpl.fill(`${key}${i}`, convertNumber(Math.round(speed[key]))) else this.tpl.fill(`${key}${i}`, 'n/a') diff --git a/website/models/_data.json b/website/models/_data.json index d64c94074..8507a3fa1 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -68,6 +68,7 @@ "gpu": "words per second on GPU", "pipeline": "Processing pipeline components in order", "sources": "Sources of training data", + "vecs": "Word vectors included in the model. Models that only support context vectors compute similarity via the tensors shared with the pipeline.", "benchmark_parser": "Parser accuracy", "benchmark_ner": "NER accuracy", "benchmark_speed": "Speed" diff --git a/website/models/comparison.jade b/website/models/comparison.jade index 881a9aff4..b0ab61efe 100644 --- a/website/models/comparison.jade +++ b/website/models/comparison.jade @@ -53,6 +53,8 @@ div(data-tpl=TPL data-tpl-key="result" style="display: none") for label in ["Version", "Size", "Pipeline", "Vectors", "Sources", "Author", "License"] - var field = label.toLowerCase() + if field == "vectors" + - field = "vecs" +row +cell.u-nowrap +label=label diff --git a/website/usage/_spacy-101/_word-vectors.jade b/website/usage/_spacy-101/_word-vectors.jade index bb9add8a6..c38360014 100644 --- a/website/usage/_spacy-101/_word-vectors.jade +++ b/website/usage/_spacy-101/_word-vectors.jade @@ -4,9 +4,9 @@ p | Similarity is determined by comparing #[strong word vectors] or "word | embeddings", multi-dimensional meaning representations of a word. Word | vectors can be generated using an algorithm like - | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. Most of spaCy's - | #[+a("/models") default models] come with - | #[strong 300-dimensional vectors] that look like this: + | #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec]. spaCy's medium + | #[code md] and large #[code lg] #[+a("/models") models] come with + | #[strong multi-dimensional vectors] that look like this: +code("banana.vector", false, false, 250). array([2.02280000e-01, -7.66180009e-02, 3.70319992e-01, diff --git a/website/usage/_vectors-similarity/_basics.jade b/website/usage/_vectors-similarity/_basics.jade index 07ad6bcd4..734495c6e 100644 --- a/website/usage/_vectors-similarity/_basics.jade +++ b/website/usage/_vectors-similarity/_basics.jade @@ -4,12 +4,9 @@ | Dense, real valued vectors representing distributional similarity | information are now a cornerstone of practical NLP. The most common way | to train these vectors is the #[+a("https://en.wikipedia.org/wiki/Word2vec") word2vec] - | family of algorithms. 
The default - | #[+a("/models/en") English model] installs - | 300-dimensional vectors trained on the - | #[+a("http://commoncrawl.org") Common Crawl] corpus. - | If you need to train a word2vec model, we recommend the implementation in - | the Python library #[+a("https://radimrehurek.com/gensim/") Gensim]. + | family of algorithms. If you need to train a word2vec model, we recommend + | the implementation in the Python library + | #[+a("https://radimrehurek.com/gensim/") Gensim]. include ../_spacy-101/_similarity include ../_spacy-101/_word-vectors From a22f96c3f19388b5369bf592aac1b056855009f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 00:48:54 +0100 Subject: [PATCH 13/21] Add test for backpropagating padding --- spacy/tests/test_misc.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 762ea4c08..5c69dae3e 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -6,6 +6,7 @@ from .. import util from ..displacy import parse_deps, parse_ents from ..tokens import Span from .util import get_doc +from .._ml import PrecomputableAffine from pathlib import Path import pytest @@ -59,3 +60,19 @@ def test_displacy_parse_deps(en_vocab): assert deps['arcs'] == [{'start': 0, 'end': 1, 'label': 'nsubj', 'dir': 'left'}, {'start': 2, 'end': 3, 'label': 'det', 'dir': 'left'}, {'start': 1, 'end': 3, 'label': 'attr', 'dir': 'right'}] + + +def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): + model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP) + assert model.W.shape == (nF, nO, nP, nI) + tensor = model.ops.allocate((10, nI)) + Y, get_dX = model.begin_update(tensor) + assert Y.shape == (tensor.shape[0]+1, nF, nO, nP) + assert model.d_pad.shape == (1, nF, nO, nP) + dY = model.ops.allocate((15, nF, nO, nP)) + ids = model.ops.allocate((15, nF)) + ids[1,2] = -1 + dY[1,2] = 1 + assert model.d_pad[0, 2, 0, 0] == 0. + model._backprop_padding(dY, ids) + assert model.d_pad[0, 2, 0, 0] == 1. From 260e6ee3fbc829b04b269f9960e6cb676d0c33ff Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 00:49:11 +0100 Subject: [PATCH 14/21] Improve efficiency of backprop of padding variable --- spacy/_ml.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 8c98567fc..4829631f4 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -150,10 +150,8 @@ class PrecomputableAffine(Model): def _backprop_padding(self, dY, ids): # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0 - for i in range(ids.shape[0]): - for j in range(ids.shape[1]): - if ids[i,j] < 0: - self.d_pad[0,j] += dY[i, j] + d_pad = dY * (ids.reshape((ids.shape[0], self.nF, 1, 1)) < 0.) + self.d_pad += d_pad.sum(axis=0) return dY, ids @staticmethod From 6771780d3f78ab3463fa7255516334059ed2721d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 01:54:34 +0100 Subject: [PATCH 15/21] Fix backprop of padding variable --- spacy/_ml.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index 4829631f4..0f9202603 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -150,7 +150,9 @@ class PrecomputableAffine(Model): def _backprop_padding(self, dY, ids): # (1, nF, nO, nP) += (nN, nF, nO, nP) where IDs (nN, nF) < 0 - d_pad = dY * (ids.reshape((ids.shape[0], self.nF, 1, 1)) < 0.) + mask = ids < 0. 
+ mask = mask.sum(axis=1) + d_pad = dY * mask.reshape((ids.shape[0], 1, 1)) self.d_pad += d_pad.sum(axis=0) return dY, ids From c2bbf076a462326409cb6491752323985332422f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Nov 2017 01:54:54 +0100 Subject: [PATCH 16/21] Add document length cap for training --- spacy/cli/train.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index f489ba7bf..6697ed6c0 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -85,6 +85,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, batch_sizes = util.compounding(util.env_opt('batch_from', 1), util.env_opt('batch_to', 16), util.env_opt('batch_compound', 1.001)) + max_doc_len = util.env_opt('max_doc_len', 5000) corpus = GoldCorpus(train_path, dev_path, limit=n_sents) n_train_words = corpus.count_train() @@ -108,6 +109,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, with tqdm.tqdm(total=n_train_words, leave=False) as pbar: losses = {} for batch in minibatch(train_docs, size=batch_sizes): + batch = [(d, g) for (d, g) in batch if len(d) < max_doc_len] + if not batch: + continue docs, golds = zip(*batch) nlp.update(docs, golds, sgd=optimizer, drop=next(dropout_rates), losses=losses) From d0f88af5b620f25c17c369e7c0bd5ee1b79359fc Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 11:29:04 +0100 Subject: [PATCH 17/21] Hide error earlier --- website/assets/js/models.js | 1 + 1 file changed, 1 insertion(+) diff --git a/website/assets/js/models.js b/website/assets/js/models.js index f5757c8cb..134a0e66c 100644 --- a/website/assets/js/models.js +++ b/website/assets/js/models.js @@ -198,6 +198,7 @@ export class ModelComparer { this.fonts = CHART_FONTS; this.defaultModels = defaultModels; this.tpl.get('result').style.display = 'block'; + this.tpl.get('error').style.display = 'none'; this.fetchCompat() .then(compat => this.init(compat)) .catch(this.showError.bind(this)) From a62b0727d8236ced40a9b98b18914de97fcf9e22 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 11:29:21 +0100 Subject: [PATCH 18/21] Tidy up and always use bundle in built site for now Just to be safe --- website/_includes/_scripts.jade | 76 ++++++++++++++++----------------- 1 file changed, 36 insertions(+), 40 deletions(-) diff --git a/website/_includes/_scripts.jade b/website/_includes/_scripts.jade index 05a468076..0be2e2e98 100644 --- a/website/_includes/_scripts.jade +++ b/website/_includes/_scripts.jade @@ -13,7 +13,6 @@ script(src="/assets/js/vendor/prism.min.js") if SECTION == "models" script(src="/assets/js/vendor/chart.min.js") - script(src="/assets/js/models.js?v#{V_JS}" type="module") script if quickstart @@ -24,15 +23,15 @@ script | (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date; | ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview'); - -if IS_PAGE - script + if IS_PAGE | ((window.gitter = {}).chat = {}).options = { | useStyles: false, | activationElement: '.js-gitter-button', | targetElement: '.js-gitter', | room: '!{SOCIAL.gitter}' | }; + +if IS_PAGE script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer) @@ -48,39 +47,36 @@ if IS_PAGE - ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");" - ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," 
+ JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");" -//- Browsers with JS module support. - Will be ignored otherwise. - -script(type="module") - | import ProgressBar from '/assets/js/progress.js'; - !=ProgressBar - if changelog - | import Changelog from '/assets/js/changelog.js'; - !=Changelog - if IS_PAGE - | import NavHighlighter from '/assets/js/nav-highlighter.js'; - !=NavHighlighter - | import GitHubEmbed from '/assets/js/github-embed.js'; - !=GitHubEmbed - if HAS_MODELS - | import { ModelLoader } from '/assets/js/models.js'; - !=ModelLoader - if compare_models - | import { ModelComparer } from '/assets/js/models.js'; - !=ModelComparer - -//- Browsers with no JS module support. - Won't be fetched or interpreted otherwise. - -script(nomodule src="/assets/js/rollup.js") -script(nomodule) - !=ProgressBar - if changelog - !=Changelog - if IS_PAGE - !=NavHighlighter - !=GitHubEmbed - if HAS_MODELS - !=ModeLoader - if compare_models - !=ModelComparer +if environment == "deploy" + //- DEPLOY: use compiled rollup.js and instantiate classes directly + script(src="/assets/js/rollup.js") + script + !=ProgressBar + if changelog + !=Changelog + if IS_PAGE + !=NavHighlighter + !=GitHubEmbed + if HAS_MODELS + !=ModelLoader + if compare_models + !=ModelComparer +else + //- DEVELOPMENT: Use ES6 modules + script(type="module") + | import ProgressBar from '/assets/js/progress.js'; + !=ProgressBar + if changelog + | import Changelog from '/assets/js/changelog.js'; + !=Changelog + if IS_PAGE + | import NavHighlighter from '/assets/js/nav-highlighter.js'; + !=NavHighlighter + | import GitHubEmbed from '/assets/js/github-embed.js'; + !=GitHubEmbed + if HAS_MODELS + | import { ModelLoader } from '/assets/js/models.js'; + !=ModelLoader + if compare_models + | import { ModelComparer } from '/assets/js/models.js'; + !=ModelComparer From 1e163746871c94db7eeef5e5213538883e98820e Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 3 Nov 2017 11:29:34 +0100 Subject: [PATCH 19/21] Update models list to reflect spaCy v2.0.0a18 --- website/models/_data.json | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/website/models/_data.json b/website/models/_data.json index 8507a3fa1..c63101ad0 100644 --- a/website/models/_data.json +++ b/website/models/_data.json @@ -40,13 +40,10 @@ }, "MODELS": { - "en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"], - "de": ["de_core_news_sm", "de_core_news_md"], - "es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"], - "pt": ["pt_core_news_sm"], - "fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"], + "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"], + "de": ["de_core_news_sm"], + "es": ["es_core_news_sm", "es_core_news_md"], "it": ["it_core_news_sm"], "xx": ["xx_ent_wiki_sm"] }, From c740277f9fb7687baa2a8d6a794d9f59b97ca6fb Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Fri, 3 Nov 2017 16:30:44 +0530 Subject: [PATCH 20/21] Minor typo [ nad => and ] --- website/usage/_adding-languages/_language-data.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/usage/_adding-languages/_language-data.jade b/website/usage/_adding-languages/_language-data.jade index dc86b7a03..f0b346886 100644 --- a/website/usage/_adding-languages/_language-data.jade +++ b/website/usage/_adding-languages/_language-data.jade @@ -218,7 +218,7 @@ p | If an exception consists of more than one token, the #[code ORTH] values | combined always 
need to #[strong match the original string]. The way the | original string is split up can be pretty arbitrary sometimes – for - | example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to"). + | example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to"). | Because of how the tokenizer works, it's currently not possible to split | single-letter strings into multiple tokens. From 2aaf5315f31e146985bffccf753f06610b305c35 Mon Sep 17 00:00:00 2001 From: Abhinav Sharma Date: Fri, 3 Nov 2017 16:56:58 +0530 Subject: [PATCH 21/21] Filled the details of the contribution license --- .github/CONTRIBUTOR_AGREEMENT.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index f34603065..919fb81fc 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [ ] I am signing on behalf of myself as an individual and no other person + * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions. ## Contributor Details -| Field | Entry | -|------------------------------- | -------------------- | -| Name | | -| Company name (if applicable) | | -| Title or role (if applicable) | | -| Date | | -| GitHub username | | -| Website (optional) | | +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Abhinav Sharma | +| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. | +| Title or role (if applicable) | Machine Learning Engineer | +| Date | 3 November 2017 | +| GitHub username | abhi18av | +| Website (optional) | https://abhi18av.github.io/ |