From c6dc2fafc02cd1a5593ed2825dc0f7f55a6ac87e Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 17:49:56 +0200 Subject: [PATCH 1/5] Add Spanish and move example sentences to meta --- website/_harp.json | 16 ++++++++++++++-- website/docs/usage/index.jade | 1 + website/docs/usage/models.jade | 3 +-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index 8c16ccc16..25ad3c5d2 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -77,7 +77,8 @@ { "id": "model", "title": "Models", "multiple": true, "options": [ { "id": "en", "title": "English", "meta": "50MB" }, { "id": "de", "title": "German", "meta": "645MB" }, - { "id": "fr", "title": "French", "meta": "1.33GB" }] + { "id": "fr", "title": "French", "meta": "1.33GB" }, + { "id": "es", "title": "Spanish", "meta": "377MB"}] } ], @@ -85,7 +86,8 @@ { "id": "lang", "title": "Language", "options": [ { "id": "en", "title": "English", "checked": true }, { "id": "de", "title": "German" }, - { "id": "fr", "title": "French" }] + { "id": "fr", "title": "French" }, + { "id": "es", "title": "Spanish" }] }, { "id": "load", "title": "Loading style", "options": [ { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." }, @@ -108,9 +110,19 @@ ], "fr": [ { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" } + ], + "es": [ + { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"} ] }, + "EXAMPLE_SENTENCES": { + "en": "This is a sentence.", + "de": "Dies ist ein Satz.", + "fr": "C'est une phrase.", + "es": "Esto es una frase." + }, + "ALPHA": true, "V_CSS": "1.6", "V_JS": "1.2", diff --git a/website/docs/usage/index.jade b/website/docs/usage/index.jade index c79c689a4..d3deaa17e 100644 --- a/website/docs/usage/index.jade +++ b/website/docs/usage/index.jade @@ -40,6 +40,7 @@ p +qs({model: 'en'}) python -m spacy download en +qs({model: 'de'}) python -m spacy download de +qs({model: 'fr'}) python -m spacy download fr + +qs({model: 'es'}) python -m spacy download es +h(2, "installation") Installation instructions diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index a837b4d29..bc0f14e01 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -18,7 +18,6 @@ p | skew, which might decrease your accuracy. +quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.") - - var examples = {en: "This is a sentence.", de: "Dies ist ein Satz.", fr: "C'est une phrase."} for models, lang in MODELS - var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def }) +qs({lang: lang}) python -m spacy download #{lang} @@ -26,7 +25,7 @@ p +qs({lang: lang, load: "module"}, "python") import #{package.id} +qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load() +qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}') - +qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{examples[lang]}") + +qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{EXAMPLE_SENTENCES[lang]}") +qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc]) +h(2, "available") Available models From 6c908700c45f0a109e8fd1a66a2ecce0d172c93e Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 18:20:33 +0200 Subject: [PATCH 2/5] Add alpha badge --- website/assets/img/graphics.svg | 11 +++++++++++ website/index.jade | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg index c24473b4c..a449c3d04 100644 --- a/website/assets/img/graphics.svg +++ b/website/assets/img/graphics.svg @@ -1,5 +1,16 @@ + + spaCy v2.0.0 alpha + + + + + + + + + spaCy user survey 2017 diff --git a/website/index.jade b/website/index.jade index b4e987cfb..741db53cf 100644 --- a/website/index.jade +++ b/website/index.jade @@ -11,7 +11,7 @@ include _includes/_mixins h2.c-landing__title.o-block.u-heading-1 | in Python - +landing-badge("https://survey.spacy.io", "usersurvey", "Take the user survey!") + +landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!") +grid.o-content +grid-col("third").o-card From 8bee34126dfd2735485dc82134b23547438394bd Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 18:22:35 +0200 Subject: [PATCH 3/5] Update model size --- website/_harp.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_harp.json b/website/_harp.json index 25ad3c5d2..07afcbaa2 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -78,7 +78,7 @@ { "id": "en", "title": "English", "meta": "50MB" }, { "id": "de", "title": "German", "meta": "645MB" }, { "id": "fr", "title": "French", "meta": "1.33GB" }, - { "id": "es", "title": "Spanish", "meta": "377MB"}] + { "id": "es", "title": "Spanish", "meta": "378MB"}] } ], @@ -112,7 +112,7 @@ { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" } ], "es": [ - { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "377 MB", "license": "CC BY-SA"} + { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "378 MB", "license": "CC BY-SA"} ] }, From 9064fbbf1ecef918c10f9293447a8fd3fd2015c6 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 1 Jun 2017 18:57:02 +0200 Subject: [PATCH 4/5] Fix empty arguments in mixins --- website/_includes/_mixins.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index ce8bfad4e..9de43b092 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -107,13 +107,13 @@ mixin button(url, trusted, ...style) height - [integer] optional height to clip code block to mixin code(label, language, icon, height) - pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? "height: #{height}px" : "")&attributes(attributes) + pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes) if label h4.u-text-label.u-text-label--dark=label if icon - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'} - .c-code-block__icon(class=classes[icon] || "" class=classes[icon] ? "c-code-block__icon--border" : "") + .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null) +icon(icon, 18) code.c-code-block__content From 307d615c5f81fa4bbc8de432c468f7c37d5a3dc9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 1 Jun 2017 12:18:36 -0500 Subject: [PATCH 5/5] Fix serialization for tagger when tag_map has changed --- spacy/pipeline.pyx | 36 +++++++++++++++++++++++++++--------- spacy/vocab.pyx | 5 ----- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index aeec2dba4..d4d94a476 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -10,6 +10,7 @@ cimport numpy as np import cytoolz import util from collections import OrderedDict +import ujson from thinc.api import add, layerize, chain, clone, concatenate, with_flatten from thinc.neural import Model, Maxout, Softmax, Affine @@ -33,6 +34,7 @@ from .gold cimport GoldParse from .morphology cimport Morphology from .vocab cimport Vocab from .syntax import nonproj +from .compat import json_dumps from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats @@ -308,7 +310,7 @@ class NeuralTagger(object): if self.model is True: token_vector_width = util.env_opt('token_vector_width', 128) self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) - self.model.from_bytes(b) + self.model.from_bytes(b) deserialize = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('model', lambda b: load_model(b)), @@ -317,17 +319,33 @@ class NeuralTagger(object): return self def to_disk(self, path, **exclude): - serialize = { - 'model': lambda p: p.open('wb').write(self.model.to_bytes()), - 'vocab': lambda p: self.vocab.to_disk(p) - } + serialize = OrderedDict(( + ('vocab', lambda p: self.vocab.to_disk(p)), + ('tag_map', lambda p: p.open('w').write(json_dumps( + self.vocab.morphology.tag_map))), + ('model', lambda p: p.open('wb').write(self.model.to_bytes())), + )) util.to_disk(path, serialize, exclude) def from_disk(self, path, **exclude): - deserialize = { - 'model': lambda p: self.model.from_bytes(p.open('rb').read()), - 'vocab': lambda p: self.vocab.from_disk(p) - } + def load_model(p): + if self.model is True: + token_vector_width = util.env_opt('token_vector_width', 128) + self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width) + self.model.from_bytes(p.open('rb').read()) + + def load_tag_map(p): + with p.open() as file_: + tag_map = ujson.loads(file_.read()) + self.vocab.morphology = Morphology( + self.vocab.strings, tag_map=tag_map, + lemmatizer=self.vocab.morphology.lemmatizer) + + deserialize = OrderedDict(( + ('vocab', lambda p: self.vocab.from_disk(p)), + ('tag_map', load_tag_map), + ('model', load_model), + )) util.from_disk(path, deserialize, exclude) return self diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index b3410a02b..d42e8951b 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -315,7 +315,6 @@ cdef class Vocab: getters = OrderedDict(( ('strings', lambda: self.strings.to_bytes()), ('lexemes', lambda: self.lexemes_to_bytes()), - ('tag_map', lambda: self.morphology.tag_map), )) return util.to_bytes(getters, exclude) @@ -326,13 +325,9 @@ cdef class Vocab: **exclude: Named attributes to prevent from being loaded. RETURNS (Vocab): The `Vocab` object. """ - def set_tag_map(tag_map): - self.morphology = Morphology(self.strings, tag_map, - self.morphology.lemmatizer) setters = OrderedDict(( ('strings', lambda b: self.strings.from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)), - ('tag_map', lambda b: set_tag_map(b)) )) return util.from_bytes(bytes_data, setters, exclude)