Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2017-06-01 13:03:57 -05:00 · 2017-06-01 13:03:57 -05:00 · c650bc481c
parent 1d18cedae8 c8f4bed36f
commit c650bc481c
8 changed files with 57 additions and 21 deletions
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@ -10,6 +10,7 @@ cimport numpy as np
 import cytoolz
 import util
 from collections import OrderedDict
+import ujson

 from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
 from thinc.neural import Model, Maxout, Softmax, Affine
@ -33,6 +34,7 @@ from .gold cimport GoldParse
 from .morphology cimport Morphology
 from .vocab cimport Vocab
 from .syntax import nonproj
+from .compat import json_dumps

 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
@ -308,7 +310,7 @@ class NeuralTagger(object):
            if self.model is True:
                token_vector_width = util.env_opt('token_vector_width', 128)
                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
-                self.model.from_bytes(b)
+            self.model.from_bytes(b)
        deserialize = OrderedDict((
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('model', lambda b: load_model(b)),
@ -317,17 +319,33 @@ class NeuralTagger(object):
        return self

    def to_disk(self, path, **exclude):
-        serialize = {
-            'model': lambda p: p.open('wb').write(self.model.to_bytes()),
-            'vocab': lambda p: self.vocab.to_disk(p)
-        }
+        serialize = OrderedDict((
+            ('vocab', lambda p: self.vocab.to_disk(p)),
+            ('tag_map', lambda p: p.open('w').write(json_dumps(
+                self.vocab.morphology.tag_map))),
+            ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
+        ))
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
-        deserialize = {
-            'model': lambda p: self.model.from_bytes(p.open('rb').read()),
-            'vocab': lambda p: self.vocab.from_disk(p)
-        }
+        def load_model(p):
+            if self.model is True:
+                token_vector_width = util.env_opt('token_vector_width', 128)
+                self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
+            self.model.from_bytes(p.open('rb').read())
+
+        def load_tag_map(p):
+            with p.open() as file_:
+                tag_map = ujson.loads(file_.read())
+            self.vocab.morphology = Morphology(
+                self.vocab.strings, tag_map=tag_map,
+                lemmatizer=self.vocab.morphology.lemmatizer)
+
+        deserialize = OrderedDict((
+            ('vocab', lambda p: self.vocab.from_disk(p)),
+            ('tag_map', load_tag_map),
+            ('model', load_model),
+        ))
        util.from_disk(path, deserialize, exclude)
        return self

--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -315,7 +315,6 @@ cdef class Vocab:
        getters = OrderedDict((
            ('strings', lambda: self.strings.to_bytes()),
            ('lexemes', lambda: self.lexemes_to_bytes()),
-            ('tag_map', lambda: self.morphology.tag_map),
        ))
        return util.to_bytes(getters, exclude)

@ -326,13 +325,9 @@ cdef class Vocab:
        **exclude: Named attributes to prevent from being loaded.
        RETURNS (Vocab): The `Vocab` object.
        """
-        def set_tag_map(tag_map):
-            self.morphology = Morphology(self.strings, tag_map,
-                                        self.morphology.lemmatizer)
        setters = OrderedDict((
            ('strings', lambda b: self.strings.from_bytes(b)),
            ('lexemes', lambda b: self.lexemes_from_bytes(b)),
-            ('tag_map', lambda b: set_tag_map(b))
        ))
        return util.from_bytes(bytes_data, setters, exclude)

--- a/website/_harp.json
+++ b/website/_harp.json
@ -77,7 +77,8 @@
            { "id": "model", "title": "Models", "multiple": true, "options": [
                { "id": "en", "title": "English", "meta": "50MB" },
                { "id": "de", "title": "German", "meta": "645MB" },
-                { "id": "fr", "title": "French", "meta": "1.33GB" }]
+                { "id": "fr", "title": "French", "meta": "1.33GB" },
+                { "id": "es", "title": "Spanish", "meta": "378MB"}]
            }
        ],

@ -85,7 +86,8 @@
            { "id": "lang", "title": "Language", "options": [
                { "id": "en", "title": "English", "checked": true },
                { "id": "de", "title": "German" },
-                { "id": "fr", "title": "French" }]
+                { "id": "fr", "title": "French" },
+                { "id": "es", "title": "Spanish" }]
            },
            { "id": "load", "title": "Loading style", "options": [
                { "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." },
@ -108,9 +110,19 @@
            ],
            "fr": [
                { "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" }
+            ],
+            "es": [
+                { "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "378 MB", "license": "CC BY-SA"}
            ]
        },

+        "EXAMPLE_SENTENCES": {
+            "en": "This is a sentence.",
+            "de": "Dies ist ein Satz.",
+            "fr": "C'est une phrase.",
+            "es": "Esto es una frase."
+        },
+
        "ALPHA": true,
        "V_CSS": "1.6",
        "V_JS": "1.2",
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@ -107,13 +107,13 @@ mixin button(url, trusted, ...style)
    height   - [integer] optional height to clip code block to

 mixin code(label, language, icon, height)
-    pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? "height: #{height}px" : "")&attributes(attributes)
+    pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
        if label
            h4.u-text-label.u-text-label--dark=label

        if icon
            - var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
-            .c-code-block__icon(class=classes[icon] || "" class=classes[icon] ? "c-code-block__icon--border" : "")
+            .c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
                +icon(icon, 18)

        code.c-code-block__content
--- a/website/assets/img/graphics.svg
+++ b/website/assets/img/graphics.svg
@ -1,5 +1,16 @@
 <svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
    <defs>
+        <symbol id="v2alpha" viewBox="0 0 200 111">
+            <title>spaCy v2.0.0 alpha</title>
+            <path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>
+            <path fill="#a3cad3" d="M45.4 105.4L19.6 94.6l25.4-1"/>
+            <path fill="#ddd" d="M196.6 2L155 3.4l1 29.2 41.6-1.4L187.2 17"/>
+            <path fill="#a3cad3" d="M155 3.4l25.8 10.8-25.4 1"/>
+            <path fill="#fff" d="M17.6 19.4l163-5.6 1 29.2-163 5.6zM19.2 65.6l163-5.6 1 29.2-163 5.6z"/>
+            <path fill="#008EBC" d="M45.8 29h-3.6v-2.4l10-.4.2 2.5h-3.6l.4 10.8h-3L45.8 29zM62 39L59 34.5h-1.6l.2 5h-3l-.5-13.2L59 26c3 0 5.2.8 5.3 4 0 1.8-.8 3-2.2 3.8l3.3 5.2H62zm-4.5-6.8H59c1.6-.2 2.4-.8 2.3-2 0-1.4-1-1.8-2.5-1.8h-1.5l.2 3.8zM69 34.2l-4.3-8.4H68l1.2 3 1.2 2.8c.4-1 .8-2 1-3l1.2-3 3-.2L72 34l.2 4.7h-3l-.2-4.5zM79.5 25.3h3.2l1.8 6 1.2 4.2c.5-1.5.7-2.8 1-4.3L88 25h3L87.7 38H84l-4.5-13zM92.4 25l8.3-.4V27l-5.2.3V30l4.6-.3.2 2.5-4.5.2v3l5.6-.2v2.5L93 38l-.6-13zM111 37.4l-2.6-4.7h-1.6l.2 5h-3l-.5-13.2 4.8-.2c2.8 0 5 .8 5.2 4 0 1.8-.8 3-2.2 3.8l3.2 5.3H111zm-4.3-7h1.5c1.6 0 2.4-.7 2.3-2 0-1.3-1-1.7-2.5-1.7h-1.5l.2 3.8zM116.8 33.5c1 .8 2.2 1.3 3.3 1.3 1.3 0 2-.5 2-1.3s-1-1-2-1.5l-1.8-.7c-1.4-.5-2.7-1.6-2.8-3.5 0-2.2 1.8-4 4.6-4 1.5-.2 3 .4 4.3 1.5l-1.4 2c-1-.7-1.8-1-3-1-1 0-1.6.4-1.5 1.2 0 .8 1 1 2 1.5l1.8.6c1.6.6 2.7 1.6 2.7 3.5 0 2.3-1.7 4.2-4.8 4.4-1.7 0-3.6-.5-5-1.7l1.6-2.2zM126.8 23.7h3l.5 13-3 .2-.5-13.3zM132.5 30c0-4.3 2.2-7 5.8-7 3.6 0 6 2.3 6.2 6.6 0 4.3-2.2 7-5.8 7-3.5.3-6-2.3-6.2-6.6zm9-.3c-.2-2.6-1.4-4.2-3.2-4-1.8 0-3 1.6-2.8 4.2 0 2.5 1.3 4.2 3 4 2 0 3-1.6 3-4.3zM146.7 23h3l3.8 6.3 1.4 3c-.2-1.5-.5-3.3-.5-5l-.2-4.6h2.8l.6 13-3 .2-3.8-6.6-1.4-2.8c0 1.5.4 3.2.4 4.8l.2 4.7-3 .2-.3-13.2z"/>
+            <path fill="#1A1E23" d="M50.2 84.7c3.2-3.2 5.4-5.5 5.3-7.3 0-1.3-.8-2-2-2-.8 0-1.5.8-2 1.5l-1.8-1.6c1.2-1.4 2.4-2 4.2-2.2 2.4 0 4.2 1.5 4.3 4 0 2-2 4.4-4 6.7.7-.2 1.6-.3 2.2-.3H59l.2 2.4-9 .4v-1.7zM63 82.4c1 0 2 .7 2 1.8 0 1-.7 2-1.7 2s-1.8-.8-2-2c0-1 .7-1.8 1.8-1.8zM66.7 79.3c-.2-4.4 1.6-6.7 4.4-6.8 3 0 4.8 2 5 6.5s-1.7 6.8-4.5 7c-2.7 0-4.6-2.3-4.8-6.7zM73 79c0-3.4-.8-4.2-1.8-4-1 0-1.8.7-1.6 4.3 0 3.5 1 4.4 2 4.3 1 0 1.6-1 1.5-4.5zM79.8 81.8c1 0 1.8.7 2 1.8 0 1-.8 2-1.8 2s-1.8-.8-2-2c0-1 .8-1.7 1.8-1.8zM83.5 78.7C83.3 74.3 85 72 88 72c2.7-.2 4.6 2 4.7 6.4s-1.6 6.8-4.4 7c-2.8 0-4.7-2.3-4.8-6.7zm6.3-.2c0-3.5-1-4.3-2-4.2-1 0-1.7.8-1.5 4.4 0 3.5 1 4.4 2 4.3 1 0 1.7-1 1.5-4.5zM105.5 81.3h-4l-.7 3.3h-3l3.7-13.2h3.6l4.7 13h-3.2l-1-3zm-.7-2.3l-.4-1.2-1.2-4.2-1 4.3-.3 1h2.8zM110.5 71h3l.4 10.7 5-.2.2 2.5-8.2.3-.5-13.2zM121 70.7l4.7-.2c3 0 5.2 1 5.3 4 0 3.2-2.2 4.7-5 4.7h-1.8l.2 4.6h-3l-.5-13zm4.7 6.2c1.6-.2 2.4-1 2.4-2.3 0-1.4-.8-2-2.4-1.8H124v4h1.7zM133 70.3h3l.3 5 4.5-.2-.2-5h3l.5 13-3 .2v-5.5l-4.6.2.2 5.4h-3l-.5-13zM153.3 79.7h-4l-.7 3.3h-3l3.7-13.2h3.6l4.5 13h-3.2l-1-3zm-.7-2.3l-.4-1.2L151 72l-1 4.3-.3 1.2h3z"/>
+        </symbol>
+
        <symbol id="usersurvey" viewBox="0 0 200 111">
            <title>spaCy user survey 2017</title>
            <path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>
--- a/website/docs/usage/index.jade
+++ b/website/docs/usage/index.jade
@ -40,6 +40,7 @@ p
    +qs({model: 'en'}) python -m spacy download en
    +qs({model: 'de'}) python -m spacy download de
    +qs({model: 'fr'}) python -m spacy download fr
+    +qs({model: 'es'}) python -m spacy download es

 +h(2, "installation") Installation instructions

--- a/website/docs/usage/models.jade
+++ b/website/docs/usage/models.jade
@ -18,7 +18,6 @@ p
    |  skew, which might decrease your accuracy.

 +quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
-    - var examples = {en: "This is a sentence.", de: "Dies ist ein Satz.", fr: "C'est une phrase."}
    for models, lang in MODELS
        - var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def })
        +qs({lang: lang}) python -m spacy download #{lang}
@ -26,7 +25,7 @@ p
        +qs({lang: lang, load: "module"}, "python") import #{package.id}
        +qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load()
        +qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}')
-        +qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{examples[lang]}")
+        +qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{EXAMPLE_SENTENCES[lang]}")
        +qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc])

 +h(2, "available") Available models
--- a/website/index.jade
+++ b/website/index.jade
@ -11,7 +11,7 @@ include _includes/_mixins
    h2.c-landing__title.o-block.u-heading-1
        | in Python

-    +landing-badge("https://survey.spacy.io", "usersurvey", "Take the user survey!")
+    +landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!")

    +grid.o-content
        +grid-col("third").o-card