Merge branch 'develop' of https://github.com/explosion/spaCy into develop

This commit is contained in:
Matthew Honnibal 2017-06-01 13:03:57 -05:00
commit c650bc481c
8 changed files with 57 additions and 21 deletions

View File

@ -10,6 +10,7 @@ cimport numpy as np
import cytoolz
import util
from collections import OrderedDict
import ujson
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.neural import Model, Maxout, Softmax, Affine
@ -33,6 +34,7 @@ from .gold cimport GoldParse
from .morphology cimport Morphology
from .vocab cimport Vocab
from .syntax import nonproj
from .compat import json_dumps
from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
@ -308,7 +310,7 @@ class NeuralTagger(object):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(b)
self.model.from_bytes(b)
deserialize = OrderedDict((
('vocab', lambda b: self.vocab.from_bytes(b)),
('model', lambda b: load_model(b)),
@ -317,17 +319,33 @@ class NeuralTagger(object):
return self
def to_disk(self, path, **exclude):
serialize = {
'model': lambda p: p.open('wb').write(self.model.to_bytes()),
'vocab': lambda p: self.vocab.to_disk(p)
}
serialize = OrderedDict((
('vocab', lambda p: self.vocab.to_disk(p)),
('tag_map', lambda p: p.open('w').write(json_dumps(
self.vocab.morphology.tag_map))),
('model', lambda p: p.open('wb').write(self.model.to_bytes())),
))
util.to_disk(path, serialize, exclude)
def from_disk(self, path, **exclude):
deserialize = {
'model': lambda p: self.model.from_bytes(p.open('rb').read()),
'vocab': lambda p: self.vocab.from_disk(p)
}
def load_model(p):
if self.model is True:
token_vector_width = util.env_opt('token_vector_width', 128)
self.model = self.Model(self.vocab.morphology.n_tags, token_vector_width)
self.model.from_bytes(p.open('rb').read())
def load_tag_map(p):
with p.open() as file_:
tag_map = ujson.loads(file_.read())
self.vocab.morphology = Morphology(
self.vocab.strings, tag_map=tag_map,
lemmatizer=self.vocab.morphology.lemmatizer)
deserialize = OrderedDict((
('vocab', lambda p: self.vocab.from_disk(p)),
('tag_map', load_tag_map),
('model', load_model),
))
util.from_disk(path, deserialize, exclude)
return self

View File

@ -315,7 +315,6 @@ cdef class Vocab:
getters = OrderedDict((
('strings', lambda: self.strings.to_bytes()),
('lexemes', lambda: self.lexemes_to_bytes()),
('tag_map', lambda: self.morphology.tag_map),
))
return util.to_bytes(getters, exclude)
@ -326,13 +325,9 @@ cdef class Vocab:
**exclude: Named attributes to prevent from being loaded.
RETURNS (Vocab): The `Vocab` object.
"""
def set_tag_map(tag_map):
self.morphology = Morphology(self.strings, tag_map,
self.morphology.lemmatizer)
setters = OrderedDict((
('strings', lambda b: self.strings.from_bytes(b)),
('lexemes', lambda b: self.lexemes_from_bytes(b)),
('tag_map', lambda b: set_tag_map(b))
))
return util.from_bytes(bytes_data, setters, exclude)

View File

@ -77,7 +77,8 @@
{ "id": "model", "title": "Models", "multiple": true, "options": [
{ "id": "en", "title": "English", "meta": "50MB" },
{ "id": "de", "title": "German", "meta": "645MB" },
{ "id": "fr", "title": "French", "meta": "1.33GB" }]
{ "id": "fr", "title": "French", "meta": "1.33GB" },
{ "id": "es", "title": "Spanish", "meta": "378MB"}]
}
],
@ -85,7 +86,8 @@
{ "id": "lang", "title": "Language", "options": [
{ "id": "en", "title": "English", "checked": true },
{ "id": "de", "title": "German" },
{ "id": "fr", "title": "French" }]
{ "id": "fr", "title": "French" },
{ "id": "es", "title": "Spanish" }]
},
{ "id": "load", "title": "Loading style", "options": [
{ "id": "spacy", "title": "Use spacy.load()", "checked": true, "help": "Use spaCy's built-in loader to load the model by name." },
@ -108,9 +110,19 @@
],
"fr": [
{ "id": "fr_depvec_web_lg", "lang": "French", "feats": [1, 1, 0, 1], "size": "1.33 GB", "license": "CC BY-NC" }
],
"es": [
{ "id": "es_core_web_md", "lang": "Spanish", "feats": [1, 1, 1, 1], "size": "378 MB", "license": "CC BY-SA"}
]
},
"EXAMPLE_SENTENCES": {
"en": "This is a sentence.",
"de": "Dies ist ein Satz.",
"fr": "C'est une phrase.",
"es": "Esto es una frase."
},
"ALPHA": true,
"V_CSS": "1.6",
"V_JS": "1.2",

View File

@ -107,13 +107,13 @@ mixin button(url, trusted, ...style)
height - [integer] optional height to clip code block to
mixin code(label, language, icon, height)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : "" style=height ? "height: #{height}px" : "")&attributes(attributes)
pre.c-code-block.o-block(class="lang-#{(language || DEFAULT_SYNTAX)}" class=icon ? "c-code-block--has-icon" : null style=height ? "height: #{height}px" : null)&attributes(attributes)
if label
h4.u-text-label.u-text-label--dark=label
if icon
- var classes = {'accept': 'u-color-green', 'reject': 'u-color-red'}
.c-code-block__icon(class=classes[icon] || "" class=classes[icon] ? "c-code-block__icon--border" : "")
.c-code-block__icon(class=classes[icon] || null class=classes[icon] ? "c-code-block__icon--border" : null)
+icon(icon, 18)
code.c-code-block__content

View File

@ -1,5 +1,16 @@
<svg style="position: absolute; width: 0; height: 0;" width="0" height="0" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<defs>
<symbol id="v2alpha" viewBox="0 0 200 111">
<title>spaCy v2.0.0 alpha</title>
<path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>
<path fill="#a3cad3" d="M45.4 105.4L19.6 94.6l25.4-1"/>
<path fill="#ddd" d="M196.6 2L155 3.4l1 29.2 41.6-1.4L187.2 17"/>
<path fill="#a3cad3" d="M155 3.4l25.8 10.8-25.4 1"/>
<path fill="#fff" d="M17.6 19.4l163-5.6 1 29.2-163 5.6zM19.2 65.6l163-5.6 1 29.2-163 5.6z"/>
<path fill="#008EBC" d="M45.8 29h-3.6v-2.4l10-.4.2 2.5h-3.6l.4 10.8h-3L45.8 29zM62 39L59 34.5h-1.6l.2 5h-3l-.5-13.2L59 26c3 0 5.2.8 5.3 4 0 1.8-.8 3-2.2 3.8l3.3 5.2H62zm-4.5-6.8H59c1.6-.2 2.4-.8 2.3-2 0-1.4-1-1.8-2.5-1.8h-1.5l.2 3.8zM69 34.2l-4.3-8.4H68l1.2 3 1.2 2.8c.4-1 .8-2 1-3l1.2-3 3-.2L72 34l.2 4.7h-3l-.2-4.5zM79.5 25.3h3.2l1.8 6 1.2 4.2c.5-1.5.7-2.8 1-4.3L88 25h3L87.7 38H84l-4.5-13zM92.4 25l8.3-.4V27l-5.2.3V30l4.6-.3.2 2.5-4.5.2v3l5.6-.2v2.5L93 38l-.6-13zM111 37.4l-2.6-4.7h-1.6l.2 5h-3l-.5-13.2 4.8-.2c2.8 0 5 .8 5.2 4 0 1.8-.8 3-2.2 3.8l3.2 5.3H111zm-4.3-7h1.5c1.6 0 2.4-.7 2.3-2 0-1.3-1-1.7-2.5-1.7h-1.5l.2 3.8zM116.8 33.5c1 .8 2.2 1.3 3.3 1.3 1.3 0 2-.5 2-1.3s-1-1-2-1.5l-1.8-.7c-1.4-.5-2.7-1.6-2.8-3.5 0-2.2 1.8-4 4.6-4 1.5-.2 3 .4 4.3 1.5l-1.4 2c-1-.7-1.8-1-3-1-1 0-1.6.4-1.5 1.2 0 .8 1 1 2 1.5l1.8.6c1.6.6 2.7 1.6 2.7 3.5 0 2.3-1.7 4.2-4.8 4.4-1.7 0-3.6-.5-5-1.7l1.6-2.2zM126.8 23.7h3l.5 13-3 .2-.5-13.3zM132.5 30c0-4.3 2.2-7 5.8-7 3.6 0 6 2.3 6.2 6.6 0 4.3-2.2 7-5.8 7-3.5.3-6-2.3-6.2-6.6zm9-.3c-.2-2.6-1.4-4.2-3.2-4-1.8 0-3 1.6-2.8 4.2 0 2.5 1.3 4.2 3 4 2 0 3-1.6 3-4.3zM146.7 23h3l3.8 6.3 1.4 3c-.2-1.5-.5-3.3-.5-5l-.2-4.6h2.8l.6 13-3 .2-3.8-6.6-1.4-2.8c0 1.5.4 3.2.4 4.8l.2 4.7-3 .2-.3-13.2z"/>
<path fill="#1A1E23" d="M50.2 84.7c3.2-3.2 5.4-5.5 5.3-7.3 0-1.3-.8-2-2-2-.8 0-1.5.8-2 1.5l-1.8-1.6c1.2-1.4 2.4-2 4.2-2.2 2.4 0 4.2 1.5 4.3 4 0 2-2 4.4-4 6.7.7-.2 1.6-.3 2.2-.3H59l.2 2.4-9 .4v-1.7zM63 82.4c1 0 2 .7 2 1.8 0 1-.7 2-1.7 2s-1.8-.8-2-2c0-1 .7-1.8 1.8-1.8zM66.7 79.3c-.2-4.4 1.6-6.7 4.4-6.8 3 0 4.8 2 5 6.5s-1.7 6.8-4.5 7c-2.7 0-4.6-2.3-4.8-6.7zM73 79c0-3.4-.8-4.2-1.8-4-1 0-1.8.7-1.6 4.3 0 3.5 1 4.4 2 4.3 1 0 1.6-1 1.5-4.5zM79.8 81.8c1 0 1.8.7 2 1.8 0 1-.8 2-1.8 2s-1.8-.8-2-2c0-1 .8-1.7 1.8-1.8zM83.5 78.7C83.3 74.3 85 72 88 72c2.7-.2 4.6 2 4.7 6.4s-1.6 6.8-4.4 7c-2.8 0-4.7-2.3-4.8-6.7zm6.3-.2c0-3.5-1-4.3-2-4.2-1 0-1.7.8-1.5 4.4 0 3.5 1 4.4 2 4.3 1 0 1.7-1 1.5-4.5zM105.5 81.3h-4l-.7 3.3h-3l3.7-13.2h3.6l4.7 13h-3.2l-1-3zm-.7-2.3l-.4-1.2-1.2-4.2-1 4.3-.3 1h2.8zM110.5 71h3l.4 10.7 5-.2.2 2.5-8.2.3-.5-13.2zM121 70.7l4.7-.2c3 0 5.2 1 5.3 4 0 3.2-2.2 4.7-5 4.7h-1.8l.2 4.6h-3l-.5-13zm4.7 6.2c1.6-.2 2.4-1 2.4-2.3 0-1.4-.8-2-2.4-1.8H124v4h1.7zM133 70.3h3l.3 5 4.5-.2-.2-5h3l.5 13-3 .2v-5.5l-4.6.2.2 5.4h-3l-.5-13zM153.3 79.7h-4l-.7 3.3h-3l3.7-13.2h3.6l4.5 13h-3.2l-1-3zm-.7-2.3l-.4-1.2L151 72l-1 4.3-.3 1.2h3z"/>
</symbol>
<symbol id="usersurvey" viewBox="0 0 200 111">
<title>spaCy user survey 2017</title>
<path fill="#ddd" d="M183.3 89.2l-164.6-40-1-29.2 164.6 40M3.8 106.8l41.6-1.4-1-29.2-41.6 1.4L13.2 92"/>

Before

Width:  |  Height:  |  Size: 18 KiB

After

Width:  |  Height:  |  Size: 21 KiB

View File

@ -40,6 +40,7 @@ p
+qs({model: 'en'}) python -m spacy download en
+qs({model: 'de'}) python -m spacy download de
+qs({model: 'fr'}) python -m spacy download fr
+qs({model: 'es'}) python -m spacy download es
+h(2, "installation") Installation instructions

View File

@ -18,7 +18,6 @@ p
| skew, which might decrease your accuracy.
+quickstart(QUICKSTART_MODELS, "Quickstart", "Install a default model, get the code to load it from within spaCy and an example to test it. For more options, see the section on available models below.")
- var examples = {en: "This is a sentence.", de: "Dies ist ein Satz.", fr: "C'est une phrase."}
for models, lang in MODELS
- var package = (models.length == 1) ? models[0] : models.find(function(m) { return m.def })
+qs({lang: lang}) python -m spacy download #{lang}
@ -26,7 +25,7 @@ p
+qs({lang: lang, load: "module"}, "python") import #{package.id}
+qs({lang: lang, load: "module"}, "python") nlp = #{package.id}.load()
+qs({lang: lang, load: "spacy"}, "python") nlp = spacy.load('#{lang}')
+qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{examples[lang]}")
+qs({lang: lang, config: "example"}, "python") doc = nlp(u"#{EXAMPLE_SENTENCES[lang]}")
+qs({lang: lang, config: "example"}, "python") print([(w.text, w.pos_) for w in doc])
+h(2, "available") Available models

View File

@ -11,7 +11,7 @@ include _includes/_mixins
h2.c-landing__title.o-block.u-heading-1
| in Python
+landing-badge("https://survey.spacy.io", "usersurvey", "Take the user survey!")
+landing-badge(gh("spaCy") + "/releases/tag/v2.0.0-alpha", "v2alpha", "Try spaCy v2.0.0 alpha!")
+grid.o-content
+grid-col("third").o-card