Merge __init__

Matthew Honnibal 2019-03-08 16:58:42 +01:00
commit 8a9181d95a
5 changed files with 62 additions and 58 deletions

View File

@@ -465,17 +465,16 @@ def getitem(i):
 @describe.attributes(
-    W=Synapses("Weights matrix",
-        lambda obj: (obj.nO, obj.nI),
-        lambda W, ops: None)
+    W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
 )
 class MultiSoftmax(Affine):
-    '''Neural network layer that predicts several multi-class attributes at once.
+    """Neural network layer that predicts several multi-class attributes at once.
     For instance, we might predict one class with 6 variables, and another with 5.
     We predict the 11 neurons required for this, and then softmax them such
     that columns 0-6 make a probability distribution and columns 6-11 make another.
-    '''
-    name = 'multisoftmax'
+    """
+    name = "multisoftmax"

     def __init__(self, out_sizes, nI=None, **kwargs):
         Model.__init__(self, **kwargs)
@@ -487,12 +486,13 @@ class MultiSoftmax(Affine):
         output__BO = self.ops.affine(self.W, self.b, input__BI)
         i = 0
         for out_size in self.out_sizes:
-            self.ops.softmax(output__BO[:, i : i+out_size], inplace=True)
+            self.ops.softmax(output__BO[:, i : i + out_size], inplace=True)
             i += out_size
         return output__BO

-    def begin_update(self, input__BI, drop=0.):
+    def begin_update(self, input__BI, drop=0.0):
         output__BO = self.predict(input__BI)

         def finish_update(grad__BO, sgd=None):
             self.d_W += self.ops.gemm(grad__BO, input__BI, trans1=True)
             self.d_b += grad__BO.sum(axis=0)
@@ -500,6 +500,7 @@ class MultiSoftmax(Affine):
             if sgd is not None:
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
             return grad__BI

         return output__BO, finish_update
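Aside, not part of the diff: the MultiSoftmax docstring above describes a column-wise softmax, where a single output matrix is sliced per attribute and each slice is normalised into its own probability distribution. A minimal numpy sketch of that idea, with illustrative names only (not spaCy/thinc code):

import numpy as np

def multi_softmax(scores, out_sizes):
    # scores: (batch, sum(out_sizes)) raw activations, e.g. (batch, 11) for sizes [6, 5]
    out = scores.copy()
    i = 0
    for out_size in out_sizes:
        block = out[:, i : i + out_size]           # view into this attribute's columns
        block -= block.max(axis=1, keepdims=True)  # subtract max for numerical stability
        np.exp(block, out=block)
        block /= block.sum(axis=1, keepdims=True)  # each slice now sums to 1
        i += out_size
    return out

scores = np.random.randn(2, 11)
probs = multi_softmax(scores, [6, 5])
assert np.allclose(probs[:, :6].sum(axis=1), 1.0)
assert np.allclose(probs[:, 6:].sum(axis=1), 1.0)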
@@ -515,41 +516,41 @@ def build_tagger_model(nr_class, **cfg):
         if "tok2vec" in cfg:
             tok2vec = cfg["tok2vec"]
         else:
-            tok2vec = Tok2Vec(token_vector_width, embed_size,
-                              subword_features=subword_features,
-                              pretrained_vectors=pretrained_vectors)
-        softmax = with_flatten(
-            Softmax(nr_class, token_vector_width))
-        model = (
-            tok2vec
-            >> softmax
-        )
+            tok2vec = Tok2Vec(
+                token_vector_width,
+                embed_size,
+                subword_features=subword_features,
+                pretrained_vectors=pretrained_vectors,
+            )
+        softmax = with_flatten(Softmax(nr_class, token_vector_width))
+        model = tok2vec >> softmax
     model.nI = None
     model.tok2vec = tok2vec
     model.softmax = softmax
     return model


 def build_morphologizer_model(class_nums, **cfg):
-    embed_size = util.env_opt('embed_size', 7000)
-    if 'token_vector_width' in cfg:
-        token_vector_width = cfg['token_vector_width']
+    embed_size = util.env_opt("embed_size", 7000)
+    if "token_vector_width" in cfg:
+        token_vector_width = cfg["token_vector_width"]
     else:
-        token_vector_width = util.env_opt('token_vector_width', 128)
-    pretrained_vectors = cfg.get('pretrained_vectors')
-    subword_features = cfg.get('subword_features', True)
-    with Model.define_operators({'>>': chain, '+': add}):
-        if 'tok2vec' in cfg:
-            tok2vec = cfg['tok2vec']
+        token_vector_width = util.env_opt("token_vector_width", 128)
+    pretrained_vectors = cfg.get("pretrained_vectors")
+    subword_features = cfg.get("subword_features", True)
+    with Model.define_operators({">>": chain, "+": add}):
+        if "tok2vec" in cfg:
+            tok2vec = cfg["tok2vec"]
         else:
-            tok2vec = Tok2Vec(token_vector_width, embed_size,
-                              subword_features=subword_features,
-                              pretrained_vectors=pretrained_vectors)
+            tok2vec = Tok2Vec(
+                token_vector_width,
+                embed_size,
+                subword_features=subword_features,
+                pretrained_vectors=pretrained_vectors,
+            )
         softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
         softmax.out_sizes = class_nums
-        model = (
-            tok2vec
-            >> softmax
-        )
+        model = tok2vec >> softmax
     model.nI = None
     model.tok2vec = tok2vec
     model.softmax = softmax
@@ -630,17 +631,13 @@ def build_text_classifier(nr_class, width=64, **cfg):
         )
         linear_model = _preprocess_doc >> LinearModel(nr_class)

-        if cfg.get('exclusive_classes'):
+        if cfg.get("exclusive_classes"):
             output_layer = Softmax(nr_class, nr_class * 2)
         else:
             output_layer = (
-                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0))
-                >> logistic
+                zero_init(Affine(nr_class, nr_class * 2, drop_factor=0.0)) >> logistic
             )
-        model = (
-            (linear_model | cnn_model)
-            >> output_layer
-        )
+        model = (linear_model | cnn_model) >> output_layer
         model.tok2vec = chain(tok2vec, flatten)
     model.nO = nr_class
     model.lsuv = False
@@ -658,7 +655,9 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False,
         if exclusive_classes:
             output_layer = Softmax(nr_class, tok2vec.nO)
         else:
-            output_layer = zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
+            output_layer = (
+                zero_init(Affine(nr_class, tok2vec.nO, drop_factor=0.0)) >> logistic
+            )
         model = tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> output_layer
     model.tok2vec = chain(tok2vec, flatten)
     model.nO = nr_class
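Aside, not part of the diff: expressions like model = tok2vec >> softmax and model = (linear_model | cnn_model) >> output_layer rely on thinc's Model.define_operators block, which temporarily binds ">>" to chain (feed-forward composition) and, in build_text_classifier, "|" to concatenation of the two sub-models' outputs. A toy sketch of that operator-overloading pattern, using stand-in classes rather than thinc code:

class Toy:
    def __init__(self, fn):
        self.fn = fn

    def __call__(self, x):
        return self.fn(x)

    def __rshift__(self, other):
        # a >> b: run a, then feed its output to b (feed-forward chaining)
        return Toy(lambda x: other(self(x)))

    def __or__(self, other):
        # a | b: run both on the same input and concatenate the results
        return Toy(lambda x: self(x) + other(x))


double = Toy(lambda x: [v * 2 for v in x])
negate = Toy(lambda x: [-v for v in x])
last = Toy(lambda x: x[-1:])

assert (double >> last)([1, 2, 3]) == [6]            # like tok2vec >> softmax
assert (double | negate)([1, 2]) == [2, 4, -1, -2]   # like linear_model | cnn_model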

View File

@@ -350,7 +350,6 @@ class Errors(object):
             "is likely a bug in spaCy.")


 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pre-trained Tagger models is not currently supported.")

View File

@@ -14,6 +14,7 @@ __all__ = [
     "TextCategorizer",
     "Tensorizer",
     "Pipe",
+    "Morphologizer",
     "EntityRuler",
     "SentenceSegmenter",
     "SimilarityHook",

View File

@@ -2,11 +2,7 @@
 from __future__ import unicode_literals

 import pytest
-import numpy
-from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
-from spacy.symbols import VERB
-from spacy.vocab import Vocab
-from spacy.tokens import Doc


 @pytest.fixture
 def i_has(en_tokenizer):
@@ -15,11 +11,13 @@ def i_has(en_tokenizer):
     doc[1].tag_ = "VBZ"
     return doc


 def test_token_morph_id(i_has):
     assert i_has[0].morph.id
     assert i_has[1].morph.id != 0
     assert i_has[0].morph.id != i_has[1].morph.id


 def test_morph_props(i_has):
     assert i_has[0].morph.pron_type == i_has.vocab.strings["PronType_prs"]
     assert i_has[0].morph.pron_type_ == "PronType_prs"

View File

@@ -1,41 +1,48 @@
 # coding: utf-8
 from __future__ import unicode_literals

-import pytest
-from ...morphology import Morphology
-from ...strings import StringStore, get_string_id
-from ...lemmatizer import Lemmatizer
-from ...morphology import *
+import pytest
+
+from spacy.morphology import Morphology
+from spacy.strings import StringStore, get_string_id
+from spacy.lemmatizer import Lemmatizer


 @pytest.fixture
 def morphology():
     return Morphology(StringStore(), {}, Lemmatizer())


 def test_init(morphology):
     pass


 def test_add_morphology_with_string_names(morphology):
     morphology.add({"Case_gen", "Number_sing"})


 def test_add_morphology_with_int_ids(morphology):
     morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")})


 def test_add_morphology_with_mix_strings_and_ints(morphology):
-    morphology.add({get_string_id("PunctSide_ini"), 'VerbType_aux'})
+    morphology.add({get_string_id("PunctSide_ini"), "VerbType_aux"})


 def test_morphology_tags_hash_distinctly(morphology):
-    tag1 = morphology.add({"PunctSide_ini", 'VerbType_aux'})
-    tag2 = morphology.add({"Case_gen", 'Number_sing'})
+    tag1 = morphology.add({"PunctSide_ini", "VerbType_aux"})
+    tag2 = morphology.add({"Case_gen", "Number_sing"})
     assert tag1 != tag2


 def test_morphology_tags_hash_independent_of_order(morphology):
-    tag1 = morphology.add({"Case_gen", 'Number_sing'})
-    tag2 = morphology.add({"Number_sing", "Case_gen"})
+    tag1 = morphology.add({"Case_gen", "Number_sing"})
+    tag2 = morphology.add({"Number_sing", "Case_gen"})
     assert tag1 == tag2


 def test_update_morphology_tag(morphology):
     tag1 = morphology.add({"Case_gen"})
     tag2 = morphology.update(tag1, {"Number_sing"})
     assert tag1 != tag2
-    tag3 = morphology.add({"Number_sing", "Case_gen"})
+    tag3 = morphology.add({"Number_sing", "Case_gen"})
     assert tag2 == tag3