mirror of https://github.com/explosion/spaCy.git
Merge branch 'develop' of https://github.com/explosion/spaCy into develop
Commit 1d1f91a041
@@ -26,8 +26,9 @@ from spacy.pipeline import TextCategorizer
 @plac.annotations(
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
+    n_examples=("Number of texts to train from", "option", "N", int),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=20):
+def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
         print("Loaded model '%s'" % model)
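For orientation, spaCy's example scripts are driven by plac, which maps each annotation key onto the function parameter of the same name; note that the new `n_examples` key would have to agree with the `n_texts` argument for the option to wire up. A minimal sketch of the usual entry point (assumed here, as in the repo's other examples):

    if __name__ == '__main__':
        import plac
        plac.call(main)    # parses sys.argv against the @plac.annotations above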
@@ -50,7 +51,8 @@ def main(model=None, output_dir=None, n_iter=20):
 
     # load the IMDB dataset
     print("Loading IMDB data...")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
+    print("Using %d training examples" % n_texts)
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
     train_docs = [nlp.tokenizer(text) for text in train_texts]
     train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                   zip(train_docs, train_cats)]
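A hedged sketch of the shapes `load_data()` is assumed to return here, matching how `GoldParse(doc, cats=cats)` consumes them (the 'POSITIVE' label is illustrative):

    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.blank('en')
    train_texts = ['A gripping, beautifully shot film.', 'Two hours I will never get back.']
    train_cats = [{'POSITIVE': True}, {'POSITIVE': False}]   # one dict of label -> bool per text
    gold = GoldParse(nlp.tokenizer(train_texts[0]), cats=train_cats[0])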
@@ -65,14 +67,14 @@ def main(model=None, output_dir=None, n_iter=20):
         for i in range(n_iter):
             losses = {}
             # batch up the examples using spaCy's minibatch
-            batches = minibatch(train_data, size=compounding(4., 128., 1.001))
+            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
             with textcat.model.use_params(optimizer.averages):
                 # evaluate on the dev data split off in load_data()
                 scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
-            print('{0:.3f}\t{0:.3f}\t{0:.3f}\t{0:.3f}'  # print a simple table
+            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                   .format(losses['textcat'], scores['textcat_p'],
                           scores['textcat_r'], scores['textcat_f']))
 
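Two things worth seeing concretely: `compounding(4., 32., 1.001)` is an infinite generator whose values grow geometrically from 4 and are clipped at 32, so batches get larger as training stabilises; and the print fix matters because `{0}` reuses the first `format()` argument four times. A quick sketch:

    from spacy.util import compounding

    sizes = compounding(4., 32., 1.001)             # infinite generator
    first = [next(sizes) for _ in range(3)]         # 4.0, 4.004, 4.008004, ...

    print('{0:.3f}\t{0:.3f}'.format(0.5, 0.9))      # 0.500  0.500  (the old bug)
    print('{0:.3f}\t{1:.3f}'.format(0.5, 0.9))      # 0.500  0.900  (the fix)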
@@ -434,7 +434,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
     pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
                                  '**': clone}):
-        if cfg.get('low_data'):
+        if cfg.get('low_data') and pretrained_dims:
             model = (
                 SpacyVectors
                 >> flatten_add_lengths
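The tightened condition keeps the low-data architecture, which embeds texts with `SpacyVectors`, from being selected when there are no pretrained vectors to embed with (`pretrained_dims` of 0). A toy illustration of the guard's truthiness (values hypothetical):

    cfg = {'low_data': True, 'pretrained_dims': 0}   # hypothetical config
    if cfg.get('low_data') and cfg.get('pretrained_dims', 0):
        chosen = 'low-data model'    # only reachable when vectors exist
    else:
        chosen = 'full model'        # taken here: no vectors to embed with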
@@ -11,9 +11,9 @@ import ujson
 import msgpack
 
 from thinc.api import chain
-from thinc.v2v import Softmax
+from thinc.v2v import Affine, Softmax
 from thinc.t2v import Pooling, max_pool, mean_pool
-from thinc.neural.util import to_categorical
+from thinc.neural.util import to_categorical, copy_array
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 
 from .tokens.doc cimport Doc
@@ -130,6 +130,15 @@ class Pipe(object):
         documents and their predicted scores."""
         raise NotImplementedError
 
+    def add_label(self, label):
+        """Add an output label, to be predicted by the model.
+
+        It's possible to extend pre-trained models with new labels,
+        but care should be taken to avoid the "catastrophic forgetting"
+        problem.
+        """
+        raise NotImplementedError
+
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
         """Initialize the pipe for training, using data examples if available.
         If no model has been initialized yet, the model is added."""
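The new base-class hook gives every pipe a uniform way to grow its output space. A hedged usage sketch, relying on the 0/1 return values defined in the concrete implementations below (and assuming `nlp` already carries a trained 'textcat' pipe, e.g. after the training loop above):

    textcat = nlp.get_pipe('textcat')
    if textcat.add_label('SPORTS'):     # 1 -> the output layer was resized
        print("added label 'SPORTS'")
    else:                               # 0 -> label was already present
        print("label already present")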
@@ -325,6 +334,14 @@ class Tagger(Pipe):
         self.cfg.setdefault('pretrained_dims',
                             self.vocab.vectors.data.shape[1])
 
+    @property
+    def labels(self):
+        return self.cfg.setdefault('tag_names', [])
+
+    @labels.setter
+    def labels(self, value):
+        self.cfg['tag_names'] = value
+
     def __call__(self, doc):
         tags = self.predict([doc])
         self.set_annotations([doc], tags)
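Storing the tag names under `cfg['tag_names']` (rather than on the model) means they travel with the pipe's serialized config, and the getter hands back the live list, which is what lets `add_label` below persist appends. A small sketch of the property pair (assuming `nlp` has a tagger pipe):

    tagger = nlp.get_pipe('tagger')
    tagger.labels = ['NOUN', 'VERB']     # the setter writes cfg['tag_names']
    assert tagger.cfg['tag_names'] == ['NOUN', 'VERB']
    tagger.labels.append('ADJ')          # the getter returns the live cfg list
    assert 'ADJ' in tagger.cfg['tag_names']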
@@ -352,6 +369,7 @@ class Tagger(Pipe):
         cdef Doc doc
         cdef int idx = 0
         cdef Vocab vocab = self.vocab
+        tags = list(self.labels)
         for i, doc in enumerate(docs):
            doc_tag_ids = batch_tag_ids[i]
            if hasattr(doc_tag_ids, 'get'):
@@ -359,7 +377,7 @@ class Tagger(Pipe):
             for j, tag_id in enumerate(doc_tag_ids):
                 # Don't clobber preset POS tags
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:
-                    vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
+                    vocab.morphology.assign_tag(&doc.c[j], tags[tag_id])
                 idx += 1
             doc.is_tagged = True
 
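The switch from `assign_tag_id` to `assign_tag` routes assignment through the tag's string name: `predict()` returns class indices, and `tags = list(self.labels)` turns each index into the name the morphology expects. In toy form (names illustrative):

    labels = ['NOUN', 'VERB', 'ADJ']    # stand-in for list(self.labels)
    tag_id = 1                          # a class index from predict()
    assert labels[tag_id] == 'VERB'     # the string that assign_tag() receives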
@@ -420,6 +438,17 @@ class Tagger(Pipe):
     def Model(cls, n_tags, **cfg):
         return build_tagger_model(n_tags, **cfg)
 
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        smaller = self.model[-1]._layers[-1]
+        larger = Softmax(len(self.labels)+1, smaller.nI)
+        copy_array(larger.W[:smaller.nO], smaller.W)
+        copy_array(larger.b[:smaller.nO], smaller.b)
+        self.model[-1]._layers[-1] = larger
+        self.labels.append(label)
+        return 1
+
     def use_params(self, params):
         with self.model.use_params(params):
             yield
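The resize trick is worth seeing in the small: allocate an output layer with one more row, then copy the trained weights and biases into the leading rows so existing labels keep their parameters. A minimal numpy sketch of the same copy (shapes hypothetical):

    import numpy as np

    n_labels, n_inputs = 3, 64
    small_W = np.random.randn(n_labels, n_inputs)    # trained weights
    small_b = np.random.randn(n_labels)              # trained biases
    large_W = np.zeros((n_labels + 1, n_inputs))     # one extra output row
    large_b = np.zeros(n_labels + 1)
    large_W[:n_labels] = small_W    # copy_array(larger.W[:smaller.nO], smaller.W)
    large_b[:n_labels] = small_b    # copy_array(larger.b[:smaller.nO], smaller.b)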
@@ -675,7 +704,7 @@ class TextCategorizer(Pipe):
 
     @property
     def labels(self):
-        return self.cfg.get('labels', ['LABEL'])
+        return self.cfg.setdefault('labels', ['LABEL'])
 
     @labels.setter
     def labels(self, value):
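The `get` to `setdefault` change is subtle but load-bearing: `get` leaves `cfg` untouched, so the default label list was never written back and in-place mutations (like the `self.labels.append(label)` below) were lost; `setdefault` stores the list on first access. In isolation:

    cfg = {}
    cfg.get('labels', ['LABEL'])          # returns a fresh list; cfg is still {}
    cfg.setdefault('labels', ['LABEL'])   # stores and returns it; cfg now holds 'labels'
    cfg['labels'].append('NEW')           # mutations are now persistent
    assert cfg['labels'] == ['LABEL', 'NEW']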
@@ -727,6 +756,17 @@ class TextCategorizer(Pipe):
         mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
         return mean_square_error, d_scores
 
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        smaller = self.model[-1]._layers[-1]
+        larger = Affine(len(self.labels)+1, smaller.nI)
+        copy_array(larger.W[:smaller.nO], smaller.W)
+        copy_array(larger.b[:smaller.nO], smaller.b)
+        self.model[-1]._layers[-1] = larger
+        self.labels.append(label)
+        return 1
+
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
             token_vector_width = pipeline[0].model.nO
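The two `add_label` implementations differ only in the layer they grow: the Tagger resizes a `Softmax` (one tag per token, scores normalised to a distribution), while the TextCategorizer resizes a plain `Affine`, since its class scores are squashed independently later in the network for multi-label use. Extending a trained model then follows the usual pattern; a hedged sketch (assuming a pipeline with a trained 'textcat' pipe):

    textcat = nlp.get_pipe('textcat')
    textcat.add_label('NEUTRAL')    # grows the Affine to len(labels) + 1 outputs
    # ...then keep calling nlp.update() with examples that cover old and new
    # labels alike, to limit the catastrophic forgetting the docstring warns about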