Merge branch 'develop' of https://github.com/explosion/spaCy into develop

commit 1d1f91a041
ines committed 2017-11-01 16:49:44 +01:00
3 changed files with 51 additions and 9 deletions

View File

@@ -26,8 +26,9 @@ from spacy.pipeline import TextCategorizer
 @plac.annotations(
     model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
     output_dir=("Optional output directory", "option", "o", Path),
+    n_examples=("Number of texts to train from", "option", "N", int),
     n_iter=("Number of training iterations", "option", "n", int))
-def main(model=None, output_dir=None, n_iter=20):
+def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
     if model is not None:
         nlp = spacy.load(model)  # load existing spaCy model
         print("Loaded model '%s'" % model)
@@ -50,7 +51,8 @@ def main(model=None, output_dir=None, n_iter=20):
     # load the IMDB dataset
     print("Loading IMDB data...")
-    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=2000)
+    print("Using %d training examples" % n_texts)
+    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
     train_docs = [nlp.tokenizer(text) for text in train_texts]
     train_gold = [GoldParse(doc, cats=cats) for doc, cats in
                   zip(train_docs, train_cats)]
@@ -65,14 +67,14 @@ def main(model=None, output_dir=None, n_iter=20):
         for i in range(n_iter):
             losses = {}
             # batch up the examples using spaCy's minibatch
-            batches = minibatch(train_data, size=compounding(4., 128., 1.001))
+            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
             for batch in batches:
                 docs, golds = zip(*batch)
                 nlp.update(docs, golds, sgd=optimizer, drop=0.2, losses=losses)
             with textcat.model.use_params(optimizer.averages):
                 # evaluate on the dev data split off in load_data()
                 scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
-            print('{0:.3f}\t{0:.3f}\t{0:.3f}\t{0:.3f}'  # print a simple table
+            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                   .format(losses['textcat'], scores['textcat_p'],
                           scores['textcat_r'], scores['textcat_f']))
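
As a side note, the batching call changed above uses spaCy's compounding schedule:
minibatch consumes an infinite generator of sizes that grows geometrically from 4
towards the cap (previously 128, now 32). A minimal sketch of the behaviour,
assuming spaCy 2.x (the `items` list is just a placeholder, not part of the script):

# Not part of this commit: shows what compounding(4., 32., 1.001) yields.
from spacy.util import minibatch, compounding

items = list(range(1000))                  # stand-in for (doc, gold) pairs
sizes = compounding(4., 32., 1.001)        # 4.0, 4.004, 4.008, ... capped at 32
for batch in minibatch(items, size=sizes):
    print(len(batch))                      # grows very slowly at factor 1.001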

View File

@@ -434,7 +434,7 @@ def build_text_classifier(nr_class, width=64, **cfg):
     pretrained_dims = cfg.get('pretrained_dims', 0)
     with Model.define_operators({'>>': chain, '+': add, '|': concatenate,
                                  '**': clone}):
-        if cfg.get('low_data'):
+        if cfg.get('low_data') and pretrained_dims:
             model = (
                 SpacyVectors
                 >> flatten_add_lengths
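
The guard added here matters because the "low data" architecture is built on
SpacyVectors, i.e. on pretrained word vectors; with pretrained_dims == 0 there is
nothing for it to use. A tiny illustration of the condition (the helper name is
hypothetical, not spaCy API):

def uses_low_data_model(cfg):
    # Both must hold: low_data requested AND pretrained vectors available.
    return bool(cfg.get('low_data') and cfg.get('pretrained_dims', 0))

print(uses_low_data_model({'low_data': True, 'pretrained_dims': 300}))  # True
print(uses_low_data_model({'low_data': True, 'pretrained_dims': 0}))    # False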

View File

@@ -11,9 +11,9 @@ import ujson
 import msgpack
 
 from thinc.api import chain
-from thinc.v2v import Softmax
+from thinc.v2v import Affine, Softmax
 from thinc.t2v import Pooling, max_pool, mean_pool
-from thinc.neural.util import to_categorical
+from thinc.neural.util import to_categorical, copy_array
 from thinc.neural._classes.difference import Siamese, CauchySimilarity
 
 from .tokens.doc cimport Doc
@@ -130,6 +130,15 @@ class Pipe(object):
         documents and their predicted scores."""
         raise NotImplementedError
 
+    def add_label(self, label):
+        """Add an output label, to be predicted by the model.
+
+        It's possible to extend pre-trained models with new labels,
+        but care should be taken to avoid the "catastrophic forgetting"
+        problem.
+        """
+        raise NotImplementedError
+
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
         """Initialize the pipe for training, using data exampes if available.
         If no model has been initialized yet, the model is added."""
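
The "catastrophic forgetting" caveat in this docstring refers to a pre-trained model
drifting away from labels it already knows while it is updated on a new one. A common
mitigation is to rehearse: mix examples of the existing labels into every update batch.
A rough sketch, assuming `new_examples` and `old_examples` are lists of
(text, annotations) pairs (the helper itself is hypothetical, not spaCy API):

import random

def mix_revision_data(new_examples, old_examples, ratio=1.0):
    """Interleave new-label examples with 'revision' examples the model
    already handles, so earlier behaviour keeps being rehearsed."""
    n_old = int(len(new_examples) * ratio)
    mixed = list(new_examples) + list(old_examples)[:n_old]
    random.shuffle(mixed)
    return mixed
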
@@ -325,6 +334,14 @@ class Tagger(Pipe):
         self.cfg.setdefault('pretrained_dims',
                             self.vocab.vectors.data.shape[1])
 
+    @property
+    def labels(self):
+        return self.cfg.setdefault('tag_names', [])
+
+    @labels.setter
+    def labels(self, value):
+        self.cfg['tag_names'] = value
+
     def __call__(self, doc):
         tags = self.predict([doc])
         self.set_annotations([doc], tags)
@@ -352,6 +369,7 @@ class Tagger(Pipe):
         cdef Doc doc
         cdef int idx = 0
         cdef Vocab vocab = self.vocab
+        tags = list(self.labels)
         for i, doc in enumerate(docs):
             doc_tag_ids = batch_tag_ids[i]
             if hasattr(doc_tag_ids, 'get'):
@@ -359,7 +377,7 @@ class Tagger(Pipe):
             for j, tag_id in enumerate(doc_tag_ids):
                 # Don't clobber preset POS tags
                 if doc.c[j].tag == 0 and doc.c[j].pos == 0:
-                    vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
+                    vocab.morphology.assign_tag(&doc.c[j], tags[tag_id])
                 idx += 1
             doc.is_tagged = True
@@ -420,6 +438,17 @@ class Tagger(Pipe):
     def Model(cls, n_tags, **cfg):
         return build_tagger_model(n_tags, **cfg)
 
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        smaller = self.model[-1]._layers[-1]
+        larger = Softmax(len(self.labels)+1, smaller.nI)
+        copy_array(larger.W[:smaller.nO], smaller.W)
+        copy_array(larger.b[:smaller.nO], smaller.b)
+        self.model[-1]._layers[-1] = larger
+        self.labels.append(label)
+        return 1
+
     def use_params(self, params):
         with self.model.use_params(params):
             yield
@@ -675,7 +704,7 @@ class TextCategorizer(Pipe):
     @property
     def labels(self):
-        return self.cfg.get('labels', ['LABEL'])
+        return self.cfg.setdefault('labels', ['LABEL'])
 
     @labels.setter
     def labels(self, value):
         self.cfg['labels'] = value
@@ -727,6 +756,17 @@ class TextCategorizer(Pipe):
         mean_square_error = ((scores-truths)**2).sum(axis=1).mean()
         return mean_square_error, d_scores
 
+    def add_label(self, label):
+        if label in self.labels:
+            return 0
+        smaller = self.model[-1]._layers[-1]
+        larger = Affine(len(self.labels)+1, smaller.nI)
+        copy_array(larger.W[:smaller.nO], smaller.W)
+        copy_array(larger.b[:smaller.nO], smaller.b)
+        self.model[-1]._layers[-1] = larger
+        self.labels.append(label)
+        return 1
+
     def begin_training(self, gold_tuples=tuple(), pipeline=None):
         if pipeline and getattr(pipeline[0], 'name', None) == 'tensorizer':
             token_vector_width = pipeline[0].model.nO
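
Both add_label implementations above rely on the same resize-and-copy trick: build a
wider output layer and copy the trained weights and biases into its first rows, so
existing labels keep their learned parameters and only the new row starts from scratch.
A standalone sketch of that step, assuming thinc 6.x (the layer sizes here are made up
for illustration):

from thinc.v2v import Softmax
from thinc.neural.util import copy_array

smaller = Softmax(3, 128)                     # existing output layer: 3 labels, 128 inputs
larger = Softmax(4, smaller.nI)               # room for one more label, same input width
copy_array(larger.W[:smaller.nO], smaller.W)  # carry over old label weights
copy_array(larger.b[:smaller.nO], smaller.b)  # carry over old label biases
# The appended row keeps its initial values until the model is trained again.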