diff --git a/examples/paddle/sentiment_bilstm/config.py b/examples/paddle/sentiment_bilstm/config.py
deleted file mode 100644
index 311359f2c..000000000
--- a/examples/paddle/sentiment_bilstm/config.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from paddle.trainer_config_helpers import *
-
-define_py_data_sources2(train_list='train.list',
-                        test_list='test.list',
-                        module="dataprovider",
-                        obj="process")
-
-settings(
-    batch_size=128,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25
-)
diff --git a/examples/paddle/sentiment_bilstm/dataprovider.py b/examples/paddle/sentiment_bilstm/dataprovider.py
deleted file mode 100644
index d4fb57756..000000000
--- a/examples/paddle/sentiment_bilstm/dataprovider.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from paddle.trainer.PyDataProvider2 import *
-from itertools import izip
-import spacy
-
-
-def get_features(doc):
-    return numpy.asarray(
-        [t.rank+1 for t in doc
-         if t.has_vector and not t.is_punct and not t.is_space],
-        dtype='int32')
-
-
-def read_data(data_dir):
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            with filename.open() as file_:
-                text = file_.read()
-            yield text, label
-
-
-def on_init(settings, **kwargs):
-    print("Loading spaCy")
-    nlp = spacy.load('en', entity=False)
-    vectors = get_vectors(nlp)
-    settings.input_types = [
-        # The text is a sequence of integer values, and each value is a word id.
-        # The whole sequence is the sentences that we want to predict its
-        # sentimental.
-        integer_value(vectors.shape[0], seq_type=SequenceType), # text input
-
-        # label positive/negative
-        integer_value(2)
-    ]
-    settings.nlp = nlp
-    settings.vectors = vectors
-    settings['batch_size'] = 32
-
-
-@provider(init_hook=on_init)
-def process(settings, data_dir): # settings is not used currently.
-    texts, labels = read_data(data_dir)
-    for doc, label in izip(nlp.pipe(texts, batch_size=5000, n_threads=3), labels):
-        for sent in doc.sents:
-            ids = get_features(sent)
-            # give data to paddle.
-            yield ids, label
diff --git a/examples/paddle/sentiment_bilstm/networks.py b/examples/paddle/sentiment_bilstm/networks.py
deleted file mode 100644
index 84e9732c0..000000000
--- a/examples/paddle/sentiment_bilstm/networks.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from paddle.trainer_config_helpers import *
-
-
-def bidirectional_lstm_net(input_dim,
-                           class_dim=2,
-                           emb_dim=128,
-                           lstm_dim=128,
-                           is_predict=False):
-    data = data_layer("word", input_dim)
-    emb = embedding_layer(input=data, size=emb_dim)
-    bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
-    dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
-    output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation())
-
-    if not is_predict:
-        lbl = data_layer("label", 1)
-        outputs(classification_cost(input=output, label=lbl))
-    else:
-        outputs(output)
diff --git a/examples/paddle/sentiment_bilstm/train.sh b/examples/paddle/sentiment_bilstm/train.sh
deleted file mode 100755
index ffc6dd4bd..000000000
--- a/examples/paddle/sentiment_bilstm/train.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-config=config.py
-output=./model_output
-paddle train --config=$config \
-    --save_dir=$output \
-    --job=train \
-    --use_gpu=false \
-    --trainer_count=4 \
-    --num_passes=10 \
-    --log_period=20 \
-    --dot_period=20 \
-    --show_parameter_stats_period=100 \
-    --test_all_data_in_one_period=1 \
-    --config_args=batch_size=100 \
-    2>&1 | tee 'train.log'_
diff --git a/examples/sentiment/__init__.py b/examples/sentiment/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/sentiment/main.py b/examples/sentiment/main.py
deleted file mode 100644
index 7833a9ac3..000000000
--- a/examples/sentiment/main.py
+++ /dev/null
@@ -1,86 +0,0 @@
-from __future__ import unicode_literals
-from __future__ import print_function
-
-import plac
-from pathlib import Path
-import random
-
-import spacy.en
-import model
-
-
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-
-
-def read_data(nlp, data_dir):
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            text = filename.open().read()
-            doc = nlp(text)
-            yield doc, label
-
-
-def partition(examples, split_size):
-    examples = list(examples)
-    random.shuffle(examples)
-    n_docs = len(examples)
-    split = int(n_docs * split_size)
-    return examples[:split], examples[split:]
-
-
-class Dataset(object):
-    def __init__(self, nlp, data_dir, batch_size=24):
-        self.batch_size = batch_size
-        self.train, self.dev = partition(read_data(nlp, Path(data_dir)), 0.8)
-        print("Read %d train docs" % len(self.train))
-        print("Pos. Train: ", sum(eg[1] == 1 for eg in self.train))
-        print("Read %d dev docs" % len(self.dev))
-        print("Neg. Dev: ", sum(eg[1] == 1 for eg in self.dev))
-
-    def batches(self, data):
-        for i in range(0, len(data), self.batch_size):
-            yield data[i : i + self.batch_size]
-
-
-def model_writer(out_dir, name):
-    def save_model(epoch, params):
-        out_path = out_dir / name.format(epoch=epoch)
-        pickle.dump(params, out_path.open('wb'))
-    return save_model
-
-
-@plac.annotations(
-    data_dir=("Data directory", "positional", None, Path),
-    vocab_size=("Number of words to fine-tune", "option", "w", int),
-    n_iter=("Number of iterations (epochs)", "option", "i", int),
-    vector_len=("Size of embedding vectors", "option", "e", int),
-    hidden_len=("Size of hidden layers", "option", "H", int),
-    depth=("Depth", "option", "d", int),
-    drop_rate=("Drop-out rate", "option", "r", float),
-    rho=("Regularization penalty", "option", "p", float),
-    batch_size=("Batch size", "option", "b", int),
-    out_dir=("Model directory", "positional", None, Path)
-)
-def main(data_dir, out_dir, n_iter=10, vector_len=300, vocab_size=20000,
-         hidden_len=300, depth=3, drop_rate=0.3, rho=1e-4, batch_size=24):
-    print("Loading")
-    nlp = spacy.en.English(parser=False)
-    dataset = Dataset(nlp, data_dir / 'train', batch_size)
-    print("Training")
-    network = model.train(dataset, vector_len, hidden_len, 2, vocab_size, depth,
-                          drop_rate, rho, n_iter,
-                          model_writer(out_dir, 'model_{epoch}.pickle'))
-    score = model.Scorer()
-    print("Evaluating")
-    for doc, label in read_data(nlp, data_dir / 'test'):
-        word_ids, embeddings = model.get_words(doc, 0.0, vocab_size)
-        guess = network.forward(word_ids, embeddings)
-        score += guess == label
-    print(score)
-
-
-if __name__ == '__main__':
-    plac.call(main)
diff --git a/examples/sentiment/model.py b/examples/sentiment/model.py
deleted file mode 100644
index 961f511cd..000000000
--- a/examples/sentiment/model.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from __future__ import division
-from numpy import average, zeros, outer, random, exp, sqrt, concatenate, argmax
-import numpy
-
-from .util import Scorer
-
-
-class Adagrad(object):
-    def __init__(self, dim, lr):
-        self.dim = dim
-        self.eps = 1e-3
-        # initial learning rate
-        self.learning_rate = lr
-        # stores sum of squared gradients
-        self.h = zeros(self.dim)
-        self._curr_rate = zeros(self.h.shape)
-
-    def rescale(self, gradient):
-        self._curr_rate.fill(0)
-        self.h += gradient ** 2
-        self._curr_rate = self.learning_rate / (sqrt(self.h) + self.eps)
-        return self._curr_rate * gradient
-
-    def reset_weights(self):
-        self.h = zeros(self.dim)
-
-
-class Params(object):
-    @classmethod
-    def zero(cls, depth, n_embed, n_hidden, n_labels, n_vocab):
-        return cls(depth, n_embed, n_hidden, n_labels, n_vocab, lambda x: zeros((x,)))
-
-    @classmethod
-    def random(cls, depth, nE, nH, nL, nV):
-        return cls(depth, nE, nH, nL, nV, lambda x: (random.rand(x) * 2 - 1) * 0.08)
-
-    def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, initializer):
-        nE = n_embed; nH = n_hidden; nL = n_labels; nV = n_vocab
-        n_weights = sum([
-            (nE * nH) + nH,
-            (nH * nH + nH) * depth,
-            (nH * nL) + nL,
-            (nV * nE)
-        ])
-        self.data = initializer(n_weights)
-        self.W = []
-        self.b = []
-        i = self._add_layer(0, nE, nH)
-        for _ in range(1, depth):
-            i = self._add_layer(i, nH, nH)
-        i = self._add_layer(i, nL, nH)
-        self.E = self.data[i : i + (nV * nE)].reshape((nV, nE))
-        self.E.fill(0)
-
-    def _add_layer(self, start, x, y):
-        end = start + (x * y)
-        self.W.append(self.data[start : end].reshape((x, y)))
-        self.b.append(self.data[end : end + x].reshape((x, )))
-        return end + x
-
-
-def softmax(actvn, W, b):
-    w = W.dot(actvn) + b
-    ew = exp(w - max(w))
-    return (ew / sum(ew)).ravel()
-
-
-def relu(actvn, W, b):
-    x = W.dot(actvn) + b
-    return x * (x > 0)
-
-
-def d_relu(x):
-    return x > 0
-
-
-class Network(object):
-    def __init__(self, depth, n_embed, n_hidden, n_labels, n_vocab, rho=1e-4, lr=0.005):
-        self.depth = depth
-        self.n_embed = n_embed
-        self.n_hidden = n_hidden
-        self.n_labels = n_labels
-        self.n_vocab = n_vocab
-
-        self.params = Params.random(depth, n_embed, n_hidden, n_labels, n_vocab)
-        self.gradient = Params.zero(depth, n_embed, n_hidden, n_labels, n_vocab)
-        self.adagrad = Adagrad(self.params.data.shape, lr)
-        self.seen_words = {}
-
-        self.pred = zeros(self.n_labels)
-        self.actvn = zeros((self.depth, self.n_hidden))
-        self.input_vector = zeros((self.n_embed, ))
-
-    def forward(self, word_ids, embeddings):
-        self.input_vector.fill(0)
-        self.input_vector += sum(embeddings)
-        # Apply the fine-tuning we've learned
-        for id_ in word_ids:
-            if id_ < self.n_vocab:
-                self.input_vector += self.params.E[id_]
-        # Average
-        self.input_vector /= len(embeddings)
-        prev = self.input_vector
-        for i in range(self.depth):
-            self.actvn[i] = relu(prev, self.params.W[i], self.params.b[i])
-            return x * (x > 0)
-
-
-            prev = self.actvn[i]
-        self.pred = softmax(self.actvn[-1], self.params.W[-1], self.params.b[-1])
-        return argmax(self.pred)
-
-    def backward(self, word_ids, label):
-        target = zeros(self.n_labels)
-        target[label] = 1.0
-        D = self.pred - target
-
-        for i in range(self.depth, 0, -1):
-            self.gradient.b[i] += D
-            self.gradient.W[i] += outer(D, self.actvn[i-1])
-            D = d_relu(self.actvn[i-1]) * self.params.W[i].T.dot(D)
-
-        self.gradient.b[0] += D
-        self.gradient.W[0] += outer(D, self.input_vector)
-
-        grad = self.params.W[0].T.dot(D).reshape((self.n_embed,)) / len(word_ids)
-        for word_id in word_ids:
-            if word_id < self.n_vocab:
-                self.gradient.E[word_id] += grad
-                self.seen_words[word_id] = self.seen_words.get(word_id, 0) + 1
-
-    def update(self, rho, n):
-        # L2 Regularization
-        for i in range(self.depth):
-            self.gradient.W[i] += self.params.W[i] * rho
-            self.gradient.b[i] += self.params.b[i] * rho
-        # Do word embedding tuning
-        for word_id, freq in self.seen_words.items():
-            self.gradient.E[word_id] += (self.params.E[word_id] * freq) * rho
-
-        update = self.gradient.data / n
-        update = self.adagrad.rescale(update)
-        self.params.data -= update
-        self.gradient.data.fill(0)
-        self.seen_words = {}
-
-
-def get_words(doc, dropout_rate, n_vocab):
-    mask = random.rand(len(doc)) > dropout_rate
-    word_ids = []
-    embeddings = []
-    for word in doc:
-        if mask[word.i] and not word.is_punct:
-            embeddings.append(word.vector)
-            word_ids.append(word.orth)
-    # all examples must have at least one word
-    if not embeddings:
-        return [w.orth for w in doc], [w.vector for w in doc]
-    else:
-        return word_ids, embeddings
-
-
-def train(dataset, n_embed, n_hidden, n_labels, n_vocab, depth, dropout_rate, rho,
-          n_iter, save_model):
-    model = Network(depth, n_embed, n_hidden, n_labels, n_vocab)
-    best_acc = 0
-    for epoch in range(n_iter):
-        train_score = Scorer()
-        # create mini-batches
-        for batch in dataset.batches(dataset.train):
-            for doc, label in batch:
-                if len(doc) == 0:
-                    continue
-                word_ids, embeddings = get_words(doc, dropout_rate, n_vocab)
-                guess = model.forward(word_ids, embeddings)
-                model.backward(word_ids, label)
-                train_score += guess == label
-            model.update(rho, len(batch))
-        test_score = Scorer()
-        for doc, label in dataset.dev:
-            word_ids, embeddings = get_words(doc, 0.0, n_vocab)
-            guess = model.forward(word_ids, embeddings)
-            test_score += guess == label
-        if test_score.true >= best_acc:
-            best_acc = test_score.true
-            save_model(epoch, model.params.data)
-        print "%d\t%s\t%s" % (epoch, train_score, test_score)
-    return model
diff --git a/examples/sentiment/util.py b/examples/sentiment/util.py
deleted file mode 100644
index 6e3f71723..000000000
--- a/examples/sentiment/util.py
+++ /dev/null
@@ -1,14 +0,0 @@
-class Scorer(object):
-    def __init__(self):
-        self.true = 0
-        self.total = 0
-
-    def __iadd__(self, is_correct):
-        self.true += is_correct
-        self.total += 1
-        return self
-
-    def __str__(self):
-        return '%.3f' % (self.true / self.total)
-
-
diff --git a/examples/spacy_dynet_lstm.py b/examples/spacy_dynet_lstm.py
deleted file mode 100644
index 4ddc7d2a7..000000000
--- a/examples/spacy_dynet_lstm.py
+++ /dev/null
@@ -1,246 +0,0 @@
-from __future__ import print_function
-from __future__ import unicode_literals
-from __future__ import division
-
-import pathlib
-import plac
-import random
-from collections import Counter
-import numpy as np
-import os
-
-from collections import defaultdict
-from itertools import count
-
-if os.environ.get('DYNET_GPU') == '1':
-    import _gdynet as dynet
-    from _gdynet import cg
-else:
-    import dynet
-    from dynet import cg
-
-
-class Vocab:
-    def __init__(self, w2i=None):
-        if w2i is None: w2i = defaultdict(count(0).next)
-        self.w2i = dict(w2i)
-        self.i2w = {i:w for w,i in w2i.iteritems()}
-    @classmethod
-    def from_corpus(cls, corpus):
-        w2i = defaultdict(count(0).next)
-        for sent in corpus:
-            [w2i[word] for word in sent]
-        return Vocab(w2i)
-
-    def size(self):
-        return len(self.w2i.keys())
-
-
-def read_data(path):
-    with path.open() as file_:
-        sent = []
-        for line in file_:
-            line = line.strip().split()
-            if not line:
-                if sent:
-                    yield sent
-                sent = []
-            else:
-                pieces = line
-                w = pieces[1]
-                pos = pieces[3]
-                sent.append((w, pos))
-
-
-def get_vocab(train, test):
-    words = []
-    tags = []
-    wc = Counter()
-    for s in train:
-        for w, p in s:
-            words.append(w)
-            tags.append(p)
-            wc[w] += 1
-    words.append("_UNK_")
-    #words=[w if wc[w] > 1 else "_UNK_" for w in words]
-    tags.append("_START_")
-
-    for s in test:
-        for w, p in s:
-            words.append(w)
-    vw = Vocab.from_corpus([words])
-    vt = Vocab.from_corpus([tags])
-    return words, tags, wc, vw, vt
-
-
-class BiTagger(object):
-    def __init__(self, vw, vt, nwords, ntags):
-        self.vw = vw
-        self.vt = vt
-        self.nwords = nwords
-        self.ntags = ntags
-
-        self.UNK = self.vw.w2i["_UNK_"]
-
-        self._model = dynet.Model()
-        self._sgd = dynet.SimpleSGDTrainer(self._model)
-
-        self._E = self._model.add_lookup_parameters((self.nwords, 128))
-        self._p_t1 = self._model.add_lookup_parameters((self.ntags, 30))
-
-        self._pH = self._model.add_parameters((32, 50*2))
-        self._pO = self._model.add_parameters((self.ntags, 32))
-
-        self._fwd_lstm = dynet.LSTMBuilder(1, 128, 50, self._model)
-        self._bwd_lstm = dynet.LSTMBuilder(1, 128, 50, self._model)
-        self._words_batch = []
-        self._tags_batch = []
-        self._minibatch_size = 32
-
-    def __call__(self, words):
-        dynet.renew_cg()
-        word_ids = [self.vw.w2i.get(w, self.UNK) for w in words]
-        wembs = [self._E[w] for w in word_ids]
-
-        f_state = self._fwd_lstm.initial_state()
-        b_state = self._bwd_lstm.initial_state()
-
-        fw = [x.output() for x in f_state.add_inputs(wembs)]
-        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]
-
-        H = dynet.parameter(self._pH)
-        O = dynet.parameter(self._pO)
-
-        tags = []
-        for i, (f, b) in enumerate(zip(fw, reversed(bw))):
-            r_t = O * (dynet.tanh(H * dynet.concatenate([f, b])))
-            out = dynet.softmax(r_t)
-            tags.append(self.vt.i2w[np.argmax(out.npvalue())])
-        return tags
-
-    def predict_batch(self, words_batch):
-        dynet.renew_cg()
-        length = max(len(words) for words in words_batch)
-        word_ids = np.zeros((length, len(words_batch)), dtype='int32')
-        for j, words in enumerate(words_batch):
-            for i, word in enumerate(words):
-                word_ids[i, j] = self.vw.w2i.get(word, self.UNK)
-        wembs = [dynet.lookup_batch(self._E, word_ids[i]) for i in range(length)]
-
-        f_state = self._fwd_lstm.initial_state()
-        b_state = self._bwd_lstm.initial_state()
-
-        fw = [x.output() for x in f_state.add_inputs(wembs)]
-        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]
-
-        H = dynet.parameter(self._pH)
-        O = dynet.parameter(self._pO)
-
-        tags_batch = [[] for _ in range(len(words_batch))]
-        for i, (f, b) in enumerate(zip(fw, reversed(bw))):
-            r_t = O * (dynet.tanh(H * dynet.concatenate([f, b])))
-            out = dynet.softmax(r_t).npvalue()
-            for j in range(len(words_batch)):
-                tags_batch[j].append(self.vt.i2w[np.argmax(out.T[j])])
-        return tags_batch

-    def pipe(self, sentences):
-        batch = []
-        for words in sentences:
-            batch.append(words)
-            if len(batch) == self._minibatch_size:
-                tags_batch = self.predict_batch(batch)
-                for words, tags in zip(batch, tags_batch):
-                    yield tags
-                batch = []
-
-    def update(self, words, tags):
-        self._words_batch.append(words)
-        self._tags_batch.append(tags)
-        if len(self._words_batch) == self._minibatch_size:
-            loss = self.update_batch(self._words_batch, self._tags_batch)
-            self._words_batch = []
-            self._tags_batch = []
-        else:
-            loss = 0
-        return loss
-
-    def update_batch(self, words_batch, tags_batch):
-        dynet.renew_cg()
-        length = max(len(words) for words in words_batch)
-        word_ids = np.zeros((length, len(words_batch)), dtype='int32')
-        for j, words in enumerate(words_batch):
-            for i, word in enumerate(words):
-                word_ids[i, j] = self.vw.w2i.get(word, self.UNK)
-        tag_ids = np.zeros((length, len(words_batch)), dtype='int32')
-        for j, tags in enumerate(tags_batch):
-            for i, tag in enumerate(tags):
-                tag_ids[i, j] = self.vt.w2i.get(tag, self.UNK)
-        wembs = [dynet.lookup_batch(self._E, word_ids[i]) for i in range(length)]
-        wembs = [dynet.noise(we, 0.1) for we in wembs]
-
-        f_state = self._fwd_lstm.initial_state()
-        b_state = self._bwd_lstm.initial_state()
-
-        fw = [x.output() for x in f_state.add_inputs(wembs)]
-        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]
-
-        H = dynet.parameter(self._pH)
-        O = dynet.parameter(self._pO)
-
-        errs = []
-        for i, (f, b) in enumerate(zip(fw, reversed(bw))):
-            f_b = dynet.concatenate([f,b])
-            r_t = O * (dynet.tanh(H * f_b))
-            err = dynet.pickneglogsoftmax_batch(r_t, tag_ids[i])
-            errs.append(dynet.sum_batches(err))
-        sum_errs = dynet.esum(errs)
-        squared = -sum_errs # * sum_errs
-        losses = sum_errs.scalar_value()
-        sum_errs.backward()
-        self._sgd.update()
-        return losses
-
-
-def main(train_loc, dev_loc, model_dir):
-    train_loc = pathlib.Path(train_loc)
-    dev_loc = pathlib.Path(dev_loc)
-
-    train = list(read_data((train_loc)))
-    test = list(read_data(dev_loc))
-
-    words, tags, wc, vw, vt = get_vocab(train, test)
-
-    UNK = vw.w2i["_UNK_"]
-    nwords = vw.size()
-    ntags = vt.size()
-
-    tagger = BiTagger(vw, vt, nwords, ntags)
-
-    tagged = loss = 0
-
-    for ITER in xrange(1):
-        random.shuffle(train)
-        for i, s in enumerate(train,1):
-            if i % 5000 == 0:
-                tagger._sgd.status()
-                print(loss / tagged)
-                loss = 0
-                tagged = 0
-            if i % 10000 == 0:
-                good = bad = 0.0
-                word_sents = [[w for w, t in sent] for sent in test]
-                gold_sents = [[t for w, t in sent] for sent in test]
-                for words, tags, golds in zip(words, tagger.pipe(words), gold_sents):
-                    for go, gu in zip(golds, tags):
-                        if go == gu:
-                            good += 1
-                        else:
-                            bad += 1
-                print(good / (good+bad))
-            loss += tagger.update([w for w, t in s], [t for w, t in s])
-            tagged += len(s)
-
-
-if __name__ == '__main__':
-    plac.call(main)