From 409a18bd4251a219842b6a15eb30ce9039eda431 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sun, 20 Nov 2016 03:35:23 +0100
Subject: [PATCH] Add paddle sentiment example

---
 examples/paddle/sentiment_bilstm/__main__.py | 38 ++++++++++++++++
 examples/paddle/sentiment_bilstm/config.py   | 51 ++++++++++++++++++++++
 examples/paddle/sentiment_bilstm/networks.py | 24 ++++++++++
 3 files changed, 113 insertions(+)
 create mode 100644 examples/paddle/sentiment_bilstm/__main__.py
 create mode 100644 examples/paddle/sentiment_bilstm/config.py
 create mode 100644 examples/paddle/sentiment_bilstm/networks.py

diff --git a/examples/paddle/sentiment_bilstm/__main__.py b/examples/paddle/sentiment_bilstm/__main__.py
new file mode 100644
index 000000000..845443d99
--- /dev/null
+++ b/examples/paddle/sentiment_bilstm/__main__.py
@@ -0,0 +1,38 @@
+import struct
+
+
+def write_parameter(outfile, feats):
+    """Write parameters to a file in the binary layout from the Paddle issue.
+
+    From https://github.com/baidu/Paddle/issues/490
+
+    outfile: Output file name (string). **Note**: it should be the same
+        name as the one used in the Paddle config.
+    feats: Parameters of float type; each item must provide ``tobytes()``
+        (e.g. a numpy array).
+    """
+    version = 0
+    value_size = 4  # bytes per value, i.e. float type
+    data = b"".join(feat.tobytes() for feat in feats)
+    # Floor division keeps the count an integer, which the 'Q' field needs.
+    size = len(data) // value_size
+    with open(outfile, 'wb') as file_:
+        # Header: version (int), value size (unsigned int), value count
+        # (unsigned long long), followed by the raw parameter bytes.
+        file_.write(struct.pack('iIQ', version, value_size, size))
+        file_.write(data)
+
+
+# config=trainer_config.py
+# output=./model_output
+# paddle train --config=$config \
+#              --save_dir=$output \
+#              --job=train \
+#              --use_gpu=false \
+#              --trainer_count=4 \
+#              --num_passes=10 \
+#              --log_period=20 \
+#              --dot_period=20 \
+#              --show_parameter_stats_period=100 \
+#              --test_all_data_in_one_period=1 \
+#              2>&1 | tee 'train.log'
diff --git a/examples/paddle/sentiment_bilstm/config.py b/examples/paddle/sentiment_bilstm/config.py
new file mode 100644
index 000000000..cdee7cdf9
--- /dev/null
+++ b/examples/paddle/sentiment_bilstm/config.py
@@ -0,0 +1,51 @@
+import numpy
+import spacy
+from paddle.trainer.PyDataProvider2 import *
+
+
+# NOTE(review): get_vectors() and read_data() are called below but are not
+# defined or imported in this file; they must be provided before this config
+# can run.
+
+
+def get_features(doc):
+    """Return int32 word ids (rank + 1) for the usable tokens in doc.
+
+    Tokens without a vector, punctuation and whitespace are skipped.
+    """
+    return numpy.asarray(
+        [t.rank+1 for t in doc
+         if t.has_vector and not t.is_punct and not t.is_space],
+        dtype='int32')
+
+
+def on_init(settings, lang_name, **kwargs):
+    """Load spaCy and declare the input types on the provider settings."""
+    print("Loading spaCy")
+    nlp = spacy.load('en', entity=False)
+    vectors = get_vectors(nlp)
+    settings.input_types = [
+        # The text is a sequence of integer values, and each value is a word
+        # id. The whole sequence is the sentence whose sentiment we want to
+        # predict.
+        integer_value(vectors.shape[0],
+                      seq_type=SequenceType.SEQUENCE),  # text input
+
+        # label positive/negative
+        integer_value(2)
+    ]
+    settings.nlp = nlp
+    settings.vectors = vectors
+
+
+@provider(init_hook=on_init)
+def process(settings, data_dir):
+    """Yield (word ids, label) for every sentence of every document."""
+    texts, labels = read_data(data_dir)
+    # Use the pipeline loaded by on_init; there is no module-level nlp.
+    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)
+    for doc, label in zip(docs, labels):
+        for sent in doc.sents:
+            ids = get_features(sent)
+            # give data to paddle.
+            yield ids, label
diff --git a/examples/paddle/sentiment_bilstm/networks.py b/examples/paddle/sentiment_bilstm/networks.py
new file mode 100644
index 000000000..84e9732c0
--- /dev/null
+++ b/examples/paddle/sentiment_bilstm/networks.py
@@ -0,0 +1,24 @@
+from paddle.trainer_config_helpers import *
+
+
+def bidirectional_lstm_net(input_dim,
+                           class_dim=2,
+                           emb_dim=128,
+                           lstm_dim=128,
+                           is_predict=False):
+    """Declare an embedding -> bidirectional LSTM -> dropout -> softmax net.
+
+    With is_predict=False the network output is the classification cost
+    against the "label" data layer; otherwise it is the softmax layer itself.
+    """
+    data = data_layer("word", input_dim)
+    emb = embedding_layer(input=data, size=emb_dim)
+    bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
+    dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
+    output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation())
+
+    if not is_predict:
+        lbl = data_layer("label", 1)
+        outputs(classification_cost(input=output, label=lbl))
+    else:
+        outputs(output)