From 409a18bd4251a219842b6a15eb30ce9039eda431 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Sun, 20 Nov 2016 03:35:23 +0100
Subject: [PATCH] Add paddle sentiment example

---
 examples/paddle/sentiment_bilstm/__main__.py | 31 +++++++++++++++++
 examples/paddle/sentiment_bilstm/config.py   | 36 ++++++++++++++++++++
 examples/paddle/sentiment_bilstm/networks.py | 19 +++++++++++
 3 files changed, 86 insertions(+)
 create mode 100644 examples/paddle/sentiment_bilstm/__main__.py
 create mode 100644 examples/paddle/sentiment_bilstm/config.py
 create mode 100644 examples/paddle/sentiment_bilstm/networks.py

diff --git a/examples/paddle/sentiment_bilstm/__main__.py b/examples/paddle/sentiment_bilstm/__main__.py
new file mode 100644
index 000000000..845443d99
--- /dev/null
+++ b/examples/paddle/sentiment_bilstm/__main__.py
@@ -0,0 +1,31 @@
+def write_parameter(outfile, feats):
+    """Write `feats` to `outfile` in Paddle's binary parameter format.
+
+    From https://github.com/baidu/Paddle/issues/490
+    outfile: Output file name (str); must match the name used in the config.
+    feats: Iterable of arrays of 4-byte floats; their bytes are concatenated.
+    """
+    import struct  # local import: struct is not imported anywhere in this file
+    version = 0
+    value_size = 4  # bytes per value, i.e. float type
+    ret = b"".join(feat.tobytes() for feat in feats)
+    # Header is (version, value_size, value count); the count must be an int.
+    size = len(ret) // value_size
+    with open(outfile, 'wb') as fo:
+        fo.write(struct.pack('iIQ', version, value_size, size))
+        fo.write(ret)
+
+
+# config=trainer_config.py
+# output=./model_output
+# paddle train --config=$config \
+#              --save_dir=$output \
+#              --job=train \
+#              --use_gpu=false \
+#              --trainer_count=4 \
+#              --num_passes=10 \
+#              --log_period=20 \
+#              --dot_period=20 \
+#              --show_parameter_stats_period=100 \
+#              --test_all_data_in_one_period=1 \
+#              2>&1 | tee 'train.log'
diff --git a/examples/paddle/sentiment_bilstm/config.py b/examples/paddle/sentiment_bilstm/config.py
new file mode 100644
index 000000000..cdee7cdf9
--- /dev/null
+++ b/examples/paddle/sentiment_bilstm/config.py
@@ -0,0 +1,36 @@
+from paddle.trainer.PyDataProvider2 import *
+
+
+def get_features(doc):
+    """Return int32 ids (rank+1) of the non-punct, non-space tokens with vectors."""
+    import numpy  # local import: numpy is not imported anywhere in this file
+    ids = [t.rank + 1 for t in doc if t.has_vector and not (t.is_punct or t.is_space)]
+    return numpy.asarray(ids, dtype='int32')
+
+
+def on_init(settings, lang_name, **kwargs):
+    """Provider init hook: load spaCy and declare the input slots on `settings`."""
+    import spacy  # local import: spacy is not imported anywhere in this file
+    print("Loading spaCy")
+    nlp = spacy.load('en', entity=False)
+    # NOTE(review): get_vectors is not defined in this file -- confirm its source.
+    vectors = get_vectors(nlp)
+    settings.input_types = [
+        # Text: a sequence of word ids for the sentence whose sentiment we predict.
+        integer_value(vectors.shape[0], seq_type=SequenceType.SEQUENCE),
+        # Label: positive/negative.
+        integer_value(2)
+    ]
+    settings.nlp = nlp
+    settings.vectors = vectors
+
+
+@provider(init_hook=on_init)
+def process(settings, data_dir):
+    """Yield (word ids, label) for each sentence of each text read from data_dir."""
+    # NOTE(review): read_data is not defined in this file -- confirm its source.
+    texts, labels = read_data(data_dir)
+    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)  # set in on_init
+    for doc, label in zip(docs, labels):
+        for sent in doc.sents:
+            yield get_features(sent), label  # hand one sample to paddle
diff --git a/examples/paddle/sentiment_bilstm/networks.py b/examples/paddle/sentiment_bilstm/networks.py
new file mode 100644
index 000000000..84e9732c0
--- /dev/null
+++ b/examples/paddle/sentiment_bilstm/networks.py
@@ -0,0 +1,19 @@
+from paddle.trainer_config_helpers import *
+
+
+def bidirectional_lstm_net(input_dim, class_dim=2, emb_dim=128, lstm_dim=128,
+                           is_predict=False):
+    """Declare a bi-LSTM classifier: embedding -> bi-LSTM -> dropout -> softmax.
+
+    Registers the softmax layer as the network output when is_predict is true,
+    otherwise the classification cost against the "label" data layer.
+    """
+    words = data_layer("word", input_dim)
+    embedded = embedding_layer(input=words, size=emb_dim)
+    encoded = bidirectional_lstm(input=embedded, size=lstm_dim)
+    dropped = dropout_layer(input=encoded, dropout_rate=0.5)
+    probs = fc_layer(input=dropped, size=class_dim, act=SoftmaxActivation())
+    if is_predict:
+        outputs(probs)
+        return
+    outputs(classification_cost(input=probs, label=data_layer("label", 1)))