mirror of https://github.com/explosion/spaCy.git
Update examples
This commit is contained in:
parent
1b1c9105b4
commit
173b1551af
@@ -1,18 +1,24 @@
-import plac
-import collections
-import random
+"""
+This example shows how to use an LSTM sentiment classification model trained using Keras in spaCy. spaCy splits the document into sentences, and each sentence is classified using the LSTM. The scores for the sentences are then aggregated to give the document score. This kind of hierarchical model is quite difficult in "pure" Keras or Tensorflow, but it's very effective. The Keras example on this dataset performs quite poorly, because it cuts off the documents so that they're a fixed size. This hurts review accuracy a lot, because people often summarise their rating in the final sentence
+
+Prerequisites:
+spacy download en_vectors_web_lg
+pip install keras==2.0.9
+
+Compatible with: spaCy v2.0.0+
+"""
+
+import plac
+import random
 import pathlib
 import cytoolz
 import numpy
 from keras.models import Sequential, model_from_json
-from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
+from keras.layers import LSTM, Dense, Embedding, Bidirectional
 from keras.layers import TimeDistributed
 from keras.optimizers import Adam
-from spacy.compat import pickle

 import thinc.extra.datasets

+from spacy.compat import pickle
 import spacy

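The new docstring above describes a hierarchical scheme: classify each sentence with the LSTM, then pool the sentence scores into a single document score. A minimal sketch of that pooling step, assuming a trained Keras model (`sentence_model` is a stand-in name) and the example's own `get_features` helper:

def predict_document(doc, sentence_model, max_length=100):
    # spaCy's sentencizer provides the sentence boundaries
    sentences = list(doc.sents)
    # reuse the same featurizer the example trains with (assumed to accept Spans)
    X = get_features(sentences, max_length)
    # one sentiment score per sentence from the Keras LSTM
    sent_scores = sentence_model.predict(X)
    # aggregate: the mean sentence score stands in for the document score
    return float(sent_scores.mean())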
@@ -84,8 +90,8 @@ def get_features(docs, max_length):


 def train(train_texts, train_labels, dev_texts, dev_labels,
-          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
-          by_sentence=True):
+          lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
+          nb_epoch=5, by_sentence=True):
     print("Loading spaCy")
     nlp = spacy.load('en_vectors_web_lg')
     nlp.add_pipe(nlp.create_pipe('sentencizer'))
@@ -97,7 +103,7 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
     if by_sentence:
         train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
         dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)

     train_X = get_features(train_docs, lstm_shape['max_length'])
     dev_X = get_features(dev_docs, lstm_shape['max_length'])
     model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
@@ -138,12 +144,12 @@ def evaluate(model_dir, texts, labels, max_length=100):
         '''
         return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
                                                                max_length=max_length)]

     nlp = spacy.load('en')
     nlp.pipeline = create_pipeline(nlp)

     correct = 0
     i = 0
     for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
         correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
         i += 1

@@ -6,7 +6,7 @@ money and currency values (entities labelled as MONEY) and then check the
 dependency tree to find the noun phrase they are referring to – for example:
 $9.4 million --> Net income.

-Compatible with: spaCy 2.0.0a18+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

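This hunk belongs to the money-entity example's docstring. A rough sketch of the dependency-tree walk it describes, assuming an English model with NER and a parser (the model name and sentence here are illustrative, not the example's exact code):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("Net income was $9.4 million compared to the prior year of $2.7 million.")
for ent in doc.ents:
    if ent.label_ != 'MONEY':
        continue
    # attribute or direct object: look left of the governing verb for a subject
    if ent.root.dep_ in ('attr', 'dobj'):
        subjects = [w for w in ent.root.head.lefts if w.dep_ == 'nsubj']
        if subjects:
            print(ent.text, '-->', subjects[0].text)
    # prepositional phrase: the noun is the head of the preposition
    elif ent.root.dep_ == 'pobj' and ent.root.head.dep_ == 'prep':
        print(ent.text, '-->', ent.root.head.head.text)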
@@ -16,7 +16,7 @@ show you how computers understand [language]

 I'm assuming that we can use the token.head to build these groups."

-Compatible with: spaCy 2.0.0a18+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

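A short sketch of the token.head idea quoted above: every token points at its syntactic parent, so grouping tokens by their head recovers the phrase groups. Model and sentence are illustrative:

from collections import defaultdict
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("displaCy uses CSS and JavaScript to show you how computers understand language.")
groups = defaultdict(list)
for token in doc:
    # token.head is the parent in the dependency tree
    groups[token.head].append(token)
for head, deps in groups.items():
    print(head.text, '<--', [t.text for t in deps])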
@@ -34,7 +34,7 @@ formatted in jsonl as a sequence of entries like this:
 {"text":"Appalachia"}
 {"text":"Argentina"}

-Compatible with: spaCy 2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import print_function, unicode_literals, division

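The jsonl entries shown above feed a PhraseMatcher. A hedged sketch of reading such a file into match patterns (the file name patterns.jsonl and the match key are illustrative):

import json
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank('en')
matcher = PhraseMatcher(nlp.vocab)
with open('patterns.jsonl') as file_:
    # one JSON object per line, each with a "text" key as shown above
    patterns = [nlp.make_doc(json.loads(line)['text']) for line in file_]
# spaCy v2 signature: add(key, on_match callback, *pattern docs)
matcher.add('GAZETTEER', None, *patterns)
doc = nlp.make_doc("Argentina borders Chile.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)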
@@ -7,7 +7,7 @@ they're called on is passed in as the first argument.

 * Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components

-Compatible with: spaCy 2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

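The docstring above describes spaCy v2's component API: a pipeline component is any callable that takes the Doc as its first argument and returns it. A minimal sketch (component and model names are illustrative):

import spacy

def my_component(doc):
    # the Doc being processed arrives as the first argument
    print("Processing {} tokens".format(len(doc)))
    # return it so the next pipeline component receives it
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(my_component, first=True)
doc = nlp("This text passes through the custom component first.")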
@@ -8,7 +8,7 @@ coordinates. Can be extended with more details from the API.
 * REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
 * Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components

-Compatible with: spaCy 2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

@@ -8,7 +8,7 @@ respectively.

 * Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components

-Compatible with: spaCy 2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

@@ -6,7 +6,7 @@ each "sentence" on a newline, and spaces between tokens. Data is loaded from
 the IMDB movie reviews dataset and will be loaded automatically via Thinc's
 built-in dataset loader.

-Compatible with: spaCy 2.0.0a18+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import print_function, unicode_literals
 from toolz import partition_all

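A sketch of the export format this docstring describes: one sentence per line, tokens joined by single spaces. The IMDB data comes from Thinc's loader as noted; the model name is an assumption:

import spacy
import thinc.extra.datasets

nlp = spacy.load('en_core_web_sm')  # needs a parser (or sentencizer) for doc.sents
train_data, _ = thinc.extra.datasets.imdb()
texts = [text for text, label in train_data[:10]]
for doc in nlp.pipe(texts):
    for sent in doc.sents:
        # whitespace-tokenized: newline per sentence, space between tokens
        print(' '.join(token.text for token in sent))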
@@ -15,7 +15,7 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
 ('hotel', 'PLACE', 'show') --> show PLACE hotel
 ('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

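Once the parser in this example is trained with those custom labels, the relations can be read straight off token.dep_ and token.head. A sketch, assuming `nlp` is the model this example trains:

def print_relations(doc):
    for token in doc:
        # with the custom parser, dep_ holds the relation label
        if token.dep_ in ('PLACE', 'QUALITY', 'ATTRIBUTE', 'TIME', 'LOCATION'):
            print(token.head.text, token.dep_, token.text)

# should produce lines like: show PLACE hotel / hotel LOCATION berlin
print_relations(nlp("show me the cheapest hotel in berlin"))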
@@ -7,7 +7,7 @@ For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

@@ -23,7 +23,7 @@ For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * NER: https://alpha.spacy.io/usage/linguistic-features#named-entities

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

@@ -5,7 +5,7 @@ model or a blank model. For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

@@ -8,7 +8,7 @@ the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function

@@ -8,7 +8,7 @@ see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Text classification: https://alpha.spacy.io/usage/text-classification

-Compatible with: spaCy 2.0.0a20+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals, print_function
 import plac
@@ -2,7 +2,7 @@
 # coding: utf8
 """Load vectors for a language trained using fastText
 https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
-Compatible with: spaCy v2.0.0a17+
+Compatible with: spaCy v2.0.0+
 """
 from __future__ import unicode_literals
 import plac
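A hedged sketch of loading a fastText .vec file into a spaCy vocab with the v2.0 set_vector API. The file name is illustrative; .vec files start with a "<rows> <dims>" header line:

import numpy
import spacy

nlp = spacy.blank('en')
with open('wiki.en.vec', encoding='utf8') as file_:
    nr_row, nr_dim = file_.readline().split()
    for line in file_:
        # each line is the word followed by nr_dim float values
        pieces = line.rstrip().rsplit(' ', int(nr_dim))
        word = pieces[0]
        vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
        nlp.vocab.set_vector(word, vector)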
@@ -165,18 +165,15 @@ include ../_includes/_mixins
 +h(3, "keras") Text classification with Keras

 p
-    | In this example, we're using spaCy to pre-process text for use with
-    | a #[+a("https://keras.io") Keras] text classification model.
+    | This example shows how to use a #[+a("https://keras.io") Keras]
+    | LSTM sentiment classification model in spaCy. spaCy splits
+    | the document into sentences, and each sentence is classified using
+    | the LSTM. The scores for the sentences are then aggregated to give
+    | the document score. This kind of hierarchical model is quite
+    | difficult in "pure" Keras or Tensorflow, but it's very effective.
+    | The Keras example on this dataset performs quite poorly, because it
+    | cuts off the documents so that they're a fixed size. This hurts
+    | review accuracy a lot, because people often summarise their rating
+    | in the final sentence.

 +github("spacy", "examples/deep_learning_keras.py")
-
-+h(3, "keras-parikh-entailment") A decomposable attention model for Natural Language Inference
-
-p
-    | This example contains an implementation of the entailment prediction
-    | model described by #[+a("https://arxiv.org/pdf/1606.01933.pdf") Parikh et al. (2016)].
-    | The model is notable for its competitive performance with very few
-    | parameters, and was implemented using #[+a("https://keras.io") Keras]
-    | and spaCy.
-
-+github("spacy", "examples/keras_parikh_entailment/__main__.py", false, "examples/keras_parikh_entailment")