Update examples

This commit is contained in:
ines 2017-11-07 01:22:30 +01:00
parent 1b1c9105b4
commit 173b1551af
16 changed files with 42 additions and 39 deletions

View File

@ -1,18 +1,24 @@
import plac
import collections
import random
"""
This example shows how to use an LSTM sentiment classification model trained using Keras in spaCy. spaCy splits the document into sentences, and each sentence is classified using the LSTM. The scores for the sentences are then aggregated to give the document score. This kind of hierarchical model is quite difficult in "pure" Keras or TensorFlow, but it's very effective. The Keras example on this dataset performs quite poorly, because it cuts off the documents so that they're a fixed size. This hurts review accuracy a lot, because people often summarise their rating in the final sentence.
Prerequisites:
spacy download en_vectors_web_lg
pip install keras==2.0.9
Compatible with: spaCy v2.0.0+
"""
import plac
import random
import pathlib
import cytoolz
import numpy
from keras.models import Sequential, model_from_json
from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from keras.layers import LSTM, Dense, Embedding, Bidirectional
from keras.layers import TimeDistributed
from keras.optimizers import Adam
from spacy.compat import pickle
import thinc.extra.datasets
from spacy.compat import pickle
import spacy
@ -84,8 +90,8 @@ def get_features(docs, max_length):
def train(train_texts, train_labels, dev_texts, dev_labels,
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
by_sentence=True):
lstm_shape, lstm_settings, lstm_optimizer, batch_size=100,
nb_epoch=5, by_sentence=True):
print("Loading spaCy")
nlp = spacy.load('en_vectors_web_lg')
nlp.add_pipe(nlp.create_pipe('sentencizer'))
@ -97,7 +103,7 @@ def train(train_texts, train_labels, dev_texts, dev_labels,
if by_sentence:
train_docs, train_labels = get_labelled_sentences(train_docs, train_labels)
dev_docs, dev_labels = get_labelled_sentences(dev_docs, dev_labels)
train_X = get_features(train_docs, lstm_shape['max_length'])
dev_X = get_features(dev_docs, lstm_shape['max_length'])
model.fit(train_X, train_labels, validation_data=(dev_X, dev_labels),
@ -138,12 +144,12 @@ def evaluate(model_dir, texts, labels, max_length=100):
'''
return [nlp.tagger, nlp.parser, SentimentAnalyser.load(model_dir, nlp,
max_length=max_length)]
nlp = spacy.load('en')
nlp.pipeline = create_pipeline(nlp)
correct = 0
i = 0
i = 0
for doc in nlp.pipe(texts, batch_size=1000, n_threads=4):
correct += bool(doc.sentiment >= 0.5) == bool(labels[i])
i += 1

View File

@ -6,7 +6,7 @@ money and currency values (entities labelled as MONEY) and then check the
dependency tree to find the noun phrase they are referring to, for example:
$9.4 million --> Net income.
Compatible with: spaCy 2.0.0a18+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -16,7 +16,7 @@ show you how computers understand [language]
I'm assuming that we can use the token.head to build these groups."
Compatible with: spaCy 2.0.0a18+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -34,7 +34,7 @@ formatted in jsonl as a sequence of entries like this:
{"text":"Appalachia"}
{"text":"Argentina"}
Compatible with: spaCy 2.0.0a17+
Compatible with: spaCy v2.0.0+
"""
from __future__ import print_function, unicode_literals, division

View File

@ -7,7 +7,7 @@ they're called on is passed in as the first argument.
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
Compatible with: spaCy 2.0.0a17+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -8,7 +8,7 @@ coordinates. Can be extended with more details from the API.
* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
Compatible with: spaCy 2.0.0a17+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -8,7 +8,7 @@ respectively.
* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
Compatible with: spaCy 2.0.0a17+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -6,7 +6,7 @@ each "sentence" on a newline, and spaces between tokens. Data is loaded from
the IMDB movie reviews dataset and will be loaded automatically via Thinc's
built-in dataset loader.
Compatible with: spaCy 2.0.0a18+
Compatible with: spaCy v2.0.0+
"""
from __future__ import print_function, unicode_literals
from toolz import partition_all

View File

@ -15,7 +15,7 @@ following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
('hotel', 'PLACE', 'show') --> show PLACE hotel
('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
Compatible with: spaCy 2.0.0a20+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -7,7 +7,7 @@ For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training
* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy 2.0.0a20+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -23,7 +23,7 @@ For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training
* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities
Compatible with: spaCy 2.0.0a20+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -5,7 +5,7 @@ model or a blank model. For more details, see the documentation:
* Training: https://alpha.spacy.io/usage/training
* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
Compatible with: spaCy 2.0.0a20+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -8,7 +8,7 @@ the documentation:
* Training: https://alpha.spacy.io/usage/training
* POS Tagging: https://alpha.spacy.io/usage/linguistic-features#pos-tagging
Compatible with: spaCy 2.0.0a20+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

View File

@ -8,7 +8,7 @@ see the documentation:
* Training: https://alpha.spacy.io/usage/training
* Text classification: https://alpha.spacy.io/usage/text-classification
Compatible with: spaCy 2.0.0a20+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function
import plac

View File

@ -2,7 +2,7 @@
# coding: utf8
"""Load vectors for a language trained using fastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Compatible with: spaCy v2.0.0a17+
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals
import plac

View File

@ -165,18 +165,15 @@ include ../_includes/_mixins
+h(3, "keras") Text classification with Keras
p
| In this example, we're using spaCy to pre-process text for use with
| a #[+a("https://keras.io") Keras] text classification model.
| This example shows how to use a #[+a("https://keras.io") Keras]
| LSTM sentiment classification model in spaCy. spaCy splits
| the document into sentences, and each sentence is classified using
| the LSTM. The scores for the sentences are then aggregated to give
| the document score. This kind of hierarchical model is quite
| difficult in "pure" Keras or TensorFlow, but it's very effective.
| The Keras example on this dataset performs quite poorly, because it
| cuts off the documents so that they're a fixed size. This hurts
| review accuracy a lot, because people often summarise their rating
| in the final sentence.
+github("spacy", "examples/deep_learning_keras.py")
+h(3, "keras-parikh-entailment") A decomposable attention model for Natural Language Inference
p
| This example contains an implementation of the entailment prediction
| model described by #[+a("https://arxiv.org/pdf/1606.01933.pdf") Parikh et al. (2016)].
| The model is notable for its competitive performance with very few
| parameters, and was implemented using #[+a("https://keras.io") Keras]
| and spaCy.
+github("spacy", "examples/keras_parikh_entailment/__main__.py", false, "examples/keras_parikh_entailment")