Merge branch 'master' of ssh://github.com/explosion/spaCy

This commit is contained in:
Matthew Honnibal 2016-10-21 00:00:15 +02:00
commit e16e78a737
3 changed files with 20 additions and 9 deletions

View File

@@ -7,6 +7,7 @@ import cytoolz
 import numpy
 from keras.models import Sequential, model_from_json
 from keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
+from keras.layers import TimeDistributed
 from keras.optimizers import Adam
 import cPickle as pickle
@ -48,10 +49,16 @@ class SentimentAnalyser(object):
def get_features(docs, max_length): def get_features(docs, max_length):
Xs = numpy.zeros((len(list(docs)), max_length), dtype='int32') docs = list(docs)
Xs = numpy.zeros((len(docs), max_length), dtype='int32')
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
for j, token in enumerate(doc[:max_length]): j = 0
Xs[i, j] = token.rank if token.has_vector else 0 for token in doc:
if token.has_vector and not token.is_punct and not token.is_space:
Xs[i, j] = token.rank + 1
j += 1
if j >= max_length:
break
return Xs return Xs
@@ -75,9 +82,12 @@ def compile_lstm(embeddings, shape, settings):
             embeddings.shape[1],
             input_length=shape['max_length'],
             trainable=False,
-            weights=[embeddings]
+            weights=[embeddings],
+            mask_zero=True
         )
     )
+    model.add(TimeDistributed(Dense(shape['nr_hidden'] * 2)))
+    model.add(Dropout(settings['dropout']))
     model.add(Bidirectional(LSTM(shape['nr_hidden'])))
     model.add(Dropout(settings['dropout']))
     model.add(Dense(shape['nr_class'], activation='sigmoid'))
@@ -87,11 +97,11 @@ def compile_lstm(embeddings, shape, settings):
 def get_embeddings(vocab):
-    max_rank = max(lex.rank for lex in vocab if lex.has_vector)
+    max_rank = max(lex.rank+1 for lex in vocab if lex.has_vector)
     vectors = numpy.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
     for lex in vocab:
         if lex.has_vector:
-            vectors[lex.rank] = lex.vector
+            vectors[lex.rank + 1] = lex.vector
     return vectors

View File

@@ -202,7 +202,8 @@ def setup_package():
             'six',
             'cloudpickle',
             'pathlib',
-            'sputnik>=0.9.2,<0.10.0'],
+            'sputnik>=0.9.2,<0.10.0',
+            'ujson>=1.35'],
         classifiers=[
             'Development Status :: 5 - Production/Stable',
             'Environment :: Console',

View File

@@ -133,8 +133,8 @@ cdef class Span:
     property vector_norm:
         def __get__(self):
-            if 'vector_norm' in self.doc.getters_for_spans:
-                return self.doc.getters_for_spans['vector'](self)
+            if 'vector_norm' in self.doc.user_span_hooks:
+                return self.doc.user_span_hooks['vector'](self)
             cdef float value
             if self._vector_norm is None:
                 self._vector_norm = 1e-20