diff --git a/spacy/_bu_nn.pyx b/spacy/_bu_nn.pyx
deleted file mode 100644
index ae875b235..000000000
--- a/spacy/_bu_nn.pyx
+++ /dev/null
@@ -1,490 +0,0 @@
-"""Feed-forward neural network, using Thenao."""
-
-import os
-import sys
-import time
-
-import numpy
-
-import theano
-import theano.tensor as T
-import gzip
-import cPickle
-
-
-def load_data(dataset):
-    ''' Loads the dataset
-
-    :type dataset: string
-    :param dataset: the path to the dataset (here MNIST)
-    '''
-
-    #############
-    # LOAD DATA #
-    #############
-
-    # Download the MNIST dataset if it is not present
-    data_dir, data_file = os.path.split(dataset)
-    if data_dir == "" and not os.path.isfile(dataset):
-        # Check if dataset is in the data directory.
-        new_path = os.path.join(
-            os.path.split(__file__)[0],
-            "..",
-            "data",
-            dataset
-        )
-        if os.path.isfile(new_path) or data_file == 'mnist.pkl.gz':
-            dataset = new_path
-
-    if (not os.path.isfile(dataset)) and data_file == 'mnist.pkl.gz':
-        import urllib
-        origin = (
-            'http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz'
-        )
-        print 'Downloading data from %s' % origin
-        urllib.urlretrieve(origin, dataset)
-
-    print '... loading data'
-
-    # Load the dataset
-    f = gzip.open(dataset, 'rb')
-    train_set, valid_set, test_set = cPickle.load(f)
-    f.close()
-    #train_set, valid_set, test_set format: tuple(input, target)
-    #input is an numpy.ndarray of 2 dimensions (a matrix),
-    #each row corresponding to an example. target is a
-    #numpy.ndarray of 1 dimension (vector)) that have the same length as
-    #the number of rows in the input. It should give the target
-    #target to the example with the same index in the input.
-
-    def shared_dataset(data_xy, borrow=True):
-        """ Function that loads the dataset into shared variables
-
-        The reason we store our dataset in shared variables is to allow
-        Theano to copy it into the GPU memory (when code is run on GPU).
-        Since copying data into the GPU is slow, copying a minibatch everytime
-        is needed (the default behaviour if the data is not in a shared
-        variable) would lead to a large decrease in performance.
-        """
-        data_x, data_y = data_xy
-        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX),
-                                 borrow=borrow)
-        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX),
-                                 borrow=borrow)
-        # When storing data on the GPU it has to be stored as floats
-        # therefore we will store the labels as ``floatX`` as well
-        # (``shared_y`` does exactly that). But during our computations
-        # we need them as ints (we use labels as index, and if they are
-        # floats it doesn't make sense) therefore instead of returning
-        # ``shared_y`` we will have to cast it to int. This little hack
-        # lets ous get around this issue
-        return shared_x, T.cast(shared_y, 'int32')
-
-    test_set_x, test_set_y = shared_dataset(test_set)
-    valid_set_x, valid_set_y = shared_dataset(valid_set)
-    train_set_x, train_set_y = shared_dataset(train_set)
-
-    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
-            (test_set_x, test_set_y)]
-    return rval
-
-
-class LogisticRegression(object):
-    """Multi-class Logistic Regression Class
-
-    The logistic regression is fully described by a weight matrix :math:`W`
-    and bias vector :math:`b`. Classification is done by projecting data
-    points onto a set of hyperplanes, the distance to which is used to
-    determine a class membership probability.
-    """
-
-    def __init__(self, input, n_in, n_out):
-        """ Initialize the parameters of the logistic regression
-
-        :type input: theano.tensor.TensorType
-        :param input: symbolic variable that describes the input of the
-                      architecture (one minibatch)
-
-        :type n_in: int
-        :param n_in: number of input units, the dimension of the space in
-                     which the datapoints lie
-
-        :type n_out: int
-        :param n_out: number of output units, the dimension of the space in
-                      which the labels lie
-
-        """
-        # start-snippet-1
-        # initialize with 0 the weights W as a matrix of shape (n_in, n_out)
-        self.W = theano.shared(
-            value=numpy.zeros((n_in, n_out),
-                dtype=theano.config.floatX
-            ),
-            name='W',
-            borrow=True
-        )
-        # initialize the baises b as a vector of n_out 0s
-        self.b = theano.shared(
-            value=numpy.zeros(
-                (n_out,),
-                dtype=theano.config.floatX
-            ),
-            name='b',
-            borrow=True
-        )
-
-        # symbolic expression for computing the matrix of class-membership
-        # probabilities
-        # Where:
-        # W is a matrix where column-k represent the separation hyper plain for
-        # class-k
-        # x is a matrix where row-j represents input training sample-j
-        # b is a vector where element-k represent the free parameter of hyper
-        # plain-k
-        self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b)
-
-        # symbolic description of how to compute prediction as class whose
-        # probability is maximal
-        self.y_pred = T.argmax(self.p_y_given_x, axis=1)
-        # end-snippet-1
-
-        # parameters of the model
-        self.params = [self.W, self.b]
-
-    def neg_ll(self, y):
-        """Return the mean of the negative log-likelihood of the prediction
-        of this model under a given target distribution.
-
-        .. math::
-
-            \frac{1}{|\mathcal{D}|} \mathcal{L} (\theta=\{W,b\}, \mathcal{D}) =
-            \frac{1}{|\mathcal{D}|} \sum_{i=0}^{|\mathcal{D}|}
-                \log(P(Y=y^{(i)}|x^{(i)}, W,b)) \\
-            \ell (\theta=\{W,b\}, \mathcal{D})
-
-        :type y: theano.tensor.TensorType
-        :param y: corresponds to a vector that gives for each example the
-                  correct label
-
-        Note: we use the mean instead of the sum so that
-              the learning rate is less dependent on the batch size
-        """
-        # start-snippet-2
-        # y.shape[0] is (symbolically) the number of rows in y, i.e.,
-        # number of examples (call it n) in the minibatch
-        # T.arange(y.shape[0]) is a symbolic vector which will contain
-        # [0,1,2,... n-1] T.log(self.p_y_given_x) is a matrix of
-        # Log-Probabilities (call it LP) with one row per example and
-        # one column per class LP[T.arange(y.shape[0]),y] is a vector
-        # v containing [LP[0,y[0]], LP[1,y[1]], LP[2,y[2]], ...,
-        # LP[n-1,y[n-1]]] and T.mean(LP[T.arange(y.shape[0]),y]) is
-        # the mean (across minibatch examples) of the elements in v,
-        # i.e., the mean log-likelihood across the minibatch.
-        return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y])
-        # end-snippet-2
-
-    def errors(self, y):
-        """Return a float representing the number of errors in the minibatch
-        over the total number of examples of the minibatch ; zero one
-        loss over the size of the minibatch
-
-        :type y: theano.tensor.TensorType
-        :param y: corresponds to a vector that gives for each example the
-                  correct label
-        """
-
-        # check if y has same dimension of y_pred
-        if y.ndim != self.y_pred.ndim:
-            raise TypeError(
-                'y should have the same shape as self.y_pred',
-                ('y', y.type, 'y_pred', self.y_pred.type)
-            )
-        # check if y is of the correct datatype
-        if y.dtype.startswith('int'):
-            # the T.neq operator returns a vector of 0s and 1s, where 1
-            # represents a mistake in prediction
-            return T.mean(T.neq(self.y_pred, y))
-        else:
-            raise NotImplementedError()
-
-
-# start-snippet-1
-class HiddenLayer(object):
-    def __init__(self, rng, input, n_in, n_out, W=None, b=None,
-                 activation=T.tanh):
-        """
-        Typical hidden layer of a MLP: units are fully-connected and have
-        sigmoidal activation function. Weight matrix W is of shape (n_in,n_out)
-        and the bias vector b is of shape (n_out,).
-
-        NOTE : The nonlinearity used here is tanh
-
-        Hidden unit activation is given by: tanh(dot(input,W) + b)
-
-        :type rng: numpy.random.RandomState
-        :param rng: a random number generator used to initialize weights
-
-        :type input: theano.tensor.dmatrix
-        :param input: a symbolic tensor of shape (n_examples, n_in)
-
-        :type n_in: int
-        :param n_in: dimensionality of input
-
-        :type n_out: int
-        :param n_out: number of hidden units
-
-        :type activation: theano.Op or function
-        :param activation: Non linearity to be applied in the hidden
-                           layer
-        """
-        self.input = input
-        # end-snippet-1
-
-        # `W` is initialized with `W_values` which is uniformely sampled
-        # from sqrt(-6./(n_in+n_hidden)) and sqrt(6./(n_in+n_hidden))
-        # for tanh activation function
-        # the output of uniform if converted using asarray to dtype
-        # theano.config.floatX so that the code is runable on GPU
-        # Note : optimal initialization of weights is dependent on the
-        #        activation function used (among other things).
-        #        For example, results presented in [Xavier10] suggest that you
-        #        should use 4 times larger initial weights for sigmoid
-        #        compared to tanh
-        #        We have no info for other function, so we use the same as
-        #        tanh.
-        if W is None:
-            W_values = numpy.asarray(
-                rng.uniform(
-                    low=-numpy.sqrt(6. / (n_in + n_out)),
-                    high=numpy.sqrt(6. / (n_in + n_out)),
-                    size=(n_in, n_out)
-                ),
-                dtype=theano.config.floatX
-            )
-            if activation == theano.tensor.nnet.sigmoid:
-                W_values *= 4
-
-            W = theano.shared(value=W_values, name='W', borrow=True)
-
-        if b is None:
-            b_values = numpy.zeros((n_out,), dtype=theano.config.floatX)
-            b = theano.shared(value=b_values, name='b', borrow=True)
-
-        self.W = W
-        self.b = b
-
-        lin_output = T.dot(input, self.W) + self.b
-        self.output = (
-            lin_output if activation is None
-            else activation(lin_output)
-        )
-        # parameters of the model
-        self.params = [self.W, self.b]
-
-
-# start-snippet-2
-class MLP(object):
-    """Multi-Layer Perceptron Class
-
-    A multilayer perceptron is a feedforward artificial neural network model
-    that has one layer or more of hidden units and nonlinear activations.
-    Intermediate layers usually have as activation function tanh or the
-    sigmoid function (defined here by a ``HiddenLayer`` class) while the
-    top layer is a softmax layer (defined here by a ``LogisticRegression``
-    class).
-    """
-
-    def __init__(self, rng, input, n_in, n_hidden, n_out):
-        """Initialize the parameters for the multilayer perceptron
-
-        :type rng: numpy.random.RandomState
-        :param rng: a random number generator used to initialize weights
-
-        :type input: theano.tensor.TensorType
-        :param input: symbolic variable that describes the input of the
-                      architecture (one minibatch)
-
-        :type n_in: int
-        :param n_in: number of input units, the dimension of the space in
-                     which the datapoints lie
-
-        :type n_hidden: int
-        :param n_hidden: number of hidden units
-
-        :type n_out: int
-        :param n_out: number of output units, the dimension of the space in
-                      which the labels lie
-
-        """
-
-        # Since we are dealing with a one hidden layer MLP, this will translate
-        # into a HiddenLayer with a tanh activation function connected to the
-        # LogisticRegression layer; the activation function can be replaced by
-        # sigmoid or any other nonlinear function
-        self.hidden = HiddenLayer(
-            rng=rng,
-            input=input,
-            n_in=n_in,
-            n_out=n_hidden,
-            activation=T.tanh
-        )
-
-        # The logistic regression layer gets as input the hidden units
-        # of the hidden layer
-        self.maxent = LogisticRegression(
-            input=self.hidden.output,
-            n_in=n_hidden,
-            n_out=n_out
-        )
-        # L1 norm ; one regularization option is to enforce L1 norm to
-        # be small
-        self.L1 = abs(self.hidden.W).sum() + abs(self.maxent.W).sum()
-
-        # square of L2 norm ; one regularization option is to enforce
-        # square of L2 norm to be small
-        self.L2_sqr = (self.hidden.W ** 2).sum() + (self.maxent.W ** 2).sum()
-
-        # negative log likelihood of the MLP is given by the negative
-        # log likelihood of the output of the model, computed in the
-        # logistic regression layer
-        self.neg_ll = self.maxent.neg_ll
-        # same holds for the function computing the number of errors
-        self.errors = self.maxent.errors
-
-        # the parameters of the model are the parameters of the two layer it is
-        # made out of
-        self.params = self.hidden.params + self.maxent.params
-
-
-
-
-def test_mlp(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
-             dataset='mnist.pkl.gz', batch_size=1, n_hidden=500):
-    """
-    Demonstrate stochastic gradient descent optimization for a multilayer
-    perceptron
-
-    This is demonstrated on MNIST.
-
-    :type learning_rate: float
-    :param learning_rate: learning rate used (factor for the stochastic
-                          gradient
-
-    :type L1_reg: float
-    :param L1_reg: L1-norm's weight when added to the cost (see
-                   regularization)
-
-    :type L2_reg: float
-    :param L2_reg: L2-norm's weight when added to the cost (see
-                   regularization)
-
-    :type n_epochs: int
-    :param n_epochs: maximal number of epochs to run the optimizer
-
-    :type dataset: string
-    :param dataset: the path of the MNIST dataset file from
-                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
-    """
-    datasets = load_data(dataset)
-
-    train_set_x, train_set_y = datasets[0]
-    valid_set_x, valid_set_y = datasets[1]
-    test_set_x, test_set_y = datasets[2]
-
-    ######################
-    # BUILD ACTUAL MODEL #
-    ######################
-    print '... building the model'
-
-    # allocate symbolic variables for the data
-    index = T.lscalar()  # index to a [mini]batch
-    x = T.matrix('x')  # the data is presented as rasterized images
-    y = T.ivector('y')  # the labels are presented as 1D vector of
-                        # [int] labels
-
-    rng = numpy.random.RandomState(1234)
-
-    # construct the MLP class
-    mlp = MLP(
-        rng=rng,
-        input=x,
-        n_in=28 * 28,
-        n_hidden=n_hidden,
-        n_out=10
-    )
-
-    # the cost we minimize during training is the negative log likelihood of
-    # the model plus the regularization terms (L1 and L2); cost is expressed
-    # here symbolically
-
-    # compiling a Theano function that computes the mistakes that are made
-    # by the model on a minibatch
-    test_model = theano.function(
-        inputs=[index],
-        outputs=mlp.maxent.errors(y),
-        givens={
-            x: test_set_x[index:index+1],
-            y: test_set_y[index:index+1]
-        }
-    )
-
-    validate_model = theano.function(
-        inputs=[index],
-        outputs=mlp.maxent.errors(y),
-        givens={
-            x: valid_set_x[index:index+1],
-            y: valid_set_y[index:index+1]
-        }
-    )
-
-    # compute the gradient of cost with respect to theta (sotred in params)
-    # the resulting gradients will be stored in a list gparams
-    cost = mlp.neg_ll(y) + L1_reg * mlp.L1 + L2_reg * mlp.L2_sqr
-    gparams = [T.grad(cost, param) for param in mlp.params]
-
-    # specify how to update the parameters of the model as a list of
-    # (variable, update expression) pairs
-
-    updates = [(mlp.params[i], mlp.params[i] - (learning_rate * gparams[i]))
-               for i in xrange(len(gparams))]
-
-    # compiling a Theano function `train_model` that returns the cost, but
-    # in the same time updates the parameter of the model based on the rules
-    # defined in `updates`
-    train_model = theano.function(
-        inputs=[index],
-        outputs=cost,
-        updates=updates,
-        givens={
-            x: train_set_x[index:index+1],
-            y: train_set_y[index:index+1]
-        }
-    )
-    # end-snippet-5
-
-    ###############
-    # TRAIN MODEL #
-    ###############
-    print '... training'
-
-    start_time = time.clock()
-
-    n_examples = train_set_x.get_value(borrow=True).shape[0]
-    n_dev_examples = valid_set_x.get_value(borrow=True).shape[0]
-    n_test_examples = test_set_x.get_value(borrow=True).shape[0]
-
-    for epoch in range(1, n_epochs+1):
-        for idx in xrange(n_examples):
-            train_model(idx)
-        # compute zero-one loss on validation set
-        error = numpy.mean(map(validate_model, xrange(n_dev_examples)))
-        print('epoch %i, validation error %f %%' % (epoch, error * 100))
-
-    end_time = time.clock()
-    print >> sys.stderr, ('The code for file ' +
-                          os.path.split(__file__)[1] +
-                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
-
-
-if __name__ == '__main__':
-    test_mlp()