diff --git a/requirements.txt b/requirements.txt
index fa1a3e6d3..aae0f9388 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ pathlib
 numpy>=1.7
 cymem>=1.30,<1.32
 preshed>=1.0.0,<2.0.0
-thinc>=6.8.1,<6.9.0
+thinc>=6.8.0,<6.9.0
 murmurhash>=0.28,<0.29
 plac<1.0.0,>=0.9.6
 six
diff --git a/setup.py b/setup.py
index 73c38b14a..ec114c94e 100755
--- a/setup.py
+++ b/setup.py
@@ -29,6 +29,7 @@ MOD_NAMES = [
     'spacy.syntax.stateclass',
     'spacy.syntax._state',
     'spacy.tokenizer',
+    'spacy._cfile',
     'spacy.syntax.parser',
     'spacy.syntax.nn_parser',
     'spacy.syntax.beam_parser',
@@ -193,7 +194,7 @@ def setup_package():
             'murmurhash>=0.28,<0.29',
             'cymem>=1.30,<1.32',
             'preshed>=1.0.0,<2.0.0',
-            'thinc>=6.8.1,<6.9.0',
+            'thinc>=6.8.0,<6.9.0',
             'plac<1.0.0,>=0.9.6',
             'pip>=9.0.0,<10.0.0',
             'six',
diff --git a/spacy/_cfile.pxd b/spacy/_cfile.pxd
new file mode 100644
index 000000000..cb0077587
--- /dev/null
+++ b/spacy/_cfile.pxd
@@ -0,0 +1,26 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from cymem.cymem cimport Pool
+
+cdef class CFile:
+    cdef FILE* fp
+    cdef bint is_open
+    cdef Pool mem
+    cdef int size  # For compatibility with subclass
+    cdef int _capacity  # For compatibility with subclass
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
+
+
+cdef class StringCFile(CFile):
+    cdef unsigned char* data
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *
diff --git a/spacy/_cfile.pyx b/spacy/_cfile.pyx
new file mode 100644
index 000000000..ceebe2e59
--- /dev/null
+++ b/spacy/_cfile.pyx
@@ -0,0 +1,88 @@
+from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
+from libc.string cimport memcpy
+
+
+cdef class CFile:
+    def __init__(self, loc, mode, on_open_error=None):
+        if isinstance(mode, unicode):
+            mode_str = mode.encode('ascii')
+        else:
+            mode_str = mode
+        if hasattr(loc, 'as_posix'):
+            loc = loc.as_posix()
+        self.mem = Pool()
+        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
+        self.fp = fopen(<char*>bytes_loc, mode_str)
+        if self.fp == NULL:
+            if on_open_error is not None:
+                on_open_error()
+            else:
+                raise IOError("Could not open binary file %s" % bytes_loc)
+        self.is_open = True
+
+    def __dealloc__(self):
+        # Guard on fp: the StringCFile subclass sets is_open without a FILE*.
+        if self.is_open and self.fp != NULL:
+            fclose(self.fp)
+
+    def close(self):
+        fclose(self.fp)
+        self.is_open = False
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        st = fread(dest, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
+        st = fwrite(src, elem_size, number, self.fp)
+        if st != number:
+            raise IOError
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write_from(chars, len(py_bytes), sizeof(char))
+
+
+cdef class StringCFile(CFile):
+    def __init__(self, mode, bytes data=b'', on_open_error=None):
+        self.mem = Pool()
+        self.is_open = 'w' in mode
+        self._capacity = max(len(data), 8)
+        self.size = len(data)
+        self.data = <unsigned char*>self.mem.alloc(1, self._capacity)
+        for i in range(len(data)):
+            self.data[i] = data[i]
+
+    def close(self):
+        self.is_open = False
+
+    def string_data(self):
+        # Writes never advance self.data, so the written bytes are data[:size].
+        return self.data[:self.size]
+
+    cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1:
+        memcpy(dest, self.data, elem_size * number)
+        self.data += elem_size * number
+
+    cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1:
+        write_size = number * elem_size
+        if (self.size + write_size) >= self._capacity:
+            self._capacity = (self.size + write_size) * 2
+            self.data = <unsigned char*>self.mem.realloc(self.data, self._capacity)
+        memcpy(&self.data[self.size], src, write_size)
+        self.size += write_size
+
+    cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *:
+        cdef void* dest = mem.alloc(number, elem_size)
+        self.read_into(dest, number, elem_size)
+        return dest
+
+    def write_unicode(self, unicode value):
+        cdef bytes py_bytes = value.encode('utf8')
+        cdef char* chars = <char*>py_bytes
+        self.write_from(chars, len(py_bytes), sizeof(char))
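Note on `StringCFile`: it mirrors the `CFile` read/write API against a growable in-memory buffer, doubling capacity whenever a write would overflow. A minimal pure-Python sketch of the same growth strategy (the class and method names here are illustrative, not part of the patch):

```python
class GrowableBuffer:
    """Append-only byte buffer that doubles capacity on overflow,
    mirroring StringCFile.write_from."""

    def __init__(self, data=b''):
        self.capacity = max(len(data), 8)
        self.size = len(data)
        self.buf = bytearray(self.capacity)
        self.buf[:self.size] = data

    def write_from(self, src):
        if self.size + len(src) >= self.capacity:
            # Grow to twice the required size, like the realloc branch above.
            self.capacity = (self.size + len(src)) * 2
            new_buf = bytearray(self.capacity)
            new_buf[:self.size] = self.buf[:self.size]
            self.buf = new_buf
        self.buf[self.size:self.size + len(src)] = src
        self.size += len(src)

    def string_data(self):
        return bytes(self.buf[:self.size])


buf = GrowableBuffer()
buf.write_from(b'hello ')
buf.write_from(b'world')       # triggers the doubling branch
assert buf.string_data() == b'hello world'
```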
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 9e7cb9a0e..ca238774a 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -37,14 +37,11 @@ from preshed.maps cimport MapStruct
 from preshed.maps cimport map_get
 
 from thinc.api import layerize, chain, noop, clone
-<<<<<<< HEAD
 from thinc.neural import Model, Affine, ELU, ReLu, Maxout
-=======
 from thinc.neural import Model, Affine, ReLu, Maxout
 from thinc.neural._classes.batchnorm import BatchNorm as BN
 from thinc.neural._classes.selu import SELU
 from thinc.neural._classes.layernorm import LayerNorm
->>>>>>> feature/nn-beam-parser
 from thinc.neural.ops import NumpyOps, CupyOps
 from thinc.neural.util import get_array_module
 
@@ -54,6 +51,7 @@ from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts
 from .._ml import Tok2Vec, doc2feats, rebatch
 from ..compat import json_dumps
 
+from . import _beam_utils
 from . import _parse_features
 from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
@@ -68,10 +66,6 @@ from ..strings cimport StringStore
 from ..gold cimport GoldParse
 from ..attrs cimport TAG, DEP
 
-<<<<<<< HEAD
-=======
-USE_FINE_TUNE = True
->>>>>>> feature/nn-beam-parser
 
 def get_templates(*args, **kwargs):
     return []
@@ -259,7 +253,6 @@ cdef class Parser:
                               nI=token_vector_width)
 
         with Model.use_device('cpu'):
-<<<<<<< HEAD
             if depth == 0:
                 upper = chain()
                 upper.is_noop = True
@@ -269,12 +262,6 @@ cdef class Parser:
                     zero_init(Affine(nr_class, drop_factor=0.0))
                 )
                 upper.is_noop = False
-=======
-            upper = chain(
-                clone(Maxout(hidden_width), (depth-1)),
-                zero_init(Affine(nr_class, drop_factor=0.0))
-            )
->>>>>>> feature/nn-beam-parser
         # TODO: This is an unfortunate hack atm!
         # Used to set input dimensions in network.
         lower.begin_training(lower.ops.allocate((500, token_vector_width)))
@@ -422,7 +409,6 @@ cdef class Parser:
         c_is_valid = <int*>is_valid.data
         cdef int has_hidden = not getattr(vec2scores, 'is_noop', False)
         while not next_step.empty():
-<<<<<<< HEAD
             if not has_hidden:
                 for i in cython.parallel.prange(
                         next_step.size(), num_threads=6, nogil=True):
@@ -442,21 +428,6 @@ cdef class Parser:
                         &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
                     action = self.moves.c[guess]
                     action.do(st, action.label)
-=======
-            for i in range(next_step.size()):
-                st = next_step[i]
-                st.set_context_tokens(&c_token_ids[i*nr_feat], nr_feat)
-                self.moves.set_valid(&c_is_valid[i*nr_class], st)
-            vectors = state2vec(token_ids[:next_step.size()])
-            scores = vec2scores(vectors)
-            c_scores = <float*>scores.data
-            for i in range(next_step.size()):
-                st = next_step[i]
-                guess = arg_max_if_valid(
-                    &c_scores[i*nr_class], &c_is_valid[i*nr_class], nr_class)
-                action = self.moves.c[guess]
-                action.do(st, action.label)
->>>>>>> feature/nn-beam-parser
             this_step, next_step = next_step, this_step
             next_step.clear()
             for st in this_step:
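The resolved decoding loop keeps the batched `HEAD` version: when the upper model is a no-op, states advance in parallel under `nogil`, and each state takes the highest-scoring action that the transition system marks valid (`arg_max_if_valid`). In numpy terms the valid-argmax is equivalent to this sketch (illustrative only; the real implementation is a C loop over raw score and validity arrays):

```python
import numpy

def arg_max_if_valid(scores, is_valid):
    # Mask invalid actions to -inf so argmax can only pick a valid one.
    masked = numpy.where(is_valid, scores, -numpy.inf)
    return int(masked.argmax())

scores = numpy.asarray([1.2, 3.4, 0.5], dtype='f')
is_valid = numpy.asarray([True, False, True])
assert arg_max_if_valid(scores, is_valid) == 0  # index 1 scores higher, but is invalid
```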
@@ -526,17 +497,17 @@ cdef class Parser:
         free(token_ids)
 
     def update(self, docs_tokvecs, golds, drop=0., sgd=None, losses=None):
-<<<<<<< HEAD
-=======
         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.5:
             return self.update_beam(docs_tokvecs, golds,
                     self.cfg['beam_width'], self.cfg['beam_density'],
                     drop=drop, sgd=sgd, losses=losses)
->>>>>>> feature/nn-beam-parser
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
         docs, tokvec_lists = docs_tokvecs
         tokvecs = self.model[0].ops.flatten(tokvec_lists)
+        my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+        tokvecs += self.model[0].ops.flatten(my_tokvecs)
+
         if isinstance(docs, Doc) and isinstance(golds, GoldParse):
             docs = [docs]
             golds = [golds]
@@ -589,12 +560,8 @@ cdef class Parser:
                 break
         self._make_updates(d_tokvecs,
             backprops, sgd, cuda_stream)
-<<<<<<< HEAD
-        return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
-=======
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs])
-        if USE_FINE_TUNE:
-            bp_my_tokvecs(d_tokvecs, sgd=sgd)
+        bp_my_tokvecs(d_tokvecs, sgd=sgd)
         return d_tokvecs
 
     def update_beam(self, docs_tokvecs, golds, width=None, density=None,
@@ -609,10 +576,9 @@ cdef class Parser:
         lengths = [len(d) for d in docs]
         assert min(lengths) >= 1
         tokvecs = self.model[0].ops.flatten(tokvecs)
-        if USE_FINE_TUNE:
-            my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
-            my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
-            tokvecs += my_tokvecs
+        my_tokvecs, bp_my_tokvecs = self.model[0].begin_update(docs_tokvecs, drop=drop)
+        my_tokvecs = self.model[0].ops.flatten(my_tokvecs)
+        tokvecs += my_tokvecs
 
         states = self.moves.init_batch(docs)
         for gold in golds:
@@ -643,10 +609,8 @@ cdef class Parser:
         d_tokvecs = self.model[0].ops.allocate(tokvecs.shape)
         self._make_updates(d_tokvecs, backprop_lower, sgd, cuda_stream)
         d_tokvecs = self.model[0].ops.unflatten(d_tokvecs, lengths)
-        if USE_FINE_TUNE:
-            bp_my_tokvecs(d_tokvecs, sgd=sgd)
+        bp_my_tokvecs(d_tokvecs, sgd=sgd)
         return d_tokvecs
->>>>>>> feature/nn-beam-parser
 
     def _init_gold_batch(self, whole_docs, whole_golds):
         """Make a square batch, of length equal to the shortest doc. A long
@@ -691,21 +655,10 @@ cdef class Parser:
         xp = get_array_module(d_tokvecs)
         for ids, d_vector, bp_vector in backprops:
            d_state_features = bp_vector(d_vector, sgd=sgd)
-<<<<<<< HEAD
-            active_feats = ids * (ids >= 0)
-            active_feats = active_feats.reshape((ids.shape[0], ids.shape[1], 1))
-            if hasattr(xp, 'scatter_add'):
-                xp.scatter_add(d_tokvecs,
-                    ids, d_state_features * active_feats)
-            else:
-                xp.add.at(d_tokvecs,
-                    ids, d_state_features * active_feats)
-=======
             mask = ids >= 0
             d_state_features *= mask.reshape(ids.shape + (1,))
             self.model[0].ops.scatter_add(d_tokvecs, ids * mask,
                 d_state_features)
->>>>>>> feature/nn-beam-parser
 
     @property
     def move_names(self):
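With `USE_FINE_TUNE` gone, both `update` and `update_beam` apply the fine-tuning step unconditionally: on the forward pass the parser's own tok2vec output is added to the incoming token vectors, and on the backward pass the same gradient `d_tokvecs` is routed into the fine-tuned model through `bp_my_tokvecs`. A toy numpy sketch of that residual pattern (the identity `begin_update` below stands in for the thinc model; it is not the parser's code):

```python
import numpy

def begin_update(X, drop=0.):
    # Stand-in for model.begin_update: returns output plus a backprop callback.
    def backprop(dY, sgd=None):
        return dY
    return X, backprop

tokvecs = numpy.ones((4, 8), dtype='f')            # upstream token vectors
my_tokvecs, bp_my_tokvecs = begin_update(tokvecs)  # forward through fine-tune model
tokvecs = tokvecs + my_tokvecs                     # residual addition

d_tokvecs = numpy.zeros_like(tokvecs)              # gradient from the parser
bp_my_tokvecs(d_tokvecs)                           # same gradient reaches both paths
```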
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 35d4d17ab..59a24dfa9 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -1,18 +1,26 @@
+from libc.stdint cimport int32_t, uint64_t
 import numpy
 from collections import OrderedDict
 import msgpack
 import msgpack_numpy
 msgpack_numpy.patch()
+from cymem.cymem cimport Pool
+cimport numpy as np
+from libcpp.vector cimport vector
 
+from .typedefs cimport attr_t
 from .strings cimport StringStore
 from . import util
+from ._cfile cimport CFile
+
+MAX_VEC_SIZE = 10000
 
 
 cdef class Vectors:
     '''Store, save and load word vectors.'''
     cdef public object data
     cdef readonly StringStore strings
-    cdef public object key2i
+    cdef public object index
 
     def __init__(self, strings, data_or_width):
         self.strings = StringStore()
@@ -22,9 +30,9 @@ cdef class Vectors:
         else:
             data = data_or_width
         self.data = data
-        self.key2i = {}
+        self.index = {}
         for i, string in enumerate(strings):
-            self.key2i[self.strings.add(string)] = i
+            self.index[self.strings.add(string)] = i
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))
@@ -32,7 +40,7 @@ cdef class Vectors:
     def __getitem__(self, key):
         if isinstance(key, basestring):
             key = self.strings[key]
-        i = self.key2i[key]
+        i = self.index[key]
         if i is None:
             raise KeyError(key)
         else:
@@ -41,7 +49,7 @@ cdef class Vectors:
     def __setitem__(self, key, vector):
         if isinstance(key, basestring):
             key = self.strings.add(key)
-        i = self.key2i[key]
+        i = self.index[key]
         self.data[i] = vector
 
     def __iter__(self):
@@ -61,34 +69,119 @@ cdef class Vectors:
     def most_similar(self, key):
         raise NotImplementedError
 
-    def to_disk(self, path):
-        raise NotImplementedError
+    def to_disk(self, path, **exclude):
+        def serialize_vectors(p):
+            write_vectors_to_bin_loc(self.strings, self.index, self.data, str(p))
 
-    def from_disk(self, path):
-        raise NotImplementedError
+        serializers = OrderedDict((
+            ('vec.bin', serialize_vectors),
+        ))
+        return util.to_disk(path, serializers, exclude)
+
+    def from_disk(self, path, **exclude):
+        def deserialize_vectors(p):
+            self.index, self.data = load_vectors_from_bin_loc(self.strings, str(p))
+
+        deserializers = OrderedDict((
+            ('vec.bin', deserialize_vectors),
+        ))
+        return util.from_disk(path, deserializers, exclude)
 
     def to_bytes(self, **exclude):
         def serialize_weights():
-            if hasattr(self.weights, 'to_bytes'):
-                return self.weights.to_bytes()
+            if hasattr(self.data, 'to_bytes'):
+                return self.data.to_bytes()
             else:
-                return msgpack.dumps(self.weights)
+                return msgpack.dumps(self.data)
 
         serializers = OrderedDict((
+            ('key2row', lambda: msgpack.dumps(self.index)),
             ('strings', lambda: self.strings.to_bytes()),
-            ('weights', serialize_weights)
+            ('vectors', serialize_weights)
         ))
         return util.to_bytes(serializers, exclude)
 
     def from_bytes(self, data, **exclude):
         def deserialize_weights(b):
-            if hasattr(self.weights, 'from_bytes'):
-                self.weights.from_bytes()
+            if hasattr(self.data, 'from_bytes'):
+                self.data.from_bytes(b)
             else:
-                self.weights = msgpack.loads(b)
+                self.data = msgpack.loads(b)
 
         deserializers = OrderedDict((
+            ('key2row', lambda b: self.index.update(msgpack.loads(b))),
             ('strings', lambda b: self.strings.from_bytes(b)),
-            ('weights', deserialize_weights)
+            ('vectors', deserialize_weights)
         ))
-        return util.from_bytes(deserializers, exclude)
+        return util.from_bytes(data, deserializers, exclude)
+
+
+def write_vectors_to_bin_loc(StringStore strings, dict key2i,
+                             np.ndarray vectors, out_loc):
+    cdef int32_t vec_len = vectors.shape[1]
+    cdef int32_t word_len
+    cdef bytes word_str
+    cdef char* chars
+    cdef uint64_t key
+    cdef int32_t i
+    cdef float* vec
+
+    cdef CFile out_file = CFile(out_loc, 'wb')
+    keys = [(i, key) for (key, i) in key2i.items()]
+    keys.sort()
+    for i, key in keys:
+        vec = <float*>vectors.data + i * vec_len
+        word_str = strings[key].encode('utf8')
+        word_len = len(word_str)
+
+        out_file.write_from(&word_len, 1, sizeof(word_len))
+        out_file.write_from(&vec_len, 1, sizeof(vec_len))
+
+        chars = <char*>word_str
+        out_file.write_from(chars, word_len, sizeof(char))
+        out_file.write_from(vec, vec_len, sizeof(float))
+    out_file.close()
+
+
+def load_vectors_from_bin_loc(StringStore strings, loc):
+    """Load vectors from the location of a binary file.
+
+    Arguments:
+        loc (unicode): The path of the binary file to load from.
+    Returns:
+        Tuple of the key-to-row dict and the vectors as a numpy array.
+    """
+    cdef CFile file_ = CFile(loc, b'rb')
+    cdef int32_t word_len
+    cdef int32_t vec_len = 0
+    cdef int32_t prev_vec_len = 0
+    cdef float* vec
+    cdef char* chars
+    cdef attr_t string_id
+    cdef bytes py_word
+    cdef vector[float*] vectors
+    cdef int line_num = 0
+    cdef Pool mem = Pool()
+    cdef dict key2i = {}
+    while True:
+        try:
+            file_.read_into(&word_len, 1, sizeof(word_len))
+        except IOError:
+            break
+        file_.read_into(&vec_len, 1, sizeof(vec_len))
+        if prev_vec_len != 0 and vec_len != prev_vec_len:
+            raise Exception("Mismatched vector sizes")
+        if vec_len <= 0 or vec_len >= MAX_VEC_SIZE:
+            raise Exception("Invalid vector size: %d" % vec_len)
+        prev_vec_len = vec_len
+
+        chars = <char*>file_.alloc_read(mem, word_len, sizeof(char))
+        vec = <float*>file_.alloc_read(mem, vec_len, sizeof(float))
+
+        key = strings.add(chars[:word_len])
+        key2i[key] = vectors.size()
+        vectors.push_back(vec)
+    numpy_vectors = numpy.zeros((vectors.size(), vec_len), dtype='f')
+    for i in range(vectors.size()):
+        for j in range(vec_len):
+            numpy_vectors[i, j] = vectors[i][j]
+    return key2i, numpy_vectors
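For reference, each record in the `vec.bin` layout written above is: an `int32` word length, an `int32` vector length, `word_len` bytes of UTF-8 text, then `vec_len` `float32` values. A pure-Python round trip of one record with `struct` (assuming a little-endian machine, since `CFile` writes native byte order):

```python
import struct

def pack_record(word, vec):
    data = word.encode('utf8')
    out = struct.pack('<ii', len(data), len(vec))   # word_len, vec_len
    out += data                                     # UTF-8 bytes
    out += struct.pack('<%df' % len(vec), *vec)     # float32 values
    return out

def unpack_record(buf, offset=0):
    word_len, vec_len = struct.unpack_from('<ii', buf, offset)
    offset += 8
    word = buf[offset:offset + word_len].decode('utf8')
    offset += word_len
    vec = list(struct.unpack_from('<%df' % vec_len, buf, offset))
    return word, vec, offset + 4 * vec_len

buf = pack_record('hello', [0.25, -1.0])
word, vec, end = unpack_record(buf)
assert (word, vec, end) == ('hello', [0.25, -1.0], len(buf))
```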
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 149317779..5909872d6 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -280,7 +280,7 @@ cdef class Vocab:
         or int ID."""
         return False
 
-    def to_disk(self, path):
+    def to_disk(self, path, **exclude):
         """Save the current state to a directory.
 
         path (unicode or Path): A path to a directory, which will be created if
@@ -292,8 +292,10 @@ cdef class Vocab:
         self.strings.to_disk(path / 'strings.json')
         with (path / 'lexemes.bin').open('wb') as file_:
             file_.write(self.lexemes_to_bytes())
+        if self.vectors is not None:
+            self.vectors.to_disk(path, exclude='strings.json')
 
-    def from_disk(self, path):
+    def from_disk(self, path, **exclude):
         """Loads state from a directory. Modifies the object in place and
         returns it.
@@ -305,6 +307,8 @@ cdef class Vocab:
         self.strings.from_disk(path / 'strings.json')
         with (path / 'lexemes.bin').open('rb') as file_:
             self.lexemes_from_bytes(file_.read())
+        if self.vectors is not None:
+            self.vectors.from_disk(path, exclude='strings.json')
         return self
 
     def to_bytes(self, **exclude):
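With these hooks, `Vocab.to_disk` writes `strings.json`, `lexemes.bin`, and (when vectors are present) `vec.bin` into the same directory, and `from_disk` restores all three; the nested `Vectors` save excludes `strings.json` because the `Vocab` has already written it. A usage sketch (hypothetical path; assumes a populated vocab):

```python
from spacy.vocab import Vocab

vocab = Vocab()
vocab.to_disk('/tmp/vocab')                 # strings.json, lexemes.bin, vec.bin
restored = Vocab().from_disk('/tmp/vocab')  # from_disk returns the modified Vocab
```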
@@ -313,9 +317,16 @@ cdef class Vocab:
         **exclude: Named attributes to prevent from being serialized.
         RETURNS (bytes): The serialized form of the `Vocab` object.
         """
+        def serialize_vectors():
+            if self.vectors is None:
+                return None
+            else:
+                return self.vectors.to_bytes(exclude='strings')
+
         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
             ('lexemes', lambda: self.lexemes_to_bytes()),
+            ('vectors', serialize_vectors)
         ))
         return util.to_bytes(getters, exclude)
 
@@ -326,9 +337,15 @@ cdef class Vocab:
         **exclude: Named attributes to prevent from being loaded.
         RETURNS (Vocab): The `Vocab` object.
         """
+        def deserialize_vectors(b):
+            if self.vectors is None:
+                return None
+            else:
+                return self.vectors.from_bytes(b, exclude='strings')
         setters = OrderedDict((
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('lexemes', lambda b: self.lexemes_from_bytes(b)),
+            ('vectors', lambda b: deserialize_vectors(b))
         ))
         util.from_bytes(bytes_data, setters, exclude)
         return self
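The byte-level API gains the matching `vectors` section. Under this `**exclude` convention a section is skipped when its name appears among the keyword arguments, so a round trip with and without vectors looks like this (usage sketch; assumes the `vocab` from above):

```python
data = vocab.to_bytes()                     # includes strings, lexemes, vectors
restored = Vocab().from_bytes(data)

no_vectors = vocab.to_bytes(vectors=True)   # any value: the key's presence excludes it
```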