Fix initialization of vectors, to address serialization problem

2017-10-20 13:59:24 +02:00 · 2017-10-20 13:59:24 +02:00 · 92ac9316b5
parent 61bc203f3f
commit 92ac9316b5
2 changed files with 9 additions and 13 deletions
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@ -32,22 +32,20 @@ cdef class Vectors:
    cdef public object keys
    cdef public int i
-    def __init__(self, strings, data_or_width=0):
+    def __init__(self, strings, data=None, width=0):
        if isinstance(strings, StringStore):
            self.strings = strings
        else:
            self.strings = StringStore()
            for string in strings:
                self.strings.add(string)
-        if isinstance(data_or_width, int):
+        if data is not None:
-            self.data = data = numpy.zeros((len(strings), data_or_width),
+            self.data = numpy.asarray(data, dtype='f')
                                           dtype='f')
        else:
-            data = data_or_width
+            self.data = numpy.zeros((len(self.strings), width), dtype='f')
        self.i = 0
        self.data = data
        self.key2row = {}
-        self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
+        self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
    def __reduce__(self):
        return (Vectors, (self.strings, self.data))
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -62,12 +62,10 @@ cdef class Vocab:
        if strings:
            for string in strings:
                _ = self[string]
        for name in tag_map.keys():
            if name:
                self.strings.add(name)
        self.lex_attr_getters = lex_attr_getters
        print("Create morphology", list(self.strings), tag_map)
        self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.vectors = Vectors(self.strings)
+        self.vectors = Vectors(self.strings, width=0)
    property lang:
        def __get__(self):
@ -338,7 +336,7 @@ cdef class Vocab:
            if self.vectors is None:
                return None
            else:
-                return self.vectors.to_bytes(exclude='strings.json')
+                return self.vectors.to_bytes()
        getters = OrderedDict((
            ('strings', lambda: self.strings.to_bytes()),
@ -358,7 +356,7 @@ cdef class Vocab:
            if self.vectors is None:
                return None
            else:
-                return self.vectors.from_bytes(b, exclude='strings')
+                return self.vectors.from_bytes(b)
        setters = OrderedDict((
            ('strings', lambda b: self.strings.from_bytes(b)),
            ('lexemes', lambda b: self.lexemes_from_bytes(b)),