mirror of https://github.com/explosion/spaCy.git

Remove vectors from vocab

parent 48eef94f92
commit 15f6efc127

spacy/vocab.pyx (218 lines changed)
@@ -26,15 +26,6 @@ from . import attrs
from . import symbols


DEF MAX_VEC_SIZE = 100000


cdef float[MAX_VEC_SIZE] EMPTY_VEC
memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
EMPTY_LEXEME.vector = EMPTY_VEC


cdef class Vocab:
    """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
    instance also provides access to the `StringStore`, and owns underlying
@@ -179,7 +170,6 @@ cdef class Vocab:
        lex.orth = self.strings[string]
        lex.length = len(string)
        lex.id = self.length
        lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
        if self.lex_attr_getters is not None:
            for attr, func in self.lex_attr_getters.items():
                value = func(string)
@@ -258,6 +248,26 @@ cdef class Vocab:
                Token.set_struct_attr(token, attr_id, value)
        return tokens

    def get_vector(self, orth):
        """Retrieve a vector for a word in the vocabulary.

        Words can be looked up by string or int ID.

        RETURNS:
            A word vector. Size and shape determined by the
            vocab.vectors instance. Usually, a numpy ndarray
            of shape (300,) and dtype float32.

        RAISES: If no vectors data is loaded, ValueError is raised.
        """
        raise NotImplementedError

    def has_vector(self, orth):
        """Check whether a word has a vector. Returns False if no
        vectors have been loaded. Words can be looked up by string
        or int ID."""
        raise NotImplementedError

    def to_disk(self, path):
        """Save the current state to a directory.
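The two new accessors are still stubs in this commit, so the following is only a sketch of the intended usage once a vocab.vectors table backs them; the model name and the lookups are illustrative assumptions, not part of the diff.

# Illustrative only: in this commit both methods still raise NotImplementedError.
import spacy

nlp = spacy.load('en_core_web_md')    # assumed pipeline that ships vectors
vocab = nlp.vocab

if vocab.has_vector(u'apple'):        # look up by string ...
    vec = vocab.get_vector(u'apple')
    print(vec.shape, vec.dtype)       # typically (300,), float32, per the docstring

orth_id = vocab.strings[u'apple']     # ... or by integer orth ID
vec = vocab.get_vector(orth_id)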
@@ -271,9 +281,6 @@ cdef class Vocab:
        with strings_loc.open('w', encoding='utf8') as file_:
            self.strings.dump(file_)

        # TODO: pickle
        # self.dump(path / 'lexemes.bin')

    def from_disk(self, path):
        """Loads state from a directory. Modifies the object in place and
        returns it.
@@ -346,7 +353,6 @@ cdef class Vocab:
                lex_data.data[j] = bytes_ptr[i+j]
            Lexeme.c_from_bytes(lexeme, lex_data)

            lexeme.vector = EMPTY_VEC
            py_str = self.strings[lexeme.orth]
            assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
            key = hash_string(py_str)
@@ -354,172 +360,6 @@ cdef class Vocab:
        self._by_orth.set(lexeme.orth, lexeme)
        self.length += 1

    # Deprecated --- delete these once stable

    def dump_vectors(self, out_loc):
        """Save the word vectors to a binary file.

        out_loc (Path): The path to save to.
        """
        cdef int32_t vec_len = self.vectors_length
        cdef int32_t word_len
        cdef bytes word_str
        cdef char* chars

        cdef Lexeme lexeme
        cdef CFile out_file = CFile(out_loc, 'wb')
        for lexeme in self:
            word_str = lexeme.orth_.encode('utf8')
            vec = lexeme.c.vector
            word_len = len(word_str)

            out_file.write_from(&word_len, 1, sizeof(word_len))
            out_file.write_from(&vec_len, 1, sizeof(vec_len))

            chars = <char*>word_str
            out_file.write_from(chars, word_len, sizeof(char))
            out_file.write_from(vec, vec_len, sizeof(float))
        out_file.close()
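For reference, each record dump_vectors emits is just a length-prefixed word followed by the raw float32 values. A minimal pure-Python reader for that layout might look like the sketch below; the function name read_binary_vectors and the native-byte-order assumption are mine, not part of the diff.

import struct

def read_binary_vectors(loc):
    # Each record: int32 word_len, int32 vec_len, word_len utf8 bytes,
    # then vec_len float32 values. Assumes native byte order, matching
    # the raw CFile writes above.
    entries = {}
    with open(loc, 'rb') as f:
        while True:
            header = f.read(8)
            if len(header) < 8:
                break
            word_len, vec_len = struct.unpack('=ii', header)
            word = f.read(word_len).decode('utf8')
            values = struct.unpack('=%df' % vec_len, f.read(4 * vec_len))
            entries[word] = values
    return entries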
    def load_vectors(self, file_):
        """Load vectors from a text-based file.

        file_ (buffer): The file to read from. Entries should be separated by
            newlines, and each entry should be whitespace delimited. The first
            value of the entry should be the word string, and subsequent entries
            should be the values of the vector.

        RETURNS (int): The length of the vectors loaded.
        """
        cdef LexemeC* lexeme
        cdef attr_t orth
        cdef int32_t vec_len = -1
        cdef double norm = 0.0

        whitespace_pattern = re.compile(r'\s', re.UNICODE)

        for line_num, line in enumerate(file_):
            pieces = line.split()
            word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
            if vec_len == -1:
                vec_len = len(pieces)
            elif vec_len != len(pieces):
                raise VectorReadError.mismatched_sizes(file_, line_num,
                                                       vec_len, len(pieces))
            orth = self.strings[word_str]
            lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
            lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
            for i, val_str in enumerate(pieces):
                lexeme.vector[i] = float(val_str)
            norm = 0.0
            for i in range(vec_len):
                norm += lexeme.vector[i] * lexeme.vector[i]
            lexeme.l2_norm = sqrt(norm)
        self.vectors_length = vec_len
        return vec_len
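The expected input is a plain text table, one word per line followed by its values. A toy file and a hedged usage sketch follow; the file name, contents, and the assumption that `vocab` is a Vocab instance are illustrative only.

# vectors.txt -- toy contents, 3 words with 4-dimensional vectors:
#
#   the    0.1  0.2 -0.3  0.4
#   and    0.5 -0.6  0.7  0.8
#   apple  0.9  1.0 -1.1  1.2
#
import io

with io.open('vectors.txt', 'r', encoding='utf8') as file_:
    vec_len = vocab.load_vectors(file_)
print(vec_len)   # 4 for the toy file above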
    def load_vectors_from_bin_loc(self, loc):
        """Load vectors from the location of a binary file.

        loc (unicode): The path of the binary file to load from.

        RETURNS (int): The length of the vectors loaded.
        """
        cdef CFile file_ = CFile(loc, b'rb')
        cdef int32_t word_len
        cdef int32_t vec_len = 0
        cdef int32_t prev_vec_len = 0
        cdef float* vec
        cdef Address mem
        cdef attr_t string_id
        cdef bytes py_word
        cdef vector[float*] vectors
        cdef int line_num = 0
        cdef Pool tmp_mem = Pool()
        while True:
            try:
                file_.read_into(&word_len, sizeof(word_len), 1)
            except IOError:
                break
            file_.read_into(&vec_len, sizeof(vec_len), 1)
            if prev_vec_len != 0 and vec_len != prev_vec_len:
                raise VectorReadError.mismatched_sizes(loc, line_num,
                                                       vec_len, prev_vec_len)
            # vec_len must lie strictly between 0 and MAX_VEC_SIZE
            if vec_len <= 0 or vec_len >= MAX_VEC_SIZE:
                raise VectorReadError.bad_size(loc, vec_len)

            chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
            vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))

            string_id = self.strings[chars[:word_len]]
            # Insert words into vocab to add vector.
            self.get_by_orth(self.mem, string_id)
            while string_id >= vectors.size():
                vectors.push_back(EMPTY_VEC)
            assert vec != NULL
            vectors[string_id] = vec
            line_num += 1
        cdef LexemeC* lex
        cdef size_t lex_addr
        cdef double norm = 0.0
        cdef int i
        for orth, lex_addr in self._by_orth.items():
            lex = <LexemeC*>lex_addr
            if lex.lower < vectors.size():
                lex.vector = vectors[lex.lower]
                norm = 0.0
                for i in range(vec_len):
                    norm += lex.vector[i] * lex.vector[i]
                lex.l2_norm = sqrt(norm)
            else:
                lex.vector = EMPTY_VEC
        self.vectors_length = vec_len
        return vec_len
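A hedged round-trip sketch with the two deprecated binary helpers; the path is illustrative and `vocab` is assumed to already hold vectors.

vocab.dump_vectors('/tmp/vectors.bin')                          # write current vectors
vec_len = vocab.load_vectors_from_bin_loc('/tmp/vectors.bin')   # read them back
assert vec_len == vocab.vectors_length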
    def resize_vectors(self, int new_size):
        """Set vectors_length to a new size, and allocate more memory for the
        `Lexeme` vectors if necessary. The memory will be zeroed.

        new_size (int): The new size of the vectors.
        """
        cdef hash_t key
        cdef size_t addr
        if new_size > self.vectors_length:
            for key, addr in self._by_hash.items():
                lex = <LexemeC*>addr
                lex.vector = <float*>self.mem.realloc(lex.vector,
                                        new_size * sizeof(lex.vector[0]))
        self.vectors_length = new_size
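Usage is a single call; growing the table reallocates every lexeme's buffer, so it can be costly on a large vocab. The size 300 below is only an example.

vocab.resize_vectors(300)            # grow each lex.vector to 300 floats, zero-filled
assert vocab.vectors_length == 300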
def write_binary_vectors(in_loc, out_loc):
    cdef CFile out_file = CFile(out_loc, 'wb')
    cdef Address mem
    cdef int32_t word_len
    cdef int32_t vec_len
    cdef char* chars
    with bz2.BZ2File(in_loc, 'r') as file_:
        for line in file_:
            pieces = line.split()
            word = pieces.pop(0)
            mem = Address(len(pieces), sizeof(float))
            vec = <float*>mem.ptr
            for i, val_str in enumerate(pieces):
                vec[i] = float(val_str)

            word_len = len(word)
            vec_len = len(pieces)

            out_file.write_from(&word_len, 1, sizeof(word_len))
            out_file.write_from(&vec_len, 1, sizeof(vec_len))

            chars = <char*>word
            out_file.write_from(chars, len(word), sizeof(char))
            out_file.write_from(vec, vec_len, sizeof(float))
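The helper above converts a bz2-compressed text vector file (same whitespace-delimited layout as load_vectors expects) into the binary layout that load_vectors_from_bin_loc reads. A hedged usage sketch with made-up paths, assuming `vocab` is a Vocab instance:

write_binary_vectors('vectors.txt.bz2', '/tmp/vectors.bin')
vec_len = vocab.load_vectors_from_bin_loc('/tmp/vectors.bin')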
def pickle_vocab(vocab):
    sstore = vocab.strings
@@ -567,21 +407,3 @@ class LookupError(Exception):
                "ID of orth: {orth_id}".format(
                    query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
            )


class VectorReadError(Exception):
    @classmethod
    def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
        return cls(
            "Error reading word vectors from %s on line %d.\n"
            "All vectors must be the same size.\n"
            "Prev size: %d\n"
            "Curr size: %d" % (loc, line_num, prev_size, curr_size))

    @classmethod
    def bad_size(cls, loc, size):
        return cls(
            "Error reading word vectors from %s.\n"
            "Vector size: %d\n"
            "Max size: %d\n"
            "Min size: 1\n" % (loc, size, MAX_VEC_SIZE))