mirror of https://github.com/explosion/spaCy.git

Remove vectors from vocab

This commit is contained in:
parent 48eef94f92
commit 15f6efc127

spacy/vocab.pyx: 218 lines changed
@@ -26,15 +26,6 @@ from . import attrs
 from . import symbols


-DEF MAX_VEC_SIZE = 100000
-
-
-cdef float[MAX_VEC_SIZE] EMPTY_VEC
-memset(EMPTY_VEC, 0, sizeof(EMPTY_VEC))
-memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
-EMPTY_LEXEME.vector = EMPTY_VEC
-
-
 cdef class Vocab:
     """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
     instance also provides access to the `StringStore`, and owns underlying
@@ -179,7 +170,6 @@ cdef class Vocab:
         lex.orth = self.strings[string]
         lex.length = len(string)
         lex.id = self.length
-        lex.vector = <float*>mem.alloc(self.vectors_length, sizeof(float))
         if self.lex_attr_getters is not None:
             for attr, func in self.lex_attr_getters.items():
                 value = func(string)
@@ -258,6 +248,26 @@ cdef class Vocab:
             Token.set_struct_attr(token, attr_id, value)
         return tokens

+    def get_vector(self, orth):
+        """Retrieve a vector for a word in the vocabulary.
+
+        Words can be looked up by string or int ID.
+
+        RETURNS:
+            A word vector. Size and shape determined by the
+            vocab.vectors instance. Usually, a numpy ndarray
+            of shape (300,) and dtype float32.
+
+        RAISES: If no vectors data is loaded, ValueError is raised.
+        """
+        raise NotImplementedError
+
+    def has_vector(self, orth):
+        """Check whether a word has a vector. Returns False if no
+        vectors have been loaded. Words can be looked up by string
+        or int ID."""
+        raise NotImplementedError
+
     def to_disk(self, path):
         """Save the current state to a directory.

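The two accessors added here are stubs for now (both raise NotImplementedError), but their docstrings already fix the intended contract: lookup by string or integer ID, a numpy ndarray (typically shape (300,), dtype float32) when vectors are available, and ValueError from get_vector() when no vectors data is loaded. Below is a minimal caller-side sketch of that contract; the bare Vocab() instance and the example word are assumptions for illustration, and only get_vector()/has_vector() come from the diff.

    from spacy.vocab import Vocab

    # Sketch only: at this commit both methods still raise NotImplementedError.
    # The behaviour exercised below is what the new docstrings promise.
    vocab = Vocab()
    word = 'apple'  # example word; a string or an int orth ID both work

    if vocab.has_vector(word):
        vec = vocab.get_vector(word)   # e.g. numpy ndarray, shape (300,), float32
        print(vec.shape, vec.dtype)
    else:
        try:
            vocab.get_vector(word)     # documented to raise ValueError without vectors
        except ValueError:
            print('no vectors data loaded')

    # The same lookup works with the integer ID:
    orth_id = vocab.strings[word]
    print(vocab.has_vector(orth_id))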
@@ -271,9 +281,6 @@ cdef class Vocab:
         with strings_loc.open('w', encoding='utf8') as file_:
             self.strings.dump(file_)

-        # TODO: pickle
-        # self.dump(path / 'lexemes.bin')
-
     def from_disk(self, path):
         """Loads state from a directory. Modifies the object in place and
         returns it.
@@ -346,7 +353,6 @@ cdef class Vocab:
                 lex_data.data[j] = bytes_ptr[i+j]
             Lexeme.c_from_bytes(lexeme, lex_data)

-            lexeme.vector = EMPTY_VEC
             py_str = self.strings[lexeme.orth]
             assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
             key = hash_string(py_str)
@@ -354,172 +360,6 @@ cdef class Vocab:
             self._by_orth.set(lexeme.orth, lexeme)
             self.length += 1
-
-    # Deprecated --- delete these once stable
-
-    def dump_vectors(self, out_loc):
-        """Save the word vectors to a binary file.
-
-        loc (Path): The path to save to.
-        """
-        cdef int32_t vec_len = self.vectors_length
-        cdef int32_t word_len
-        cdef bytes word_str
-        cdef char* chars
-
-        cdef Lexeme lexeme
-        cdef CFile out_file = CFile(out_loc, 'wb')
-        for lexeme in self:
-            word_str = lexeme.orth_.encode('utf8')
-            vec = lexeme.c.vector
-            word_len = len(word_str)
-
-            out_file.write_from(&word_len, 1, sizeof(word_len))
-            out_file.write_from(&vec_len, 1, sizeof(vec_len))
-
-            chars = <char*>word_str
-            out_file.write_from(chars, word_len, sizeof(char))
-            out_file.write_from(vec, vec_len, sizeof(float))
-        out_file.close()
-
-
-
-    def load_vectors(self, file_):
-        """Load vectors from a text-based file.
-
-        file_ (buffer): The file to read from. Entries should be separated by
-            newlines, and each entry should be whitespace delimited. The first value of the entry
-            should be the word string, and subsequent entries should be the values of the
-            vector.
-
-        RETURNS (int): The length of the vectors loaded.
-        """
-        cdef LexemeC* lexeme
-        cdef attr_t orth
-        cdef int32_t vec_len = -1
-        cdef double norm = 0.0
-
-        whitespace_pattern = re.compile(r'\s', re.UNICODE)
-
-        for line_num, line in enumerate(file_):
-            pieces = line.split()
-            word_str = " " if whitespace_pattern.match(line) else pieces.pop(0)
-            if vec_len == -1:
-                vec_len = len(pieces)
-            elif vec_len != len(pieces):
-                raise VectorReadError.mismatched_sizes(file_, line_num,
-                                                       vec_len, len(pieces))
-            orth = self.strings[word_str]
-            lexeme = <LexemeC*><void*>self.get_by_orth(self.mem, orth)
-            lexeme.vector = <float*>self.mem.alloc(vec_len, sizeof(float))
-            for i, val_str in enumerate(pieces):
-                lexeme.vector[i] = float(val_str)
-            norm = 0.0
-            for i in range(vec_len):
-                norm += lexeme.vector[i] * lexeme.vector[i]
-            lexeme.l2_norm = sqrt(norm)
-        self.vectors_length = vec_len
-        return vec_len
-
-    def load_vectors_from_bin_loc(self, loc):
-        """Load vectors from the location of a binary file.
-
-        loc (unicode): The path of the binary file to load from.
-
-        RETURNS (int): The length of the vectors loaded.
-        """
-        cdef CFile file_ = CFile(loc, b'rb')
-        cdef int32_t word_len
-        cdef int32_t vec_len = 0
-        cdef int32_t prev_vec_len = 0
-        cdef float* vec
-        cdef Address mem
-        cdef attr_t string_id
-        cdef bytes py_word
-        cdef vector[float*] vectors
-        cdef int line_num = 0
-        cdef Pool tmp_mem = Pool()
-        while True:
-            try:
-                file_.read_into(&word_len, sizeof(word_len), 1)
-            except IOError:
-                break
-            file_.read_into(&vec_len, sizeof(vec_len), 1)
-            if prev_vec_len != 0 and vec_len != prev_vec_len:
-                raise VectorReadError.mismatched_sizes(loc, line_num,
-                                                       vec_len, prev_vec_len)
-            if 0 >= vec_len >= MAX_VEC_SIZE:
-                raise VectorReadError.bad_size(loc, vec_len)
-
-            chars = <char*>file_.alloc_read(tmp_mem, word_len, sizeof(char))
-            vec = <float*>file_.alloc_read(self.mem, vec_len, sizeof(float))
-
-            string_id = self.strings[chars[:word_len]]
-            # Insert words into vocab to add vector.
-            self.get_by_orth(self.mem, string_id)
-            while string_id >= vectors.size():
-                vectors.push_back(EMPTY_VEC)
-            assert vec != NULL
-            vectors[string_id] = vec
-            line_num += 1
-        cdef LexemeC* lex
-        cdef size_t lex_addr
-        cdef double norm = 0.0
-        cdef int i
-        for orth, lex_addr in self._by_orth.items():
-            lex = <LexemeC*>lex_addr
-            if lex.lower < vectors.size():
-                lex.vector = vectors[lex.lower]
-                norm = 0.0
-                for i in range(vec_len):
-                    norm += lex.vector[i] * lex.vector[i]
-                lex.l2_norm = sqrt(norm)
-            else:
-                lex.vector = EMPTY_VEC
-        self.vectors_length = vec_len
-        return vec_len
-
-
-    def resize_vectors(self, int new_size):
-        """Set vectors_length to a new size, and allocate more memory for the
-        `Lexeme` vectors if necessary. The memory will be zeroed.
-
-        new_size (int): The new size of the vectors.
-        """
-        cdef hash_t key
-        cdef size_t addr
-        if new_size > self.vectors_length:
-            for key, addr in self._by_hash.items():
-                lex = <LexemeC*>addr
-                lex.vector = <float*>self.mem.realloc(lex.vector,
-                                        new_size * sizeof(lex.vector[0]))
-        self.vectors_length = new_size
-
-
-def write_binary_vectors(in_loc, out_loc):
-    cdef CFile out_file = CFile(out_loc, 'wb')
-    cdef Address mem
-    cdef int32_t word_len
-    cdef int32_t vec_len
-    cdef char* chars
-    with bz2.BZ2File(in_loc, 'r') as file_:
-        for line in file_:
-            pieces = line.split()
-            word = pieces.pop(0)
-            mem = Address(len(pieces), sizeof(float))
-            vec = <float*>mem.ptr
-            for i, val_str in enumerate(pieces):
-                vec[i] = float(val_str)
-
-            word_len = len(word)
-            vec_len = len(pieces)
-
-            out_file.write_from(&word_len, 1, sizeof(word_len))
-            out_file.write_from(&vec_len, 1, sizeof(vec_len))
-
-            chars = <char*>word
-            out_file.write_from(chars, len(word), sizeof(char))
-            out_file.write_from(vec, vec_len, sizeof(float))


 def pickle_vocab(vocab):
     sstore = vocab.strings
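All of the helpers deleted in this hunk revolve around one binary record layout: dump_vectors() and the module-level write_binary_vectors() emit, per word, an int32 word length, an int32 vector length, the UTF-8 bytes of the word, and then the float32 vector values, and load_vectors_from_bin_loc() reads the same sequence back through CFile. Below is a pure-Python reader for that layout, as a hedged sketch: the function name and the example path are not from the diff, and the byte order is whatever the machine that wrote the file used.

    import struct

    def read_binary_vectors(loc):
        # Record layout used by the removed dump_vectors()/write_binary_vectors():
        #   [int32 word_len][int32 vec_len][word_len utf8 bytes][vec_len float32s]
        # written with raw fwrite, i.e. native byte order ('=' in struct).
        vectors = {}
        with open(loc, 'rb') as file_:
            while True:
                header = file_.read(8)          # two native int32s
                if len(header) < 8:
                    break                       # clean end of file
                word_len, vec_len = struct.unpack('=ii', header)
                word = file_.read(word_len).decode('utf8')
                values = struct.unpack('=%df' % vec_len, file_.read(4 * vec_len))
                vectors[word] = values
        return vectors

    # Usage (path is hypothetical):
    # vecs = read_binary_vectors('vocab/vec.bin')

Per-lexeme vector storage is exactly what this commit starts to dismantle; the new get_vector()/has_vector() stubs point toward a separate vocab.vectors table instead.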
@@ -567,21 +407,3 @@ class LookupError(Exception):
             "ID of orth: {orth_id}".format(
                 query=repr(original_string), orth_str=repr(id_string), orth_id=id_)
         )
-
-
-class VectorReadError(Exception):
-    @classmethod
-    def mismatched_sizes(cls, loc, line_num, prev_size, curr_size):
-        return cls(
-            "Error reading word vectors from %s on line %d.\n"
-            "All vectors must be the same size.\n"
-            "Prev size: %d\n"
-            "Curr size: %d" % (loc, line_num, prev_size, curr_size))
-
-    @classmethod
-    def bad_size(cls, loc, size):
-        return cls(
-            "Error reading word vectors from %s.\n"
-            "Vector size: %d\n"
-            "Max size: %d\n"
-            "Min size: 1\n" % (loc, size, MAX_VEC_SIZE))