mirror of https://github.com/explosion/spaCy.git
* Remove the vectors option to Vocab, preferring to either load vectors from disk, or set them on the Lexeme objects.
This commit is contained in:
parent
893542afae
commit
27f988b167
|
@ -137,21 +137,14 @@ class Language(object):
|
||||||
return path.join(path.dirname(__file__), 'data')
|
return path.join(path.dirname(__file__), 'data')
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_vectors(cls, data_dir):
|
def default_vocab(cls, data_dir=None, get_lex_attr=None):
|
||||||
return None
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None):
|
|
||||||
if data_dir is None:
|
if data_dir is None:
|
||||||
data_dir = cls.default_data_dir()
|
data_dir = cls.default_data_dir()
|
||||||
if vectors is None:
|
|
||||||
vectors = cls.default_vectors(data_dir)
|
|
||||||
if get_lex_attr is None:
|
if get_lex_attr is None:
|
||||||
get_lex_attr = cls.default_lex_attrs(data_dir)
|
get_lex_attr = cls.default_lex_attrs(data_dir)
|
||||||
return Vocab.from_dir(
|
return Vocab.from_dir(
|
||||||
path.join(data_dir, 'vocab'),
|
path.join(data_dir, 'vocab'),
|
||||||
get_lex_attr=get_lex_attr,
|
get_lex_attr=get_lex_attr)
|
||||||
vectors=vectors)
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def default_tokenizer(cls, vocab, data_dir):
|
def default_tokenizer(cls, vocab, data_dir):
|
||||||
|
@ -214,7 +207,7 @@ class Language(object):
|
||||||
self.entity = entity
|
self.entity = entity
|
||||||
self.matcher = matcher
|
self.matcher = matcher
|
||||||
|
|
||||||
def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False):
|
def __call__(self, text, tag=True, parse=True, entity=True):
|
||||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||||
and can contain arbitrary whitespace. Alignment into the original string
|
and can contain arbitrary whitespace. Alignment into the original string
|
||||||
is preserved.
|
is preserved.
|
||||||
|
|
|
@ -56,11 +56,10 @@ cdef class Vocab:
|
||||||
|
|
||||||
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
|
self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
|
||||||
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
|
if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
|
||||||
self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
|
self.vectors_length = self.load_vectors(path.join(data_dir, 'vec.bin'))
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __init__(self, get_lex_attr=None, tag_map=None, vectors=None, lemmatizer=None,
|
def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
|
||||||
serializer_freqs=None):
|
|
||||||
if tag_map is None:
|
if tag_map is None:
|
||||||
tag_map = {}
|
tag_map = {}
|
||||||
if lemmatizer is None:
|
if lemmatizer is None:
|
||||||
|
@ -262,7 +261,7 @@ cdef class Vocab:
|
||||||
i += 1
|
i += 1
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
def load_rep_vectors(self, loc):
|
def load_vectors(self, loc):
|
||||||
cdef CFile file_ = CFile(loc, b'rb')
|
cdef CFile file_ = CFile(loc, b'rb')
|
||||||
cdef int32_t word_len
|
cdef int32_t word_len
|
||||||
cdef int32_t vec_len
|
cdef int32_t vec_len
|
||||||
|
|
Loading…
Reference in New Issue