mirror of https://github.com/explosion/spaCy.git
155 lines
4.9 KiB
Plaintext
155 lines
4.9 KiB
Plaintext
//- ----------------------------------
|
|
//- 💫 DOCS > API > VOCAB
|
|
//- ----------------------------------
|
|
|
|
+section("vocab")
|
|
+h(2, "vocab", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/vocab.pyx")
|
|
| #[+tag class] Vocab
|
|
|
|
p
|
|
| A look-up table that allows you to access #[code.lang-python Lexeme]
|
|
| objects. The #[code.lang-python Vocab] instance also provides access to
|
|
| the #[code.lang-python StringStore], and owns underlying C-data that
|
|
| is shared between #[code.lang-python Doc] objects.
|
|
|
|
+aside('Caveat').
|
|
You should avoid working with #[code Doc], #[code Token] or #[code Span]
|
|
objects backed by multiple different #[code Vocab] instances, as
|
|
they may assume inconsistent string-to-integer encodings. All #[code Doc]
|
|
objects produced by the same #[code Language] instance will hold
|
|
a reference to the same #[code Vocab] instance.
|
|
|
|
+code("python", "Overview").
|
|
class Vocab:
|
|
StringStore strings
|
|
Morphology morphology
|
|
dict get_lex_attr
|
|
int vectors_length
|
|
|
|
def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
|
|
return self
|
|
|
|
@classmethod
|
|
def load(cls, data_dir, get_lex_attr):
|
|
return Vocab()
|
|
|
|
@classmethod
|
|
def from_package(cls, package, get_lex_attr=None, vectors_package=None):
|
|
return Vocab()
|
|
|
|
property serializer:
|
|
return Packer()
|
|
|
|
def __len__(self):
|
|
return int
|
|
|
|
def __contains__(self, string):
|
|
return bool
|
|
|
|
def __getitem__(self, id_or_string):
|
|
return Lexeme()
|
|
|
|
def dump(self, loc):
|
|
return None
|
|
|
|
def load_lexemes(self, loc):
|
|
return None
|
|
|
|
def dump_vectors(self, out_loc):
|
|
return None
|
|
|
|
def load_vectors(self, file_):
|
|
return int
|
|
|
|
def load_vectors_from_bin_loc(self, loc):
|
|
return int
|
|
|
|
+table(["Example", "Description"])
|
|
+row
|
|
+cell #[code.lang-python lexeme = vocab[integer_id]]
|
|
+cell.
|
|
Get a lexeme by its orth ID.
|
|
|
|
+row
|
|
+cell #[code.lang-python lexeme = vocab[string]]
|
|
+cell.
|
|
Get a lexeme by the string corresponding to its orth ID.
|
|
|
|
+row
|
|
+cell #[code.lang-python for lexeme in vocab]
|
|
+cell.
|
|
Iterate over #[code Lexeme] objects.
|
|
+row
|
|
+cell #[code.lang-python int_id = vocab.strings[u'dog']]
|
|
+cell.
|
|
Access the #[code StringStore] via #[code vocab.strings].
|
|
+row
|
|
+cell #[code.lang-python nlp.vocab is nlp.tokenizer.vocab]
|
|
+cell.
|
|
The same #[code Vocab] instance is shared across the pipeline and is referenced by every #[code.lang-python Doc] it produces.
|
|
|
|
+section("vocab-dump")
|
|
+h(3, "vocab-dump")
|
|
| #[+tag method] Vocab.dump
|
|
|
|
+code("python", "Definition").
|
|
def dump(self, loc):
|
|
return None
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell loc
|
|
+cell #[+a(link_unicode) unicode]
|
|
+cell Path where the vocabulary should be saved.
|
|
|
|
+section("vocab-load_lexemes")
|
|
+h(3, "vocab-load_lexemes")
|
|
| #[+tag method] Vocab.load_lexemes
|
|
|
|
+code("python", "Definition").
|
|
def load_lexemes(self, loc):
|
|
return None
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell loc
|
|
+cell #[+a(link_unicode) unicode]
|
|
+cell Path to load the lexemes.bin file from.
|
|
|
|
+section("vocab-dump_vectors")
|
|
+h(3, "vocab-dump_vectors")
|
|
| #[+tag method] Vocab.dump_vectors
|
|
|
|
+code("python", "Definition").
|
|
def dump_vectors(self, out_loc):
|
|
return None
|
|
|
|
+section("vocab-loadvectors")
|
|
+h(3, "vocab-loadvectors")
|
|
| #[+tag method] Vocab.load_vectors
|
|
|
|
+code("python", "Definition").
|
|
def load_vectors(self, file_):
|
|
return int
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell file_
|
|
+cell #[+a(link_unicode) unicode]
|
|
+cell A file-like object, to load word vectors from.
|
|
|
|
+section("vocab-loadvectorsfrombinloc")
|
|
+h(3, "vocab-saveload-loadvectorsfrom")
|
|
| #[+tag method] Vocab.load_vectors_from_bin_loc
|
|
|
|
+code("python", "Definition").
|
|
def load_vectors_from_bin_loc(self, loc):
|
|
return int
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell loc
|
|
+cell #[+a(link_unicode) unicode]
|
|
+cell.
|
|
A path to a file, in spaCy's binary word-vectors file format.
|