spaCy/website/docs/_api-vocab.jade

155 lines
4.9 KiB
Plaintext

//- ----------------------------------
//- 💫 DOCS > API > VOCAB
//- ----------------------------------
+section("vocab")
+h(2, "vocab", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/vocab.pyx")
| #[+tag class] Vocab
p
| A look-up table that allows you to access #[code.lang-python Lexeme]
| objects. The #[code.lang-python Vocab] instance also provides access to
| the #[code.lang-python StringStore], and owns underlying C-data that
| is shared between #[code.lang-python Doc] objects.
+aside('Caveat').
You should avoid working with #[code Doc], #[code Token] or #[code Span]
objects backed by multiple different #[code Vocab] instances, as
they may assume inconsistent string-to-integer encodings. All #[code Doc]
objects produced by the same #[code Language] instance will hold
a reference to the same #[code Vocab] instance.
+code("python", "Overview").
class Vocab:
StringStore strings
Morphology morphology
dict get_lex_attr
int vectors_length
def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
return self
@classmethod
def load(cls, data_dir, get_lex_attr):
return Vocab()
@classmethod
def from_package(cls, package, get_lex_attr=None, vectors_package=None):
return Vocab()
property serializer:
return Packer()
def __len__(self):
return int
def __contains__(self, string):
return bool
def __getitem__(self, id_or_string):
return Lexeme()
def dump(self, loc):
return None
def load_lexemes(self, loc):
return None
def dump_vectors(self, out_loc):
return None
def load_vectors(self, file_):
return int
def load_vectors_from_bin_loc(self, loc):
return int
+table(["Example", "Description"])
+row
+cell #[code.lang-python lexeme = vocab[integer_id]]
+cell.
Get a lexeme by its orth ID.
+row
+cell #[code.lang-python lexeme = vocab[string]]
+cell.
Get a lexeme by the string corresponding to its orth ID.
+row
+cell #[code.lang-python for lexeme in vocab]
+cell.
Iterate over #[code Lexeme] objects.
+row
+cell #[code.lang-python int_id = vocab.strings[u'dog']]
+cell.
Access the #[code StringStore] via #[code vocab.strings].
+row
+cell #[code.lang-python nlp.vocab is nlp.tokenizer.vocab]
+cell.
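The same #[code.lang-python Vocab] instance is shared by the pipeline and its tokenizer, and is also referenced by every #[code.lang-python Doc] they produce.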
+section("vocab-dump")
+h(3, "vocab-dump")
| #[+tag method] Vocab.dump
+code("python", "Definition").
def dump(self, loc):
return None
+table(["Name", "Type", "Description"])
+row
+cell loc
+cell #[+a(link_unicode) unicode]
+cell Path where the vocabulary should be saved.
+section("vocab-load_lexemes")
+h(3, "vocab-load_lexemes")
| #[+tag method] Vocab.load_lexemes
+code("python", "Definition").
def load_lexemes(self, loc):
return None
+table(["Name", "Type", "Description"])
+row
+cell loc
+cell #[+a(link_unicode) unicode]
+cell Path to load the lexemes.bin file from.
+section("vocab-dump_vectors")
+h(3, "vocab-dump_vectors")
| #[+tag method] Vocab.dump_vectors
+code("python", "Definition").
def dump_vectors(self, out_loc):
return None
+section("vocab-loadvectors")
+h(3, "vocab-loadvectors")
| #[+tag method] Vocab.load_vectors
+code("python", "Definition").
def load_vectors(self, file_):
return int
+table(["Name", "Type", "Description"])
+row
+cell file_
+cell #[+a(link_unicode) unicode]
+cell A file-like object, to load word vectors from.
+section("vocab-loadvectorsfrombinloc")
+h(3, "vocab-loadvectorsfrombinloc")
| #[+tag method] Vocab.load_vectors_from_bin_loc
+code("python", "Definition").
def load_vectors_from_bin_loc(self, loc):
return int
+table(["Name", "Type", "Description"])
+row
+cell loc
+cell #[+a(link_unicode) unicode]
+cell.
A path to a file, in spaCy's binary word-vectors file format.