mirror of https://github.com/explosion/spaCy.git
158 lines
5.2 KiB
Plaintext
158 lines
5.2 KiB
Plaintext
//- Docs > API > Vocab
|
|
//- ============================================================================
|
|
|
|
+section('vocab')
|
|
+h2('vocab', 'https://github.com/' + profiles.github + '/spaCy/blob/master/spacy/vocab.pyx#L47')
|
|
| #[+label('tag') class] Vocab
|
|
|
|
p
|
|
| A look-up table that allows you to access #[code.lang-python Lexeme]
|
|
| objects. The #[code.lang-python Vocab] instance also provides access to
|
|
| the #[code.lang-python StringStore], and owns underlying C-data that
|
|
| is shared between #[code.lang-python Doc] objects.
|
|
|
|
+aside('Caveat').
|
|
You should avoid working with #[code Doc], #[code Token] or #[code Span]
|
|
objects backed by multiple different #[code Vocab] instances, as
|
|
they may assume inconsistent string-to-integer encodings. All #[code Doc]
|
|
objects produced by the same #[code Language] instance will hold
|
|
a reference to the same #[code Vocab] instance.
|
|
|
|
+code('python', 'Overview').
|
|
class Vocab:
|
|
StringStore strings
|
|
Morphology morphology
|
|
dict get_lex_attr
|
|
int vectors_length
|
|
|
|
def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
|
|
return self
|
|
|
|
@classmethod
|
|
def load(cls, data_dir, get_lex_attr):
|
|
return Vocab()
|
|
|
|
@classmethod
|
|
def from_package(cls, package, get_lex_attr=None, vectors_package=None):
|
|
return Vocab()
|
|
|
|
property serializer:
|
|
return Packer()
|
|
|
|
def __len__(self):
|
|
return int
|
|
|
|
def __contains__(self, string):
|
|
return bool
|
|
|
|
def __getitem__(self, id_or_string):
|
|
return Lexeme()
|
|
|
|
def dump(self, loc):
|
|
return None
|
|
|
|
def load_lexemes(self, loc):
|
|
return None
|
|
|
|
def dump_vectors(self, out_loc):
|
|
return None
|
|
|
|
def load_vectors(self, file_):
|
|
return int
|
|
|
|
def load_vectors_from_bin_loc(self, loc):
|
|
return int
|
|
|
|
+table(['Example', 'Description'], 'code')
|
|
+row
|
|
+cell #[code.lang-python lexeme = vocab[integer_id]]
|
|
+cell.
|
|
Get a lexeme by its orth ID.
|
|
|
|
+row
|
|
+cell #[code.lang-python lexeme = vocab[string]]
|
|
+cell.
|
|
Get a lexeme by the string corresponding to its orth ID.
|
|
|
|
+row
|
|
+cell #[code.lang-python for lexeme in vocab]
|
|
+cell.
|
|
Iterate over #[code Lexeme] objects.
|
|
+row
|
|
+cell #[code.lang-python int_id = vocab.strings[u'dog']]
|
|
+cell.
|
|
Access the #[code StringStore] via #[code vocab.strings].
|
|
+row
|
|
+cell #[code.lang-python nlp.vocab is nlp.tokenizer.vocab]
|
|
+cell.
|
|
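The same #[code.lang-python Vocab] instance is shared by #[code.lang-python nlp] and its tokenizer, and by every #[code.lang-python Doc] they produce.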
|
|
|
|
+section('vocab-dump')
|
|
+h3('vocab-dump')
|
|
| #[+label('tag') method] Vocab.dump
|
|
|
|
+code('python', 'definition').
|
|
def dump(self, loc):
|
|
return None
|
|
|
|
+table(['Name', 'Type', 'Description'], 'params')
|
|
+row
|
|
+cell loc
|
|
+cell #[a(href=link_unicode target='_blank') unicode]
|
|
+cell.
|
|
Path where the vocabulary should be saved.
|
|
|
|
+section('vocab-load_lexemes')
|
|
+h3('vocab-load_lexemes')
|
|
| #[+label('tag') method] Vocab.load_lexemes
|
|
|
|
+code('python', 'definition').
|
|
def load_lexemes(self, loc):
|
|
return None
|
|
|
|
+table(['Name', 'Type', 'Description'], 'params')
|
|
+row
|
|
+cell loc
|
|
+cell #[a(href=link_unicode target='_blank') unicode]
|
|
+cell.
|
|
Path to load the lexemes.bin file from.
|
|
|
|
+section('vocab-dump_vectors')
|
|
+h3('vocab-dump_vectors')
|
|
| #[+label('tag') method] Vocab.dump_vectors
|
|
|
|
+code('python', 'definition').
|
|
def dump_vectors(self, out_loc):
|
|
return None
|
|
|
|
+section('vocab-loadvectors')
|
|
+h3('vocab-loadvectors')
|
|
| #[+label('tag') method] Vocab.load_vectors
|
|
|
|
+code('python', 'definition').
|
|
def load_vectors(self, file_):
|
|
return int
|
|
|
|
+table(['Name', 'Type', 'Description'], 'params')
|
|
+row
|
|
+cell file_
|
|
+cell file
|
|
+cell.
|
|
A file-like object, to load word vectors from.
|
|
|
|
|
|
+section('vocab-loadvectorsfrombinloc')
|
|
+h3('vocab-loadvectorsfrombinloc')
|
|
| #[+label('tag') method] Vocab.load_vectors_from_bin_loc
|
|
|
|
+code('python', 'definition').
|
|
def load_vectors_from_bin_loc(self, loc):
|
|
return int
|
|
|
|
+table(['Name', 'Type', 'Description'], 'params')
|
|
+row
|
|
+cell loc
|
|
+cell #[a(href=link_unicode target='_blank') unicode]
|
|
+cell.
|
|
A path to a file, in spaCy's binary word-vectors file format.
|