spaCy/website/docs/_api-vocab.jade

//- ----------------------------------
//- 💫 DOCS > API > VOCAB
//- ----------------------------------

+section("vocab")
    +h(2, "vocab", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/vocab.pyx")
        | #[+tag class] Vocab

    p
        | A look-up table that allows you to access #[code.lang-python Lexeme]
        | objects. The #[code.lang-python Vocab] instance also provides access to
        | the #[code.lang-python StringStore], and owns underlying C-data that
        | is shared between #[code.lang-python Doc] objects.

        +aside('Caveat').
            You should avoid working with #[code Doc], #[code Token] or #[code Span]
            objects backed by multiple different #[code Vocab] instances, as
            they may assume inconsistent string-to-integer encodings. All #[code Doc]
            objects produced by the same #[code Language] instance will hold
            a reference to the same #[code Vocab] instance.

    +code("python", "Overview").
        class Vocab:
            StringStore strings
            Morphology morphology
            dict get_lex_attr
            int vectors_length

            def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
                return self

            @classmethod
            def load(cls, data_dir, get_lex_attr):
                return Vocab()

            @classmethod
            def from_package(cls, package, get_lex_attr=None, vectors_package=None):
                return Vocab()

            property serializer:
                return Packer()

            def __len__(self):
                return int

            def __contains__(self, string):
                return bool

            def __getitem__(self, id_or_string):
                return Lexeme()

            def dump(self, loc):
                return None

            def load_lexemes(self, loc):
                return None

            def dump_vectors(self, out_loc):
                return None

            def load_vectors(self, file_):
                return int

            def load_vectors_from_bin_loc(self, loc):
                return int

    +table(["Example", "Description"])
        +row
            +cell #[code.lang-python lexeme = vocab[integer_id]]
            +cell.
                Get a lexeme by its orth ID.

        +row
            +cell #[code.lang-python lexeme = vocab[string]]
            +cell.
                Get a lexeme by the string corresponding to its orth ID.

        +row
            +cell #[code.lang-python for lexeme in vocab]
            +cell.
                Iterate over #[code Lexeme] objects.
        +row
            +cell #[code.lang-python int_id = vocab.strings[u'dog']]
            +cell.
                Access the #[code StringStore] via #[code vocab.strings].
        +row
            +cell #[code.lang-python nlp.vocab is nlp.tokenizer.vocab]
            +cell.
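                The pipeline and its tokenizer share the same #[code Vocab]
                instance, which is also referenced by every
                #[code.lang-python Doc] they produce.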

    +section("vocab-dump")
        +h(3, "vocab-dump")
            | #[+tag method] Vocab.dump

        +code("python", "Definition").
            def dump(self, loc):
                return None

        +table(["Name", "Type", "Description"])
            +row
                +cell loc
                +cell #[+a(link_unicode) unicode]
                +cell Path where the vocabulary should be saved.

    +section("vocab-load_lexemes")
        +h(3, "vocab-load_lexemes")
            | #[+tag method] Vocab.load_lexemes

        +code("python", "Definition").
            def load_lexemes(self, loc):
                return None

        +table(["Name", "Type", "Description"])
            +row
                +cell loc
                +cell #[+a(link_unicode) unicode]
                +cell Path to load the lexemes.bin file from.

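    +section("vocab-dump_vectors")
        +h(3, "vocab-dump_vectors")
            | #[+tag method] Vocab.dump_vectors

        +code("python", "Definition").
            def dump_vectors(self, out_loc):
                return None

        +table(["Name", "Type", "Description"])
            +row
                +cell out_loc
                +cell #[+a(link_unicode) unicode]
                +cell Path where the word vectors should be saved.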

    +section("vocab-loadvectors")
        +h(3, "vocab-loadvectors")
            | #[+tag method] Vocab.load_vectors

        +code("python", "Definition").
            def load_vectors(self, file_):
                return int

        +table(["Name", "Type", "Description"])
            +row
                +cell file_
                +cell file
                +cell A file-like object, to load word vectors from.

    +section("vocab-loadvectorsfrombinloc")
        +h(3, "vocab-loadvectorsfrombinloc")
            | #[+tag method] Vocab.load_vectors_from_bin_loc

        +code("python", "Definition").
            def load_vectors_from_bin_loc(self, loc):
                return int

        +table(["Name", "Type", "Description"])
            +row
                +cell loc
                +cell #[+a(link_unicode) unicode]
                +cell.
                    A path to a file, in spaCy's binary word-vectors file format.