spaCy/website/api/vocab.jade

//- 💫 DOCS > API > VOCAB

include ../_includes/_mixins

p
    |  The #[code Vocab] object provides a lookup table that allows you to
    |  access #[+api("lexeme") #[code Lexeme]] objects, as well as the
    |  #[+api("stringstore") #[code StringStore]]. It also owns underlying
    |  C-data that is shared between #[code Doc] objects.

+h(2, "init") Vocab.__init__
    +tag method

p Create the vocabulary.

+aside-code("Example").
    from spacy.vocab import Vocab
    vocab = Vocab(strings=[u'hello', u'world'])

+table(["Name", "Type", "Description"])
    +row
        +cell #[code lex_attr_getters]
        +cell dict
        +cell
            |  A dictionary mapping attribute IDs to functions to compute them.
            |  Defaults to #[code None].

    +row
        +cell #[code tag_map]
        +cell dict
        +cell
            |  A dictionary mapping fine-grained tags to coarse-grained
            |  parts-of-speech, and optionally morphological attributes.

    +row
        +cell #[code lemmatizer]
        +cell object
        +cell A lemmatizer. Defaults to #[code None].

    +row
        +cell #[code strings]
        +cell #[code StringStore] or list
        +cell
            |  A #[+api("stringstore") #[code StringStore]] that maps
            |  strings to hash values, and vice versa, or a list of strings.

    +row("foot")
        +cell returns
        +cell #[code Vocab]
        +cell The newly constructed object.

+h(2, "len") Vocab.__len__
    +tag method

p Get the current number of lexemes in the vocabulary.

+aside-code("Example").
    doc = nlp(u'This is a sentence.')
    assert len(nlp.vocab) > 0

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell int
        +cell The number of lexemes in the vocabulary.

+h(2, "getitem") Vocab.__getitem__
    +tag method

p
    |  Retrieve a lexeme, given an int ID or a unicode string. If a previously
    |  unseen unicode string is given, a new lexeme is created and stored.

+aside-code("Example").
    apple = nlp.vocab.strings['apple']
    assert nlp.vocab[apple] == nlp.vocab[u'apple']

+table(["Name", "Type", "Description"])
    +row
        +cell #[code id_or_string]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +row("foot")
        +cell returns
        +cell #[code Lexeme]
        +cell The lexeme indicated by the given ID.

+h(2, "iter") Vocab.__iter__
    +tag method

p Iterate over the lexemes in the vocabulary.

+aside-code("Example").
    stop_words = (lex for lex in nlp.vocab if lex.is_stop)

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell yields
        +cell #[code Lexeme]
        +cell An entry in the vocabulary.

+h(2, "contains") Vocab.__contains__
    +tag method

p
    |  Check whether the string has an entry in the vocabulary. To get the ID
    |  for a given string, you need to look it up in
    |  #[+api("vocab#attributes") #[code vocab.strings]].

+aside-code("Example").
    apple = nlp.vocab.strings['apple']
    oov = nlp.vocab.strings['dskfodkfos']
    assert apple in nlp.vocab
    assert oov not in nlp.vocab

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The ID string.

    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the string has an entry in the vocabulary.

+h(2, "add_flag") Vocab.add_flag
    +tag method

p
    |  Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
    |  function will be called over the words currently in the vocab, and then
    |  applied to new words as they occur. You'll then be able to access the flag
    |  value on each token, using #[code token.check_flag(flag_id)].

+aside-code("Example").
    def is_my_product(text):
        products = [u'spaCy', u'Thinc', u'displaCy']
        return text in products

    MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
    doc = nlp(u'I like spaCy')
    assert doc[2].check_flag(MY_PRODUCT) == True

+table(["Name", "Type", "Description"])
    +row
        +cell #[code flag_getter]
        +cell callable
        +cell A function #[code f(unicode) -> bool], to get the flag value.

    +row
        +cell #[code flag_id]
        +cell int
        +cell
            |  An integer between 1 and 63 (inclusive), specifying the bit at
            |  which the flag will be stored. If #[code -1], the lowest
            |  available bit will be chosen.

    +row("foot")
        +cell returns
        +cell int
        +cell The integer ID by which the flag value can be checked.

+h(2, "reset_vectors") Vocab.reset_vectors
    +tag method
    +tag-new(2)

p
    |  Drop the current vector table. Because all vectors must be the same
    |  width, you have to call this to change the size of the vectors. Only
    |  one of the #[code width] and #[code shape] keyword arguments can be
    |  specified.

+aside-code("Example").
    nlp.vocab.reset_vectors(width=300)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code width]
        +cell int
        +cell The new width (keyword argument only).

    +row
        +cell #[code shape]
        +cell int
        +cell The new shape (keyword argument only).

+h(2, "prune_vectors") Vocab.prune_vectors
    +tag method
    +tag-new(2)

p
    |  Reduce the current vector table to #[code nr_row] unique entries. Words
    |  mapped to the discarded vectors will be remapped to the closest vector
    |  among those remaining. For example, suppose the original table had
    |  vectors for the words:
    |  #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
    |  vector table to two rows, we would discard the vectors for "feline"
    |  and "reclined". These words would then be remapped to the closest
    |  remaining vector – so "feline" would have the same vector as "cat",
    |  and "reclined" would have the same vector as "sat". The similarities are
    |  judged by cosine. The original vectors may be large, so the cosines are
    |  calculated in minibatches, to reduce memory usage.

+aside-code("Example").
    nlp.vocab.prune_vectors(10000)
    assert len(nlp.vocab.vectors) &lt;= 10000

+table(["Name", "Type", "Description"])
    +row
        +cell #[code nr_row]
        +cell int
        +cell The number of rows to keep in the vector table.

    +row
        +cell #[code batch_size]
        +cell int
        +cell
            |  Batch of vectors for calculating the similarities. Larger batch
            |  sizes might be faster, while temporarily requiring more memory.

    +row("foot")
        +cell returns
        +cell dict
        +cell
            |  A dictionary keyed by removed words mapped to
            |  #[code (string, score)] tuples, where #[code string] is the entry
            |  the removed word was mapped to, and #[code score] the similarity
            |  score between the two words.

+h(2, "get_vector") Vocab.get_vector
    +tag method
    +tag-new(2)

p
    |  Retrieve a vector for a word in the vocabulary. Words can be looked up
    |  by string or hash value. If no vectors data is loaded, a
    |  #[code ValueError] is raised.

+aside-code("Example").
    nlp.vocab.get_vector(u'apple')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code orth]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +row("foot")
        +cell returns
        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
        +cell
            |  A word vector. Size and shape are determined by the
            |  #[code Vocab.vectors] instance.

+h(2, "set_vector") Vocab.set_vector
    +tag method
    +tag-new(2)

p
    |  Set a vector for a word in the vocabulary. Words can be referenced by
    |  string or hash value.

+aside-code("Example").
    nlp.vocab.set_vector(u'apple', array([...]))

+table(["Name", "Type", "Description"])
    +row
        +cell #[code orth]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +row
        +cell #[code vector]
        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
        +cell The vector to set.

+h(2, "has_vector") Vocab.has_vector
    +tag method
    +tag-new(2)

p
    |  Check whether a word has a vector. Returns #[code False] if no vectors
    |  are loaded. Words can be looked up by string or hash value.

+aside-code("Example").
    if nlp.vocab.has_vector(u'apple'):
        vector = nlp.vocab.get_vector(u'apple')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code orth]
        +cell int / unicode
        +cell The hash value of a word, or its unicode string.

    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the word has a vector.

+h(2, "to_disk") Vocab.to_disk
    +tag method
    +tag-new(2)

p Save the current state to a directory.

+aside-code("Example").
    nlp.vocab.to_disk('/path/to/vocab')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            |  A path to a directory, which will be created if it doesn't exist.
            |  Paths may be either strings or #[code Path]-like objects.

+h(2, "from_disk") Vocab.from_disk
    +tag method
    +tag-new(2)

p Load state from a directory. Modifies the object in place and returns it.

+aside-code("Example").
    from spacy.vocab import Vocab
    vocab = Vocab().from_disk('/path/to/vocab')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            |  A path to a directory. Paths may be either strings or
            |  #[code Path]-like objects.

    +row("foot")
        +cell returns
        +cell #[code Vocab]
        +cell The modified #[code Vocab] object.

+h(2, "to_bytes") Vocab.to_bytes
    +tag method

p Serialize the current state to a binary string.

+aside-code("Example").
    vocab_bytes = nlp.vocab.to_bytes()

+table(["Name", "Type", "Description"])
    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being serialized.

    +row("foot")
        +cell returns
        +cell bytes
        +cell The serialized form of the #[code Vocab] object.

+h(2, "from_bytes") Vocab.from_bytes
    +tag method

p Load state from a binary string.

+aside-code("Example").
    from spacy.vocab import Vocab
    vocab_bytes = nlp.vocab.to_bytes()
    vocab = Vocab()
    vocab.from_bytes(vocab_bytes)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code bytes_data]
        +cell bytes
        +cell The data to load from.

    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being loaded.

    +row("foot")
        +cell returns
        +cell #[code Vocab]
        +cell The #[code Vocab] object.

+h(2, "attributes") Attributes

+aside-code("Example").
    apple_id = nlp.vocab.strings['apple']
    assert type(apple_id) == int
    PERSON = nlp.vocab.strings['PERSON']
    assert type(PERSON) == int

+table(["Name", "Type", "Description"])
    +row
        +cell #[code strings]
        +cell #[code StringStore]
        +cell A table managing the string-to-int mapping.

    +row
        +cell #[code vectors]
            +tag-new(2)
        +cell #[code Vectors]
        +cell A table associating word IDs to word vectors.

    +row
        +cell #[code vectors_length]
        +cell int
        +cell Number of dimensions for each word vector.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								//- 💫 DOCS > API > VOCAB
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								include ../_includes/_mixins
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
 								p
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    |  The #[code Vocab] object provides a lookup table that allows you to
 								    |  access #[+api("lexeme") #[code Lexeme]] objects, as well as the
 								    |  #[+api("stringstore") #[code StringStore]]. It also owns underlying
 								    |  C-data that is shared between #[code Doc] objects.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+h(2, "init") Vocab.__init__
 								    +tag method
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								p Create the vocabulary.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								+aside-code("Example").
 								    from spacy.vocab import Vocab
 								    vocab = Vocab(strings=[u'hello', u'world'])
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code lex_attr_getters]
 								        +cell dict
 								        +cell
 								            |  A dictionary mapping attribute IDs to functions to compute them.
 								            |  Defaults to #[code None].
 								    +row
 								        +cell #[code tag_map]
 								        +cell dict
 								        +cell
 								            |  A dictionary mapping fine-grained tags to coarse-grained
 								            |  parts-of-speech, and optionally morphological attributes.
 								    +row
 								        +cell #[code lemmatizer]
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								        +cell object
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								        +cell A lemmatizer. Defaults to #[code None].
 								    +row
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								        +cell #[code strings]
-												Fix typos, long integers and tests

											
										
										
											2017-05-28 23:06:49 +00:00
+								        +cell #[code StringStore] or list
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								        +cell
-												Fix typos, long integers and tests

											
										
										
											2017-05-28 23:06:49 +00:00
+								            |  A #[+api("stringstore") #[code StringStore]] that maps
 								            |  strings to hash values, and vice versa, or a list of strings.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell returns
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								        +cell #[code Vocab]
 								        +cell The newly constructed object.
 								+h(2, "len") Vocab.__len__
 								    +tag method
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								p Get the current number of lexemes in the vocabulary.
 								+aside-code("Example").
 								    doc = nlp(u'This is a sentence.')
 								    assert len(nlp.vocab) > 0
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
 								+table(["Name", "Type", "Description"])
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell returns
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								        +cell int
 								        +cell The number of lexems in the vocabulary.
 								+h(2, "getitem") Vocab.__getitem__
 								    +tag method
 								p
 								    |  Retrieve a lexeme, given an int ID or a unicode string. If a previously
 								    |  unseen unicode string is given, a new lexeme is created and stored.
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+aside-code("Example").
 								    apple = nlp.vocab.strings['apple']
 								    assert nlp.vocab[apple] == nlp.vocab[u'apple']
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code id_or_string]
 								        +cell int / unicode
-												Update docs and change integer IDs to hash values

											
										
										
											2017-05-28 17:25:34 +00:00
+								        +cell The hash value of a word, or its unicode string.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell returns
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								        +cell #[code Lexeme]
 								        +cell The lexeme indicated by the given ID.
-												Fix typo

											
										
										
											2017-05-20 11:00:13 +00:00
+								+h(2, "iter") Vocab.__iter__
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								    +tag method
 								p Iterate over the lexemes in the vocabulary.
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+aside-code("Example").
 								    stop_words = (lex for lex in nlp.vocab if lex.is_stop)
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								+table(["Name", "Type", "Description"])
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell yields
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								        +cell #[code Lexeme]
 								        +cell An entry in the vocabulary.
 								+h(2, "contains") Vocab.__contains__
 								    +tag method
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								p
 								    |  Check whether the string has an entry in the vocabulary. To get the ID
 								    |  for a given string, you need to look it up in
 								    |  #[+api("vocab#attributes") #[code vocab.strings]].
 								+aside-code("Example").
 								    apple = nlp.vocab.strings['apple']
 								    oov = nlp.vocab.strings['dskfodkfos']
 								    assert apple in nlp.vocab
 								    assert oov not in nlp.vocab
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
 								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code string]
 								        +cell unicode
 								        +cell The ID string.
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell returns
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								        +cell bool
 								        +cell Whether the string has an entry in the vocabulary.
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+h(2, "add_flag") Vocab.add_flag
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								    +tag method
 								p
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								    |  Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
 								    |  function will be called over the words currently in the vocab, and then
 								    |  applied to new words as they occur. You'll then be able to access the flag
 								    |  value on each token, using #[code token.check_flag(flag_id)].
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+aside-code("Example").
 								    def is_my_product(text):
 								        products = [u'spaCy', u'Thinc', u'displaCy']
 								        return text in products
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								    MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
 								    doc = nlp(u'I like spaCy')
 								    assert doc[2].check_flag(MY_PRODUCT) == True
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
 								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code flag_getter]
 								        +cell dict
 								        +cell A function #[code f(unicode) -> bool], to get the flag value.
 								    +row
 								        +cell #[code flag_id]
 								        +cell int
 								        +cell
 								            |  An integer between 1 and 63 (inclusive), specifying the bit at
 								            |  which the flag will be stored. If #[code -1], the lowest
 								            |  available bit will be chosen.
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell returns
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								        +cell int
 								        +cell The integer ID by which the flag value can be checked.
-												Change clear_vectors to reset_vectors (resolves #1516)

											
										
										
											2017-11-08 17:11:23 +00:00
+								+h(2, "reset_vectors") Vocab.reset_vectors
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +tag method
 								    +tag-new(2)
 								p
 								    |  Drop the current vector table. Because all vectors must be the same
-												Change clear_vectors to reset_vectors (resolves #1516)

											
										
										
											2017-11-08 17:11:23 +00:00
+								    |  width, you have to call this to change the size of the vectors. Only
 								    |  one of the #[code width] and #[code shape] keyword arguments can be
 								    |  specified.
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
 								+aside-code("Example").
-												Change clear_vectors to reset_vectors (resolves #1516)

											
										
										
											2017-11-08 17:11:23 +00:00
+								    nlp.vocab.reset_vectors(width=300)
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
 								+table(["Name", "Type", "Description"])
 								    +row
-												Change clear_vectors to reset_vectors (resolves #1516)

											
										
										
											2017-11-08 17:11:23 +00:00
+								        +cell #[code width]
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								        +cell int
-												Change clear_vectors to reset_vectors (resolves #1516)

											
										
										
											2017-11-08 17:11:23 +00:00
+								        +cell The new width (keyword argument only).
 								    +row
 								        +cell #[code shape]
 								        +cell int
 								        +cell The new shape (keyword argument only).
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
-												Update vocab docs and document Vocab.prune_vectors

											
										
										
											2017-10-30 18:35:41 +00:00
+								+h(2, "prune_vectors") Vocab.prune_vectors
 								    +tag method
 								    +tag-new(2)
 								p
 								    |  Reduce the current vector table to #[code nr_row] unique entries. Words
 								    |  mapped to the discarded vectors will be remapped to the closest vector
 								    |  among those remaining. For example, suppose the original table had
 								    |  vectors for the words:
 								    |  #[code.u-break ['sat', 'cat', 'feline', 'reclined']]. If we prune the
 								    |  vector table to, two rows, we would discard the vectors for "feline"
 								    |  and "reclined". These words would then be remapped to the closest
 								    |  remaining vector – so "feline" would have the same vector as "cat",
 								    |  and "reclined" would have the same vector as "sat". The similarities are
 								    |  judged by cosine. The original vectors may be large, so the cosines are
 								    |  calculated in minibatches, to reduce memory usage.
 								+aside-code("Example").
 								    nlp.vocab.prune_vectors(10000)
 								    assert len(nlp.vocab.vectors) &lt;= 1000
 								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code nr_row]
 								        +cell int
 								        +cell The number of rows to keep in the vector table.
 								    +row
 								        +cell #[code batch_size]
 								        +cell int
 								        +cell
 								            |  Batch of vectors for calculating the similarities. Larger batch
 								            |  sizes might be faster, while temporarily requiring more memory.
 								    +row("foot")
 								        +cell returns
 								        +cell dict
 								        +cell
 								            |  A dictionary keyed by removed words mapped to
 								            |  #[code (string, score)] tuples, where #[code string] is the entry
 								            |  the removed word was mapped to, and #[code score] the similarity
 								            |  score between the two words.
 								+h(2, "get_vector") Vocab.get_vector
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +tag method
 								    +tag-new(2)
 								p
 								    |  Retrieve a vector for a word in the vocabulary. Words can be looked up
 								    |  by string or hash value. If no vectors data is loaded, a
 								    |  #[code ValueError] is raised.
 								+aside-code("Example").
 								    nlp.vocab.get_vector(u'apple')
 								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code orth]
 								        +cell int / unicode
 								        +cell The hash value of a word, or its unicode string.
 								    +row("foot")
 								        +cell returns
 								        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
 								        +cell
 								            |  A word vector. Size and shape are determined by the
 								            |  #[code Vocab.vectors] instance.
-												Update vocab docs and document Vocab.prune_vectors

											
										
										
											2017-10-30 18:35:41 +00:00
+								+h(2, "set_vector") Vocab.set_vector
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +tag method
 								    +tag-new(2)
 								p
 								    |  Set a vector for a word in the vocabulary. Words can be referenced by
 								    |  by string or hash value.
 								+aside-code("Example").
 								    nlp.vocab.set_vector(u'apple', array([...]))
 								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code orth]
 								        +cell int / unicode
 								        +cell The hash value of a word, or its unicode string.
 								    +row
 								        +cell #[code vector]
 								        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
 								        +cell The vector to set.
-												Update vocab docs and document Vocab.prune_vectors

											
										
										
											2017-10-30 18:35:41 +00:00
+								+h(2, "has_vector") Vocab.has_vector
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +tag method
 								    +tag-new(2)
 								p
 								    |  Check whether a word has a vector. Returns #[code False] if no vectors
 								    |  are loaded. Words can be looked up by string or hash value.
 								+aside-code("Example").
 								    if nlp.vocab.has_vector(u'apple'):
 								        vector = nlp.vocab.get_vector(u'apple')
 								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code orth]
 								        +cell int / unicode
 								        +cell The hash value of a word, or its unicode string.
 								    +row("foot")
 								        +cell returns
 								        +cell bool
 								        +cell Whether the word has a vector.
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+h(2, "to_disk") Vocab.to_disk
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								    +tag method
-												Add version tag mixin to label new features

											
										
										
											2017-05-26 10:42:36 +00:00
+								    +tag-new(2)
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								p Save the current state to a directory.
 								+aside-code("Example").
 								    nlp.vocab.to_disk('/path/to/vocab')
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
 								+table(["Name", "Type", "Description"])
 								    +row
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								        +cell #[code path]
 								        +cell unicode or #[code Path]
 								        +cell
 								            |  A path to a directory, which will be created if it doesn't exist.
 								            |  Paths may be either strings or #[code Path]-like objects.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+h(2, "from_disk") Vocab.from_disk
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								    +tag method
-												Add version tag mixin to label new features

											
										
										
											2017-05-26 10:42:36 +00:00
+								    +tag-new(2)
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								p Loads state from a directory. Modifies the object in place and returns it.
 								+aside-code("Example").
 								    from spacy.vocab import Vocab
 								    vocab = Vocab().from_disk('/path/to/vocab')
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
 								+table(["Name", "Type", "Description"])
 								    +row
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								        +cell #[code path]
 								        +cell unicode or #[code Path]
 								        +cell
 								            |  A path to a directory. Paths may be either strings or
 								            |  #[code Path]-like objects.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell returns
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								        +cell #[code Vocab]
 								        +cell The modified #[code Vocab] object.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+h(2, "to_bytes") Vocab.to_bytes
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								    +tag method
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								p Serialize the current state to a binary string.
 								+aside-code("Example").
 								    vocab_bytes = nlp.vocab.to_bytes()
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
 								+table(["Name", "Type", "Description"])
 								    +row
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								        +cell #[code **exclude]
 								        +cell -
 								        +cell Named attributes to prevent from being serialized.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell returns
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								        +cell bytes
 								        +cell The serialized form of the #[code Vocab] object.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								+h(2, "from_bytes") Vocab.from_bytes
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
+								    +tag method
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								p Load state from a binary string.
 								+aside-code("Example").
 								    from spacy.vocab import Vocab
 								    vocab_bytes = nlp.vocab.to_bytes()
 								    vocab = Vocab()
 								    vocab.from_bytes(vocab_bytes)
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
 								+table(["Name", "Type", "Description"])
 								    +row
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
+								        +cell #[code bytes_data]
 								        +cell bytes
 								        +cell The data to load from.
 								    +row
 								        +cell #[code **exclude]
 								        +cell -
 								        +cell Named attributes to prevent from being loaded.
-												Update to new website

											
										
										
											2016-10-31 18:04:15 +00:00
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
+								    +row("foot")
-												Use returns/yields instead of return/yield

											
										
										
											2017-05-18 22:02:34 +00:00
+								        +cell returns
-												Fix typo and use consistent description for from_bytes

											
										
										
											2017-05-21 11:18:39 +00:00
+								        +cell #[code Vocab]
 								        +cell The #[code Vocab] object.
-												Update docstrings and API docs for Vocab

											
										
										
											2017-05-20 11:59:31 +00:00
 								+h(2, "attributes") Attributes
 								+aside-code("Example").
 								    apple_id = nlp.vocab.strings['apple']
 								    assert type(apple_id) == int
 								    PERSON = nlp.vocab.strings['PERSON']
 								    assert type(PERSON) == int
 								+table(["Name", "Type", "Description"])
 								    +row
 								        +cell #[code strings]
 								        +cell #[code StringStore]
 								        +cell A table managing the string-to-int mapping.
-												Update API documentation

											
										
										
											2017-10-03 12:27:22 +00:00
 								    +row
 								        +cell #[code vectors]
 								            +tag-new(2)
 								        +cell #[code Vectors]
 								        +cell A table associating word IDs to word vectors.
 								    +row
 								        +cell #[code vectors_length]
 								        +cell int
 								        +cell Number of dimensions for each word vector.