spaCy/website/docs/api/vocab.jade

257 lines
6.7 KiB
Plaintext
Raw Normal View History

2016-10-31 18:04:15 +00:00
//- 💫 DOCS > API > VOCAB
include ../../_includes/_mixins
p
| A look-up table that allows you to access #[code Lexeme] objects. The
| #[code Vocab] instance also provides access to the #[code StringStore],
| and owns underlying C-data that is shared between #[code Doc] objects.
+h(2, "init") Vocab.__init__
+tag method
2016-10-31 18:04:15 +00:00
p Create the vocabulary.
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code lex_attr_getters]
+cell dict
+cell
| A dictionary mapping attribute IDs to functions to compute them.
| Defaults to #[code None].
+row
+cell #[code tag_map]
+cell dict
+cell
| A dictionary mapping fine-grained tags to coarse-grained
| parts-of-speech, and optionally morphological attributes.
+row
+cell #[code lemmatizer]
+cell object
2016-10-31 18:04:15 +00:00
+cell A lemmatizer. Defaults to #[code None].
+row
+cell #[code strings]
+cell #[code StringStore]
2016-10-31 18:04:15 +00:00
+cell
| A #[code StringStore] that maps strings to integers, and vice
| versa.
2016-10-31 18:04:15 +00:00
+footrow
+cell returns
2016-10-31 18:04:15 +00:00
+cell #[code Vocab]
+cell The newly constructed object.
+h(2, "len") Vocab.__len__
+tag method
p Get the current number of lexemes in the vocabulary.
+aside-code("Example").
doc = nlp(u'This is a sentence.')
assert len(nlp.vocab) > 0
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+footrow
+cell returns
2016-10-31 18:04:15 +00:00
+cell int
+cell The number of lexemes in the vocabulary.
+h(2, "getitem") Vocab.__getitem__
+tag method
p
| Retrieve a lexeme, given an int ID or a unicode string. If a previously
| unseen unicode string is given, a new lexeme is created and stored.
+aside-code("Example").
apple = nlp.vocab.strings['apple']
assert nlp.vocab[apple] == nlp.vocab[u'apple']
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code id_or_string]
+cell int / unicode
+cell The integer ID of a word, or its unicode string.
+footrow
+cell returns
2016-10-31 18:04:15 +00:00
+cell #[code Lexeme]
+cell The lexeme indicated by the given ID or string.
2017-05-20 11:00:13 +00:00
+h(2, "iter") Vocab.__iter__
2016-10-31 18:04:15 +00:00
+tag method
p Iterate over the lexemes in the vocabulary.
+aside-code("Example").
stop_words = (lex for lex in nlp.vocab if lex.is_stop)
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+footrow
+cell yields
2016-10-31 18:04:15 +00:00
+cell #[code Lexeme]
+cell An entry in the vocabulary.
+h(2, "contains") Vocab.__contains__
+tag method
p
| Check whether the string has an entry in the vocabulary. To get the ID
| for a given string, you need to look it up in
| #[+api("vocab#attributes") #[code vocab.strings]].
+aside-code("Example").
apple = nlp.vocab.strings['apple']
oov = nlp.vocab.strings['dskfodkfos']
assert apple in nlp.vocab
assert oov not in nlp.vocab
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The ID string.
+footrow
+cell returns
2016-10-31 18:04:15 +00:00
+cell bool
+cell Whether the string has an entry in the vocabulary.
+h(2, "add_flag") Vocab.add_flag
2016-10-31 18:04:15 +00:00
+tag method
p
| Set a new boolean flag to words in the vocabulary. The #[code flag_getter]
| function will be called over the words currently in the vocab, and then
| applied to new words as they occur. You'll then be able to access the flag
| value on each token, using #[code token.check_flag(flag_id)].
2016-10-31 18:04:15 +00:00
+aside-code("Example").
def is_my_product(text):
products = [u'spaCy', u'Thinc', u'displaCy']
return text in products
2016-10-31 18:04:15 +00:00
MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
doc = nlp(u'I like spaCy')
assert doc[2].check_flag(MY_PRODUCT) == True
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code flag_getter]
+cell callable
+cell A function #[code f(unicode) -> bool], to get the flag value.
+row
+cell #[code flag_id]
+cell int
+cell
| An integer between 1 and 63 (inclusive), specifying the bit at
| which the flag will be stored. If #[code -1], the lowest
| available bit will be chosen.
+footrow
+cell returns
2016-10-31 18:04:15 +00:00
+cell int
+cell The integer ID by which the flag value can be checked.
+h(2, "to_disk") Vocab.to_disk
2016-10-31 18:04:15 +00:00
+tag method
p Save the current state to a directory.
+aside-code("Example").
nlp.vocab.to_disk('/path/to/vocab')
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
2016-10-31 18:04:15 +00:00
+h(2, "from_disk") Vocab.from_disk
2016-10-31 18:04:15 +00:00
+tag method
p Load state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.vocab import Vocab
vocab = Vocab().from_disk('/path/to/vocab')
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
2016-10-31 18:04:15 +00:00
+footrow
+cell returns
+cell #[code Vocab]
+cell The modified #[code Vocab] object.
2016-10-31 18:04:15 +00:00
+h(2, "to_bytes") Vocab.to_bytes
2016-10-31 18:04:15 +00:00
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
vocab_bytes = nlp.vocab.to_bytes()
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
2016-10-31 18:04:15 +00:00
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Vocab] object.
2016-10-31 18:04:15 +00:00
+h(2, "from_bytes") Vocab.from_bytes
2016-10-31 18:04:15 +00:00
+tag method
p Load state from a binary string.
+aside-code("Example").
from spacy.vocab import Vocab
vocab_bytes = nlp.vocab.to_bytes()
vocab = Vocab()
vocab.from_bytes(vocab_bytes)
2016-10-31 18:04:15 +00:00
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
2016-10-31 18:04:15 +00:00
+footrow
+cell returns
+cell #[code Vocab]
+cell The #[code Vocab] object.
+h(2, "attributes") Attributes
+aside-code("Example").
apple_id = nlp.vocab.strings['apple']
assert type(apple_id) == int
PERSON = nlp.vocab.strings['PERSON']
assert type(PERSON) == int
+table(["Name", "Type", "Description"])
+row
+cell #[code strings]
+cell #[code StringStore]
+cell A table managing the string-to-int mapping.