spaCy/website/docs/api/tokenizer.jade

315 lines
8.3 KiB
Plaintext

//- 💫 DOCS > API > TOKENIZER
include ../../_includes/_mixins
p
| Segment text, and create #[code Doc] objects with the discovered segment
| boundaries.
+h(2, "init") Tokenizer.__init__
+tag method
p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.
+aside-code("Example").
# Construction 1
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
# Construction 2
from spacy.lang.en import English
tokenizer = English().Defaults.create_tokenizer(nlp)
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell A storage container for lexical types.
+row
+cell #[code rules]
+cell dict
+cell Exceptions and special-cases for the tokenizer.
+row
+cell #[code prefix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match prefixes.
+row
+cell #[code suffix_search]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).search] to match suffixes.
+row
+cell #[code infix_finditer]
+cell callable
+cell
| A function matching the signature of
| #[code re.compile(string).finditer] to find infixes.
+row
+cell #[code token_match]
+cell callable
+cell A boolean function matching strings to be recognised as tokens.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The newly constructed object.
+h(2, "call") Tokenizer.__call__
+tag method
p Tokenize a string.
+aside-code("Example").
tokens = tokenizer(u'This is a sentence')
assert len(tokens) == 4
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to tokenize.
+footrow
+cell returns
+cell #[code Doc]
+cell A container for linguistic annotations.
+h(2, "pipe") Tokenizer.pipe
+tag method
p Tokenize a stream of texts.
+aside-code("Example").
texts = [u'One document.', u'...', u'Lots of documents']
for doc in tokenizer.pipe(texts, batch_size=50):
pass
+table(["Name", "Type", "Description"])
+row
+cell #[code texts]
+cell -
+cell A sequence of unicode texts.
+row
+cell #[code batch_size]
+cell int
+cell The number of texts to accumulate in an internal buffer.
+row
+cell #[code n_threads]
+cell int
+cell
| The number of threads to use, if the implementation supports
| multi-threading. The default tokenizer is single-threaded.
+footrow
+cell yields
+cell #[code Doc]
+cell A sequence of Doc objects, in order.
+h(2, "find_infix") Tokenizer.find_infix
+tag method
p Find internal split points of the string.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to split.
+footrow
+cell returns
+cell list
+cell
| A list of #[code re.MatchObject] objects that have #[code .start()]
| and #[code .end()] methods, denoting the placement of internal
| segment separators, e.g. hyphens.
+h(2, "find_prefix") Tokenizer.find_prefix
+tag method
p
| Find the length of a prefix that should be segmented from the string, or
| #[code None] if no prefix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell returns
+cell int
+cell The length of the prefix if present, otherwise #[code None].
+h(2, "find_suffix") Tokenizer.find_suffix
+tag method
p
| Find the length of a suffix that should be segmented from the string, or
| #[code None] if no suffix rules match.
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to segment.
+footrow
+cell returns
+cell int / #[code None]
+cell The length of the suffix if present, otherwise #[code None].
+h(2, "add_special_case") Tokenizer.add_special_case
+tag method
p
| Add a special-case tokenization rule. This mechanism is also used to add
| custom tokenizer exceptions to the language data. See the usage workflow
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
| for more details and examples.
+aside-code("Example").
from spacy.attrs import ORTH, LEMMA
case = [{"don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]}]
tokenizer.add_special_case(case)
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to specially tokenize.
+row
+cell #[code token_attrs]
+cell iterable
+cell
| A sequence of dicts, where each dict describes a token and its
| attributes. The #[code ORTH] fields of the attributes must
| exactly match the string when they are concatenated.
+h(2, "to_disk") Tokenizer.to_disk
+tag method
p Save the current state to a directory.
+aside-code("Example").
tokenizer.to_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory, which will be created if it doesn't exist.
| Paths may be either strings or #[code Path]-like objects.
+h(2, "from_disk") Tokenizer.from_disk
+tag method
p Loads state from a directory. Modifies the object in place and returns it.
+aside-code("Example").
from spacy.tokenizer import Tokenizer
tokenizer = Tokenizer(nlp.vocab)
tokenizer = tokenizer.from_disk('/path/to/tokenizer')
+table(["Name", "Type", "Description"])
+row
+cell #[code path]
+cell unicode or #[code Path]
+cell
| A path to a directory. Paths may be either strings or
| #[code Path]-like objects.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The modified #[code Tokenizer] object.
+h(2, "to_bytes") Tokenizer.to_bytes
+tag method
p Serialize the current state to a binary string.
+aside-code("Example").
tokenizer_bytes = tokenizer.to_bytes()
+table(["Name", "Type", "Description"])
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being serialized.
+footrow
+cell returns
+cell bytes
+cell The serialized form of the #[code Tokenizer] object.
+h(2, "from_bytes") Tokenizer.from_bytes
+tag method
p Load state from a binary string.
+aside-code("Example").
fron spacy.tokenizer import Tokenizer
tokenizer_bytes = tokenizer.to_bytes()
new_tokenizer = Tokenizer(nlp.vocab)
new_tokenizer.from_bytes(tokenizer_bytes)
+table(["Name", "Type", "Description"])
+row
+cell #[code bytes_data]
+cell bytes
+cell The data to load from.
+row
+cell #[code **exclude]
+cell -
+cell Named attributes to prevent from being loaded.
+footrow
+cell returns
+cell #[code Tokenizer]
+cell The #[code Tokenizer] object.
+h(2, "attributes") Attributes
+table(["Name", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell The vocab object of the parent #[code Doc].
+row
+cell #[code prefix_search]
+cell -
+cell
| A function to find segment boundaries from the start of a
| string. Returns the length of the segment, or #[code None].
+row
+cell #[code suffix_search]
+cell -
+cell
| A function to find segment boundaries from the end of a string.
| Returns the length of the segment, or #[code None].
+row
+cell #[code infix_finditer]
+cell -
+cell
| A function to find internal segment separators, e.g. hyphens.
| Returns a (possibly empty) list of #[code re.MatchObject]
| objects.