2016-11-02 22:17:42 +00:00
|
|
|
//- 💫 DOCS > API > TOKENIZER
|
|
|
|
|
|
|
|
include ../../_includes/_mixins
|
|
|
|
|
|
|
|
p
|
|
|
|
| Segment text, and create #[code Doc] objects with the discovered segment
|
|
|
|
| boundaries.
|
|
|
|
|
|
|
|
+h(2, "init") Tokenizer.__init__
|
|
|
|
+tag method
|
|
|
|
|
|
|
|
p Create a #[code Tokenizer] that produces #[code Doc] objects from unicode text.
|
|
|
|
|
2017-05-21 11:18:14 +00:00
|
|
|
+aside-code("Example").
|
|
|
|
# Construction 1
|
|
|
|
from spacy.tokenizer import Tokenizer
|
|
|
|
tokenizer = Tokenizer(nlp.vocab)
|
|
|
|
|
|
|
|
# Construction 2
|
|
|
|
from spacy.lang.en import English
|
|
|
|
tokenizer = English().Defaults.create_tokenizer(nlp)
|
|
|
|
|
2016-11-02 22:17:42 +00:00
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code vocab]
|
|
|
|
+cell #[code Vocab]
|
|
|
|
+cell A storage container for lexical types.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code rules]
|
|
|
|
+cell dict
|
|
|
|
+cell Exceptions and special-cases for the tokenizer.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code prefix_search]
|
|
|
|
+cell callable
|
|
|
|
+cell
|
|
|
|
| A function matching the signature of
|
|
|
|
| #[code re.compile(string).search] to match prefixes.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code suffix_search]
|
|
|
|
+cell callable
|
|
|
|
+cell
|
|
|
|
| A function matching the signature of
|
|
|
|
| #[code re.compile(string).search] to match suffixes.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code infix_finditer]
|
|
|
|
+cell callable
|
|
|
|
+cell
|
|
|
|
| A function matching the signature of
|
|
|
|
| #[code re.compile(string).finditer] to find infixes.
|
|
|
|
|
2017-05-21 11:18:14 +00:00
|
|
|
+row
|
|
|
|
+cell #[code token_match]
|
|
|
|
+cell callable
|
|
|
|
+cell A boolean function matching strings to be recognised as tokens.
|
|
|
|
|
2016-11-02 22:17:42 +00:00
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell returns
|
2016-11-02 22:17:42 +00:00
|
|
|
+cell #[code Tokenizer]
|
|
|
|
+cell The newly constructed object.
|
|
|
|
|
|
|
|
+h(2, "call") Tokenizer.__call__
|
|
|
|
+tag method
|
|
|
|
|
|
|
|
p Tokenize a string.
|
|
|
|
|
2017-05-21 11:18:14 +00:00
|
|
|
+aside-code("Example").
|
|
|
|
tokens = tokenizer(u'This is a sentence')
|
|
|
|
assert len(tokens) == 4
|
|
|
|
|
2016-11-02 22:17:42 +00:00
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code string]
|
|
|
|
+cell unicode
|
|
|
|
+cell The string to tokenize.
|
|
|
|
|
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell returns
|
2016-11-02 22:17:42 +00:00
|
|
|
+cell #[code Doc]
|
|
|
|
+cell A container for linguistic annotations.
|
|
|
|
|
|
|
|
+h(2, "pipe") Tokenizer.pipe
|
|
|
|
+tag method
|
|
|
|
|
|
|
|
p Tokenize a stream of texts.
|
|
|
|
|
2017-05-21 11:18:14 +00:00
|
|
|
+aside-code("Example").
|
|
|
|
texts = [u'One document.', u'...', u'Lots of documents']
|
|
|
|
for doc in tokenizer.pipe(texts, batch_size=50):
|
|
|
|
pass
|
|
|
|
|
2016-11-02 22:17:42 +00:00
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code texts]
|
|
|
|
+cell -
|
|
|
|
+cell A sequence of unicode texts.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code batch_size]
|
|
|
|
+cell int
|
|
|
|
+cell The number of texts to accumulate in an internal buffer.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code n_threads]
|
|
|
|
+cell int
|
|
|
|
+cell
|
|
|
|
| The number of threads to use, if the implementation supports
|
|
|
|
| multi-threading. The default tokenizer is single-threaded.
|
|
|
|
|
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell yields
|
2016-11-02 22:17:42 +00:00
|
|
|
+cell #[code Doc]
|
|
|
|
+cell A sequence of Doc objects, in order.
|
|
|
|
|
|
|
|
+h(2, "find_infix") Tokenizer.find_infix
|
|
|
|
+tag method
|
|
|
|
|
|
|
|
p Find internal split points of the string.
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code string]
|
|
|
|
+cell unicode
|
|
|
|
+cell The string to split.
|
|
|
|
|
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell returns
|
2017-05-21 11:18:14 +00:00
|
|
|
+cell list
|
2016-11-02 22:17:42 +00:00
|
|
|
+cell
|
2017-05-21 11:18:14 +00:00
|
|
|
| A list of #[code re.MatchObject] objects that have #[code .start()]
|
|
|
|
| and #[code .end()] methods, denoting the placement of internal
|
|
|
|
| segment separators, e.g. hyphens.
|
2016-11-02 22:17:42 +00:00
|
|
|
|
|
|
|
+h(2, "find_prefix") Tokenizer.find_prefix
|
|
|
|
+tag method
|
|
|
|
|
|
|
|
p
|
|
|
|
| Find the length of a prefix that should be segmented from the string, or
|
|
|
|
| #[code None] if no prefix rules match.
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code string]
|
|
|
|
+cell unicode
|
|
|
|
+cell The string to segment.
|
|
|
|
|
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell returns
|
2017-05-21 11:18:14 +00:00
|
|
|
+cell int / #[code None]
|
2016-11-02 22:17:42 +00:00
|
|
|
+cell The length of the prefix if present, otherwise #[code None].
|
|
|
|
|
|
|
|
+h(2, "find_suffix") Tokenizer.find_suffix
|
|
|
|
+tag method
|
|
|
|
|
|
|
|
p
|
|
|
|
| Find the length of a suffix that should be segmented from the string, or
|
|
|
|
| #[code None] if no suffix rules match.
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code string]
|
|
|
|
+cell unicode
|
|
|
|
+cell The string to segment.
|
|
|
|
|
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell returns
|
2016-11-02 22:17:42 +00:00
|
|
|
+cell int / #[code None]
|
|
|
|
+cell The length of the suffix if present, otherwise #[code None].
|
|
|
|
|
|
|
|
+h(2, "add_special_case") Tokenizer.add_special_case
|
|
|
|
+tag method
|
|
|
|
|
2017-05-21 11:18:14 +00:00
|
|
|
p
|
|
|
|
| Add a special-case tokenization rule. This mechanism is also used to add
|
2017-05-28 14:41:01 +00:00
|
|
|
| custom tokenizer exceptions to the language data. See the usage guide
|
2017-05-21 11:18:14 +00:00
|
|
|
| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
|
|
|
|
| for more details and examples.
|
|
|
|
|
|
|
|
+aside-code("Example").
|
|
|
|
from spacy.attrs import ORTH, LEMMA
|
|
|
|
case = [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]
|
|
|
|
tokenizer.add_special_case("don't", case)
|
2016-11-02 22:17:42 +00:00
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code string]
|
|
|
|
+cell unicode
|
|
|
|
+cell The string to specially tokenize.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code token_attrs]
|
2017-05-21 11:18:14 +00:00
|
|
|
+cell iterable
|
2016-11-02 22:17:42 +00:00
|
|
|
+cell
|
|
|
|
| A sequence of dicts, where each dict describes a token and its
|
|
|
|
| attributes. The #[code ORTH] fields of the attributes must
|
|
|
|
| exactly match the string when they are concatenated.
|
|
|
|
|
2017-05-20 23:18:31 +00:00
|
|
|
+h(2, "attributes") Attributes
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code vocab]
|
|
|
|
+cell #[code Vocab]
|
|
|
|
+cell The #[code Vocab] object the tokenizer was constructed with.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code prefix_search]
|
|
|
|
+cell -
|
|
|
|
+cell
|
|
|
|
| A function to find segment boundaries from the start of a
|
|
|
|
| string. Returns the length of the segment, or #[code None].
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code suffix_search]
|
|
|
|
+cell -
|
|
|
|
+cell
|
|
|
|
| A function to find segment boundaries from the end of a string.
|
|
|
|
| Returns the length of the segment, or #[code None].
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code infix_finditer]
|
|
|
|
+cell -
|
|
|
|
+cell
|
|
|
|
| A function to find internal segment separators, e.g. hyphens.
|
|
|
|
| Returns a (possibly empty) list of #[code re.MatchObject]
|
|
|
|
| objects.
|