spaCy/website/docs/api/tokenizer.jade

//- 💫 DOCS > API > TOKENIZER

include ../../_includes/_mixins

p
    |  Segment text, and create #[code Doc] objects with the discovered segment
    |  boundaries.

+h(2, "init") Tokenizer.__init__
    +tag method

p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.

+aside-code("Example").
    # Construction 1
    from spacy.tokenizer import Tokenizer
    tokenizer = Tokenizer(nlp.vocab)

    # Construction 2
    from spacy.lang.en import English
    tokenizer = English().Defaults.create_tokenizer(nlp)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell A storage container for lexical types.

    +row
        +cell #[code rules]
        +cell dict
        +cell Exceptions and special-cases for the tokenizer.

    +row
        +cell #[code prefix_search]
        +cell callable
        +cell
            |  A function matching the signature of
            |  #[code re.compile(string).search] to match prefixes.

    +row
        +cell #[code suffix_search]
        +cell callable
        +cell
            |  A function matching the signature of
            |  #[code re.compile(string).search] to match suffixes.

    +row
        +cell #[code infix_finditer]
        +cell callable
        +cell
            |  A function matching the signature of
            |  #[code re.compile(string).finditer] to find infixes.

    +row
        +cell #[code token_match]
        +cell callable
        +cell A boolean function matching strings to be recognised as tokens.

    +footrow
        +cell returns
        +cell #[code Tokenizer]
        +cell The newly constructed object.

+h(2, "call") Tokenizer.__call__
    +tag method

p Tokenize a string.

+aside-code("Example").
    tokens = tokenizer(u'This is a sentence')
    assert len(tokens) == 4

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to tokenize.

    +footrow
        +cell returns
        +cell #[code Doc]
        +cell A container for linguistic annotations.

+h(2, "pipe") Tokenizer.pipe
    +tag method

p Tokenize a stream of texts.

+aside-code("Example").
    texts = [u'One document.', u'...', u'Lots of documents']
    for doc in tokenizer.pipe(texts, batch_size=50):
        pass

+table(["Name", "Type", "Description"])
    +row
        +cell #[code texts]
        +cell -
        +cell A sequence of unicode texts.

    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of texts to accumulate in an internal buffer.

    +row
        +cell #[code n_threads]
        +cell int
        +cell
            |  The number of threads to use, if the implementation supports
            |  multi-threading. The default tokenizer is single-threaded.

    +footrow
        +cell yields
        +cell #[code Doc]
        +cell A sequence of Doc objects, in order.

+h(2, "find_infix") Tokenizer.find_infix
    +tag method

p Find internal split points of the string.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to split.

    +footrow
        +cell returns
        +cell list
        +cell
            |  A list of #[code re.MatchObject] objects that have #[code .start()]
            |  and #[code .end()] methods, denoting the placement of internal
            |  segment separators, e.g. hyphens.

+h(2, "find_prefix") Tokenizer.find_prefix
    +tag method

p
    |  Find the length of a prefix that should be segmented from the string, or
    |  #[code None] if no prefix rules match.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to segment.

    +footrow
        +cell returns
        +cell int
        +cell The length of the prefix if present, otherwise #[code None].

+h(2, "find_suffix") Tokenizer.find_suffix
    +tag method

p
    |  Find the length of a suffix that should be segmented from the string, or
    |  #[code None] if no suffix rules match.

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to segment.

    +footrow
        +cell returns
        +cell int / #[code None]
        +cell The length of the suffix if present, otherwise #[code None].

+h(2, "add_special_case") Tokenizer.add_special_case
    +tag method

p
    |  Add a special-case tokenization rule. This mechanism is also used to add
    |  custom tokenizer exceptions to the language data. See the usage workflow
    |  on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]
    |  for more details and examples.

+aside-code("Example").
    from spacy.attrs import ORTH, LEMMA
    case = [{"don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]}]
    tokenizer.add_special_case(case)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code string]
        +cell unicode
        +cell The string to specially tokenize.

    +row
        +cell #[code token_attrs]
        +cell iterable
        +cell
            |  A sequence of dicts, where each dict describes a token and its
            |  attributes. The #[code ORTH] fields of the attributes must
            |  exactly match the string when they are concatenated.

+h(2, "to_disk") Tokenizer.to_disk
    +tag method

p Save the current state to a directory.

+aside-code("Example").
    tokenizer.to_disk('/path/to/tokenizer')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            |  A path to a directory, which will be created if it doesn't exist.
            |  Paths may be either strings or #[code Path]-like objects.

+h(2, "from_disk") Tokenizer.from_disk
    +tag method

p Loads state from a directory. Modifies the object in place and returns it.

+aside-code("Example").
    from spacy.tokenizer import Tokenizer
    tokenizer = Tokenizer(nlp.vocab)
    tokenizer = tokenizer.from_disk('/path/to/tokenizer')

+table(["Name", "Type", "Description"])
    +row
        +cell #[code path]
        +cell unicode or #[code Path]
        +cell
            |  A path to a directory. Paths may be either strings or
            |  #[code Path]-like objects.

    +footrow
        +cell returns
        +cell #[code Tokenizer]
        +cell The modified #[code Tokenizer] object.

+h(2, "to_bytes") Tokenizer.to_bytes
    +tag method

p Serialize the current state to a binary string.

+aside-code("Example").
    tokenizer_bytes = tokenizer.to_bytes()

+table(["Name", "Type", "Description"])
    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being serialized.

    +footrow
        +cell returns
        +cell bytes
        +cell The serialized form of the #[code Tokenizer] object.

+h(2, "from_bytes") Tokenizer.from_bytes
    +tag method

p Load state from a binary string.

+aside-code("Example").
    fron spacy.tokenizer import Tokenizer
    tokenizer_bytes = tokenizer.to_bytes()
    new_tokenizer = Tokenizer(nlp.vocab)
    new_tokenizer.from_bytes(tokenizer_bytes)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code bytes_data]
        +cell bytes
        +cell The data to load from.

    +row
        +cell #[code **exclude]
        +cell -
        +cell Named attributes to prevent from being loaded.

    +footrow
        +cell returns
        +cell #[code Tokenizer]
        +cell The #[code Tokenizer] object.

+h(2, "attributes") Attributes

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell The vocab object of the parent #[code Doc].

    +row
        +cell #[code prefix_search]
        +cell -
        +cell
            |  A function to find segment boundaries from the start of a
            |  string. Returns the length of the segment, or #[code None].

    +row
        +cell #[code suffix_search]
        +cell -
        +cell
            |  A function to find segment boundaries from the end of a string.
            |  Returns the length of the segment, or #[code None].

    +row
        +cell #[code infix_finditer]
        +cell -
        +cell
            |  A function to find internal segment separators, e.g. hyphens.
            |  Returns a (possibly empty) list of #[code re.MatchObject]
            |  objects.
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`//- 💫 DOCS > API > TOKENIZER`

			`include ../../_includes/_mixins`

			`p`
			`\| Segment text, and create #[code Doc] objects with the discovered segment`
			`\| boundaries.`

			`+h(2, "init") Tokenizer.__init__`
			`+tag method`

			`p Create a #[code Tokenizer], to create #[code Doc] objects given unicode text.`

Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+aside-code("Example").`
			`# Construction 1`
			`from spacy.tokenizer import Tokenizer`
			`tokenizer = Tokenizer(nlp.vocab)`

			`# Construction 2`
			`from spacy.lang.en import English`
			`tokenizer = English().Defaults.create_tokenizer(nlp)`

Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code vocab]`
			`+cell #[code Vocab]`
			`+cell A storage container for lexical types.`

			`+row`
			`+cell #[code rules]`
			`+cell dict`
			`+cell Exceptions and special-cases for the tokenizer.`

			`+row`
			`+cell #[code prefix_search]`
			`+cell callable`
			`+cell`
			`\| A function matching the signature of`
			`\| #[code re.compile(string).search] to match prefixes.`

			`+row`
			`+cell #[code suffix_search]`
			`+cell callable`
			`+cell`
			`\| A function matching the signature of`
			`\| #[code re.compile(string).search] to match suffixes.`

			`+row`
			`+cell #[code infix_finditer]`
			`+cell callable`
			`+cell`
			`\| A function matching the signature of`
			`\| #[code re.compile(string).finditer] to find infixes.`

Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+row`
			`+cell #[code token_match]`
			`+cell callable`
			`+cell A boolean function matching strings to be recognised as tokens.`

Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell returns`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+cell #[code Tokenizer]`
			`+cell The newly constructed object.`

			`+h(2, "call") Tokenizer.__call__`
			`+tag method`

			`p Tokenize a string.`

Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+aside-code("Example").`
			`tokens = tokenizer(u'This is a sentence')`
			`assert len(tokens) == 4`

Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code string]`
			`+cell unicode`
			`+cell The string to tokenize.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell returns`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+cell #[code Doc]`
			`+cell A container for linguistic annotations.`

			`+h(2, "pipe") Tokenizer.pipe`
			`+tag method`

			`p Tokenize a stream of texts.`

Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+aside-code("Example").`
			`texts = [u'One document.', u'...', u'Lots of documents']`
			`for doc in tokenizer.pipe(texts, batch_size=50):`
			`pass`

Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code texts]`
			`+cell -`
			`+cell A sequence of unicode texts.`

			`+row`
			`+cell #[code batch_size]`
			`+cell int`
			`+cell The number of texts to accumulate in an internal buffer.`

			`+row`
			`+cell #[code n_threads]`
			`+cell int`
			`+cell`
			`\| The number of threads to use, if the implementation supports`
			`\| multi-threading. The default tokenizer is single-threaded.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell yields`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+cell #[code Doc]`
			`+cell A sequence of Doc objects, in order.`

			`+h(2, "find_infix") Tokenizer.find_infix`
			`+tag method`

			`p Find internal split points of the string.`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code string]`
			`+cell unicode`
			`+cell The string to split.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell returns`
Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+cell list`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+cell`
Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`\| A list of #[code re.MatchObject] objects that have #[code .start()]`
			`\| and #[code .end()] methods, denoting the placement of internal`
			`\| segment separators, e.g. hyphens.`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00
			`+h(2, "find_prefix") Tokenizer.find_prefix`
			`+tag method`

			`p`
			`\| Find the length of a prefix that should be segmented from the string, or`
			`\| #[code None] if no prefix rules match.`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code string]`
			`+cell unicode`
			`+cell The string to segment.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell returns`
Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+cell int`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+cell The length of the prefix if present, otherwise #[code None].`

			`+h(2, "find_suffix") Tokenizer.find_suffix`
			`+tag method`

			`p`
			`\| Find the length of a suffix that should be segmented from the string, or`
			`\| #[code None] if no suffix rules match.`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code string]`
			`+cell unicode`
			`+cell The string to segment.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell returns`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+cell int / #[code None]`
			`+cell The length of the suffix if present, otherwise #[code None].`

			`+h(2, "add_special_case") Tokenizer.add_special_case`
			`+tag method`

Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`p`
			`\| Add a special-case tokenization rule. This mechanism is also used to add`
			`\| custom tokenizer exceptions to the language data. See the usage workflow`
			`\| on #[+a("/docs/usage/adding-languages#tokenizer-exceptions") adding languages]`
			`\| for more details and examples.`

			`+aside-code("Example").`
			`from spacy.attrs import ORTH, LEMMA`
			`case = [{"don't": [{ORTH: "do"}, {ORTH: "n't", LEMMA: "not"}]}]`
			`tokenizer.add_special_case(case)`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code string]`
			`+cell unicode`
			`+cell The string to specially tokenize.`

			`+row`
			`+cell #[code token_attrs]`
Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+cell iterable`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+cell`
			`\| A sequence of dicts, where each dict describes a token and its`
			`\| attributes. The #[code ORTH] fields of the attributes must`
			`\| exactly match the string when they are concatenated.`

Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+h(2, "to_disk") Tokenizer.to_disk`
			`+tag method`

			`p Save the current state to a directory.`

			`+aside-code("Example").`
			`tokenizer.to_disk('/path/to/tokenizer')`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code path]`
			`+cell unicode or #[code Path]`
			`+cell`
			`\| A path to a directory, which will be created if it doesn't exist.`
			`\| Paths may be either strings or #[code Path]-like objects.`

			`+h(2, "from_disk") Tokenizer.from_disk`
			`+tag method`

			`p Loads state from a directory. Modifies the object in place and returns it.`

			`+aside-code("Example").`
			`from spacy.tokenizer import Tokenizer`
			`tokenizer = Tokenizer(nlp.vocab)`
			`tokenizer = tokenizer.from_disk('/path/to/tokenizer')`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code path]`
			`+cell unicode or #[code Path]`
			`+cell`
			`\| A path to a directory. Paths may be either strings or`
			`\| #[code Path]-like objects.`

Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell returns`
Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+cell #[code Tokenizer]`
			`+cell The modified #[code Tokenizer] object.`

			`+h(2, "to_bytes") Tokenizer.to_bytes`
			`+tag method`

			`p Serialize the current state to a binary string.`

			`+aside-code("Example").`
			`tokenizer_bytes = tokenizer.to_bytes()`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code **exclude]`
Add documentation for Tokenizer API (see #600) 2016-11-02 22:17:42 +00:00			`+cell -`
Update docstrings and API docs for Tokenizer 2017-05-21 11:18:14 +00:00			`+cell Named attributes to prevent from being serialized.`

			`+footrow`
			`+cell returns`
			`+cell bytes`
			`+cell The serialized form of the #[code Tokenizer] object.`

			`+h(2, "from_bytes") Tokenizer.from_bytes`
			`+tag method`

			`p Load state from a binary string.`

			`+aside-code("Example").`
			`fron spacy.tokenizer import Tokenizer`
			`tokenizer_bytes = tokenizer.to_bytes()`
			`new_tokenizer = Tokenizer(nlp.vocab)`
			`new_tokenizer.from_bytes(tokenizer_bytes)`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code bytes_data]`
			`+cell bytes`
			`+cell The data to load from.`

			`+row`
			`+cell #[code **exclude]`
			`+cell -`
			`+cell Named attributes to prevent from being loaded.`

			`+footrow`
			`+cell returns`
			`+cell #[code Tokenizer]`
			`+cell The #[code Tokenizer] object.`

Move attributes and remove deprecated methods 2017-05-20 23:18:31 +00:00			`+h(2, "attributes") Attributes`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code vocab]`
			`+cell #[code Vocab]`
			`+cell The vocab object of the parent #[code Doc].`

			`+row`
			`+cell #[code prefix_search]`
			`+cell -`
			`+cell`
			`\| A function to find segment boundaries from the start of a`
			`\| string. Returns the length of the segment, or #[code None].`

			`+row`
			`+cell #[code suffix_search]`
			`+cell -`
			`+cell`
			`\| A function to find segment boundaries from the end of a string.`
			`\| Returns the length of the segment, or #[code None].`

			`+row`
			`+cell #[code infix_finditer]`
			`+cell -`
			`+cell`
			`\| A function to find internal segment separators, e.g. hyphens.`
			`\| Returns a (possibly empty) list of #[code re.MatchObject]`
			`\| objects.`