# spaCy/spacy/tests/lang/en/test_customized_tokenizer.py

# coding: utf-8
from __future__ import unicode_literals

import pytest
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_suffix_regex
from spacy.util import compile_infix_regex


@pytest.fixture
def custom_en_tokenizer(en_vocab):
    """Build an English ``Tokenizer`` that uses a custom set of infix patterns.

    Prefix and suffix rules as well as the tokenizer exceptions come from
    ``English.Defaults``; only the infix rules are replaced by the patterns
    listed below. ``token_match`` is explicitly disabled.

    en_vocab: The vocab fixture handed to the ``Tokenizer``.
    RETURNS (Tokenizer): The customized tokenizer.
    """
    prefix_re = compile_prefix_regex(English.Defaults.prefixes)
    suffix_re = compile_suffix_regex(English.Defaults.suffixes)
    # Raw strings keep the backslashes literal without relying on Python
    # passing unknown escapes such as "\." through unchanged (deprecated).
    custom_infixes = [
        # Three or more consecutive dots.
        r"\.\.\.+",
        # A hyphen directly between two digits.
        r"(?<=[0-9])-(?=[0-9])",
        # NOTE(review): disabled variant below; presumably dropped because a
        # variable-width look-behind is rejected by the regex engine - confirm.
        # '(?<=[0-9]+),(?=[0-9]+)',
        # Digit groups joined by commas, e.g. "1,000,000".
        r"[0-9]+(,[0-9]+)+",
        # Any single one of these punctuation characters.
        r"[\[\]!&:,()\*—–\/-]",
    ]
    infix_re = compile_infix_regex(custom_infixes)
    return Tokenizer(
        en_vocab,
        English.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
        token_match=None,
    )


def test_en_customized_tokenizer_handles_infixes(custom_en_tokenizer):
    """Check that hyphens attached to numbers come out as separate tokens."""
    # Both sentences end identically once the "10-" part has been consumed.
    shared_tail = [
        "county",
        "definitions",
        "are",
        "not",
        "used",
        "for",
        "the",
        "greater",
        "Southern",
        "California",
        "Megaregion",
        ".",
    ]

    # Case 1: a single hyphen sitting between a number and a word.
    text = "The 8 and 10-county definitions are not used for the greater Southern California Megaregion."
    tokens = [token.text for token in custom_en_tokenizer(text)]
    assert tokens == ["The", "8", "and", "10", "-"] + shared_tail

    # Case 2: an additional hyphen trailing "8", followed by whitespace.
    # This trailing '-' is the input that may trigger an AssertionError.
    text = "The 8- and 10-county definitions are not used for the greater Southern California Megaregion."
    tokens = [token.text for token in custom_en_tokenizer(text)]
    assert tokens == ["The", "8", "-", "and", "10", "-"] + shared_tail