# spaCy/tests/pipeline/test_sentencizer.py
#
# 88 lines
# 2.9 KiB
# Python

# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc
def test_sentencizer(en_vocab):
    """Run a default Sentencizer over a two-sentence Doc and check the split.

    The Doc is built directly from a word list, so only the sentence
    boundary flags set by the component are under test.
    """
    tokens = ["Hello", "!", "This", "is", "a", "test", "."]
    # Only "Hello" and "This" are expected to open a sentence.
    expected_starts = [True, False, True, False, False, False, False]

    segmenter = Sentencizer()
    processed = segmenter(Doc(en_vocab, words=tokens))

    assert processed.is_sentenced
    observed_starts = [token.is_sent_start for token in processed]
    assert observed_starts == expected_starts
    sentences = list(processed.sents)
    assert len(sentences) == 2
# Each case is (words, expected is_sent_start flags, expected sentence count).
_SENTENCIZER_COMPLEX_CASES = [
    # Runs of consecutive sentence-final punctuation stay attached to the
    # sentence they close; the extra marks must not become sentences of
    # their own.
    (
        ["Hello", "!", ".", "Test", ".", ".", "ok"],
        [True, False, False, True, False, False, True],
        3,
    ),
    # Inverted marks (¡ and ¿) are punctuation, but they open a sentence
    # rather than close one, so no boundary is expected after them.
    (
        ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"],
        [True, False, False, False, True, False, False, False, False, False],
        2,
    ),
    # A closing quote that follows "!" belongs to the first sentence
    # (per the original note, this relies on the Token.is_punct check).
    (
        ['"', "Nice", "!", '"', "I", "am", "happy", "."],
        [True, False, False, False, True, False, False, False],
        2,
    ),
]


@pytest.mark.parametrize("words,sent_starts,n_sents", _SENTENCIZER_COMPLEX_CASES)
def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
    """Check default sentence splitting on trickier punctuation sequences.

    words: tokens used to build the Doc.
    sent_starts: expected ``is_sent_start`` value for every token.
    n_sents: expected number of sentences in ``doc.sents``.
    """
    segmenter = Sentencizer()
    processed = segmenter(Doc(en_vocab, words=words))

    assert processed.is_sentenced
    observed_starts = [token.is_sent_start for token in processed]
    assert observed_starts == sent_starts
    sentences = list(processed.sents)
    assert len(sentences) == n_sents
# Each case is (punct_chars, words, expected is_sent_start flags,
# expected sentence count).
_SENTENCIZER_CUSTOM_PUNCT_CASES = [
    # With "~" and "?" as the only terminators, "." no longer ends a
    # sentence, so everything after "~" forms a single sentence.
    (
        ["~", "?"],
        ["Hello", "world", "~", "A", ".", "B", "."],
        [True, False, False, True, False, False, False],
        2,
    ),
    # Even though it is uncommon, punct_chars should accept arbitrary
    # tokens, including ones that are not punctuation at all.
    (
        [".", "ö"],
        ["Hello", ".", "Test", "ö", "Ok", "."],
        [True, False, True, False, True, False],
        3,
    ),
]


@pytest.mark.parametrize(
    "punct_chars,words,sent_starts,n_sents", _SENTENCIZER_CUSTOM_PUNCT_CASES
)
def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
    """Check sentence splitting when the terminator set is user-supplied.

    punct_chars: characters passed to ``Sentencizer(punct_chars=...)``.
    words: tokens used to build the Doc.
    sent_starts: expected ``is_sent_start`` value for every token.
    n_sents: expected number of sentences in ``doc.sents``.
    """
    segmenter = Sentencizer(punct_chars=punct_chars)
    processed = segmenter(Doc(en_vocab, words=words))

    assert processed.is_sentenced
    observed_starts = [token.is_sent_start for token in processed]
    assert observed_starts == sent_starts
    sentences = list(processed.sents)
    assert len(sentences) == n_sents
def test_sentencizer_serialize_bytes(en_vocab):
    """Check that custom punct_chars survive a to_bytes/from_bytes round trip."""
    punct_chars = [".", "~", "+"]
    expected = set(punct_chars)

    original = Sentencizer(punct_chars=punct_chars)
    assert original.punct_chars == expected

    # Load into a default-constructed component, so the custom characters
    # can only come from the serialized payload.
    payload = original.to_bytes()
    restored = Sentencizer().from_bytes(payload)
    assert restored.punct_chars == expected