# coding: utf8
from __future__ import unicode_literals

import pytest
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
|
|
|
"word",
|
|
|
|
|
[
|
2019-04-03 11:50:33 +00:00
|
|
|
|
"don't",
|
|
|
|
|
"don’t",
|
|
|
|
|
"I'd",
|
|
|
|
|
"I’d",
|
2019-04-02 11:15:35 +00:00
|
|
|
|
],
|
|
|
|
|
)
|
2019-04-02 11:24:59 +00:00
|
|
|
|
def test_issue3521(en_tokenizer, word):
|
|
|
|
|
tok = en_tokenizer(word)[1]
|
|
|
|
|
# 'not' and 'would' should be stopwords, also in their abbreviated forms
|
2019-04-02 11:15:35 +00:00
|
|
|
|
assert tok.is_stop
|