diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py
index 4301e7d86..aae3e5e01 100644
--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
 must my myself
 
 name namely neither never nevertheless next nine no nobody none noone nor not
-nothing now nowhere n't
+nothing now nowhere
 
 of off often on once one only onto or other others otherwise our ours ourselves
 out over own
@@ -66,7 +66,13 @@ whereafter whereas whereby wherein whereupon wherever whether which while
 whither who whoever whole whom whose why will with within without would
 
 yet you your yours yourself yourselves
-
-'d 'll 'm 're 's 've
 """.split()
 )
+
+contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
+STOP_WORDS.update(contractions)
+
+for apostrophe in ["‘", "’"]:
+    for stopword in contractions:
+        STOP_WORDS.add(stopword.replace("'", apostrophe))
+
diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py
new file mode 100644
index 000000000..61a76334a
--- /dev/null
+++ b/spacy/tests/regression/test_issue3449.py
@@ -0,0 +1,25 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from spacy.lang.en import English
+
+
+@pytest.mark.xfail(reason="Current default suffix rules avoid one upper-case letter before a dot.")
+def test_issue3449():
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+
+    # Three whitespace variants after "I." — single space, double space, newline.
+    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
+    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
+    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
+
+    t1 = nlp(text1)
+    t2 = nlp(text2)
+    t3 = nlp(text3)
+
+    assert t1[5].text == 'I'
+    assert t2[5].text == 'I'
+    assert t3[5].text == 'I'
diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py
new file mode 100644
index 000000000..6d841894a
--- /dev/null
+++ b/spacy/tests/regression/test_issue3521.py
@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "don't",
+        "don’t",
+        "I'd",
+        "I’d",
+    ],
+)
+def test_issue3521(en_tokenizer, word):
+    tok = en_tokenizer(word)[1]
+    # 'not' and 'would' should be stopwords, also in their abbreviated forms
+    assert tok.is_stop