Merge pull request #3530 from svlandeg/fix/issue_3521

Allow English stopwords with any type of apostrophe
2019-04-03 14:14:03 +02:00 · 2019-04-03 14:14:03 +02:00 · 4faf62d515
parent 951825532c 4ff786e113
commit 4faf62d515
3 changed files with 53 additions and 3 deletions
--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
 must my myself
 name namely neither never nevertheless next nine no nobody none noone nor not
-nothing now nowhere n't
+nothing now nowhere 
 of off often on once one only onto or other others otherwise our ours ourselves
 out over own
@ -66,7 +66,13 @@ whereafter whereas whereby wherein whereupon wherever whether which while
 whither who whoever whole whom whose why will with within without would
 yet you your yours yourself yourselves
 'd 'll 'm 're 's 've
 """.split()
 )
 # Contraction suffixes, written here with the plain ASCII apostrophe (').
 contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
 STOP_WORDS.update(contractions)
 # Also register every contraction spelled with the typographic quotes ‘ and ’,
 # so the stop-word set contains all three apostrophe variants of each suffix.
 for apostrophe in ["‘", "’"]:
    for stopword in contractions:
        STOP_WORDS.add(stopword.replace("'", apostrophe))
--- a/spacy/tests/regression/test_issue3449.py
+++ b/spacy/tests/regression/test_issue3449.py
@ -0,0 +1,25 @@
 # coding: utf8
 from __future__ import unicode_literals
 import pytest
 from spacy.lang.en import English
@pytest.mark.xfail(reason="Current default suffix rules avoid one upper-case letter before a dot.")
def test_issue3449():
    """Check that a sentence-final "I." yields a separate 'I' token.

    The same two sentences are joined by a single space, a double space and
    a newline; in every variant the token at index 5 is expected to be 'I'.
    Marked xfail: see the reason given on the decorator.
    """
    pipeline = English()
    pipeline.add_pipe(pipeline.create_pipe("sentencizer"))
    texts = (
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I.  Do you want to go to the movies with I?",
        "He gave the ball to I.\nDo you want to go to the movies with I?",
    )
    # Process every text before asserting, mirroring the original ordering.
    docs = [pipeline(text) for text in texts]
    for doc in docs:
        assert doc[5].text == "I"
--- a/spacy/tests/regression/test_issue3521.py
+++ b/spacy/tests/regression/test_issue3521.py
@ -0,0 +1,19 @@
 # coding: utf8
 from __future__ import unicode_literals
 import pytest
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    """Check that the second token of a contracted word is a stop word.

    Each word is given once with a straight apostrophe and once with a
    typographic one.
    """
    tokens = en_tokenizer(word)
    contraction = tokens[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert contraction.is_stop