From 1424b12b0912e77adaf954a5f9dcd79b6a40d00c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Apr 2019 13:06:37 +0200 Subject: [PATCH 1/6] failing test for Issue #3449 --- spacy/tests/regression/test_issue3449.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 spacy/tests/regression/test_issue3449.py diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py new file mode 100644 index 000000000..40aa43bb7 --- /dev/null +++ b/spacy/tests/regression/test_issue3449.py @@ -0,0 +1,22 @@ +import pytest + +from spacy.lang.en import English + + +@pytest.mark.xfail(reason="Current default suffix rules avoid one upper-case letter before a dot.") +def test_issue3449(): + nlp = English() + nlp.add_pipe(nlp.create_pipe('sentencizer')) + + text1 = "He gave the ball to I. Do you want to go to the movies with I?" + text2 = "He gave the ball to I. Do you want to go to the movies with I?" + text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" + + t1 = nlp(text1) + t2 = nlp(text2) + t3 = nlp(text3) + + assert t1[5].text == 'I' + assert t2[5].text == 'I' + assert t3[5].text == 'I' + From e7062cf699bdc39d9d5e3aec31bcd073bae46ee5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Apr 2019 13:15:35 +0200 Subject: [PATCH 2/6] failing test for Issue #3521 --- spacy/tests/regression/test_issue3521.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 spacy/tests/regression/test_issue3521.py diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py new file mode 100644 index 000000000..a971ab4c6 --- /dev/null +++ b/spacy/tests/regression/test_issue3521.py @@ -0,0 +1,20 @@ +import pytest + +from spacy.lang.en import English + + +@pytest.mark.parametrize( + "word", + [ + "don't", + "don’t", + "I'd", + "I’d", + ], +) +def test_issue3521(fr_tokenizer, word): + nlp = English() + + tok = nlp(word)[1] + assert tok.is_stop + From eca9cc541792a62073cdf035f880c46850bc5098 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Apr 2019 13:24:59 +0200 Subject: [PATCH 3/6] fixing Issue #3521 by adding all hyphen variants for each stopword --- spacy/lang/en/stop_words.py | 9 ++++++--- spacy/tests/regression/test_issue3521.py | 10 +++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 4301e7d86..bdc36bdd7 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much must my myself name namely neither never nevertheless next nine no nobody none noone nor not -nothing now nowhere n't +nothing now nowhere of off often on once one only onto or other others otherwise our ours ourselves out over own @@ -66,7 +66,10 @@ whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you your yours yourself yourselves - -'d 'll 'm 're 's 've """.split() ) + +for hyphen in ["'", "`", "‘", "´", "’"]: + for stopword in "n't 'd 'll 'm 're 's 've".split(): + STOP_WORDS.add(stopword.replace("'", hyphen)) + diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py index a971ab4c6..2a4568a17 100644 --- a/spacy/tests/regression/test_issue3521.py +++ b/spacy/tests/regression/test_issue3521.py @@ -1,7 +1,5 @@ import pytest -from spacy.lang.en import English - @pytest.mark.parametrize( "word", @@ -12,9 +10,7 @@ from spacy.lang.en import English "I’d", ], ) -def test_issue3521(fr_tokenizer, word): - nlp = English() - - tok = nlp(word)[1] +def test_issue3521(en_tokenizer, word): + tok = en_tokenizer(word)[1] + # 'not' and 'would' should be stopwords, also in their abbreviated forms assert tok.is_stop - From 673c81bbb4c832c645768302415108327f9c878a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Apr 2019 13:52:07 +0200 Subject: [PATCH 4/6] unicode string for python 2.7 --- spacy/lang/en/stop_words.py | 2 +- spacy/tests/regression/test_issue3521.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index bdc36bdd7..07d4ff34c 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -70,6 +70,6 @@ yet you your yours yourself yourselves ) for hyphen in ["'", "`", "‘", "´", "’"]: - for stopword in "n't 'd 'll 'm 're 's 've".split(): + for stopword in u"n't 'd 'll 'm 're 's 've".split(): STOP_WORDS.add(stopword.replace("'", hyphen)) diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py index 2a4568a17..1fe5e00d7 100644 --- a/spacy/tests/regression/test_issue3521.py +++ b/spacy/tests/regression/test_issue3521.py @@ -4,10 +4,10 @@ import pytest @pytest.mark.parametrize( "word", [ - "don't", - "don’t", - "I'd", - "I’d", + u"don't", + u"don’t", + u"I'd", + u"I’d", ], ) def test_issue3521(en_tokenizer, word): From 85b4319f33df36884b42179bca0242dbf6c00f9c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 2 Apr 2019 15:05:31 +0200 Subject: [PATCH 5/6] specify encoding in files --- spacy/tests/regression/test_issue3449.py | 1 + spacy/tests/regression/test_issue3521.py | 1 + 2 files changed, 2 insertions(+) diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py index 40aa43bb7..9f670d5aa 100644 --- a/spacy/tests/regression/test_issue3449.py +++ b/spacy/tests/regression/test_issue3449.py @@ -1,3 +1,4 @@ +# coding: utf8 import pytest from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py index 1fe5e00d7..12f285099 100644 --- a/spacy/tests/regression/test_issue3521.py +++ b/spacy/tests/regression/test_issue3521.py @@ -1,3 +1,4 @@ +# coding: utf8 import pytest From 4ff786e1135846ceef761efb5706e90f551726ef Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Apr 2019 13:50:33 +0200 Subject: [PATCH 6/6] addressed all comments by Ines --- spacy/lang/en/stop_words.py | 9 ++++++--- spacy/tests/regression/test_issue3449.py | 2 ++ spacy/tests/regression/test_issue3521.py | 10 ++++++---- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 07d4ff34c..aae3e5e01 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -69,7 +69,10 @@ yet you your yours yourself yourselves """.split() ) -for hyphen in ["'", "`", "‘", "´", "’"]: - for stopword in u"n't 'd 'll 'm 're 's 've".split(): - STOP_WORDS.add(stopword.replace("'", hyphen)) +contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"] +STOP_WORDS.update(contractions) + +for apostrophe in ["‘", "’"]: + for stopword in contractions: + STOP_WORDS.add(stopword.replace("'", apostrophe)) diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py index 9f670d5aa..61a76334a 100644 --- a/spacy/tests/regression/test_issue3449.py +++ b/spacy/tests/regression/test_issue3449.py @@ -1,4 +1,6 @@ # coding: utf8 +from __future__ import unicode_literals + import pytest from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py index 12f285099..6d841894a 100644 --- a/spacy/tests/regression/test_issue3521.py +++ b/spacy/tests/regression/test_issue3521.py @@ -1,14 +1,16 @@ # coding: utf8 +from __future__ import unicode_literals + import pytest @pytest.mark.parametrize( "word", [ - u"don't", - u"don’t", - u"I'd", - u"I’d", + "don't", + "don’t", + "I'd", + "I’d", ], ) def test_issue3521(en_tokenizer, word):