From 1424b12b0912e77adaf954a5f9dcd79b6a40d00c Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Apr 2019 13:06:37 +0200
Subject: [PATCH 1/6] failing test for Issue #3449

---
 spacy/tests/regression/test_issue3449.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue3449.py

diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py
new file mode 100644
index 000000000..40aa43bb7
--- /dev/null
+++ b/spacy/tests/regression/test_issue3449.py
@@ -0,0 +1,22 @@
+import pytest
+
+from spacy.lang.en import English
+
+
+@pytest.mark.xfail(reason="Current default suffix rules avoid one upper-case letter before a dot.")
+def test_issue3449():
+    nlp = English()
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+
+    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
+    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
+    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
+
+    t1 = nlp(text1)
+    t2 = nlp(text2)
+    t3 = nlp(text3)
+
+    assert t1[5].text == 'I'
+    assert t2[5].text == 'I'
+    assert t3[5].text == 'I'
+

From e7062cf699bdc39d9d5e3aec31bcd073bae46ee5 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Apr 2019 13:15:35 +0200
Subject: [PATCH 2/6] failing test for Issue #3521

---
 spacy/tests/regression/test_issue3521.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue3521.py

diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py
new file mode 100644
index 000000000..a971ab4c6
--- /dev/null
+++ b/spacy/tests/regression/test_issue3521.py
@@ -0,0 +1,20 @@
+import pytest
+
+from spacy.lang.en import English
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "don't",
+        "don’t",
+        "I'd",
+        "I’d",
+    ],
+)
+def test_issue3521(fr_tokenizer, word):
+    nlp = English()
+
+    tok = nlp(word)[1]
+    assert tok.is_stop
+

From eca9cc541792a62073cdf035f880c46850bc5098 Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Apr 2019 13:24:59 +0200
Subject: [PATCH 3/6] fixing Issue #3521 by adding all apostrophe variants for
 each stopword

---
 spacy/lang/en/stop_words.py              |  9 ++++++---
 spacy/tests/regression/test_issue3521.py | 10 +++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py
index 4301e7d86..bdc36bdd7 100644
--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
 must my myself
 
 name namely neither never nevertheless next nine no nobody none noone nor not
-nothing now nowhere n't
+nothing now nowhere 
 
 of off often on once one only onto or other others otherwise our ours ourselves
 out over own
@@ -66,7 +66,10 @@ whereafter whereas whereby wherein whereupon wherever whether which while
 whither who whoever whole whom whose why will with within without would
 
 yet you your yours yourself yourselves
-
-'d 'll 'm 're 's 've
 """.split()
 )
+
+for hyphen in ["'", "`", "‘", "´", "’"]:
+    for stopword in "n't 'd 'll 'm 're 's 've".split():
+        STOP_WORDS.add(stopword.replace("'", hyphen))
+
diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py
index a971ab4c6..2a4568a17 100644
--- a/spacy/tests/regression/test_issue3521.py
+++ b/spacy/tests/regression/test_issue3521.py
@@ -1,7 +1,5 @@
 import pytest
 
-from spacy.lang.en import English
-
 
 @pytest.mark.parametrize(
     "word",
@@ -12,9 +10,7 @@ from spacy.lang.en import English
         "I’d",
     ],
 )
-def test_issue3521(fr_tokenizer, word):
-    nlp = English()
-
-    tok = nlp(word)[1]
+def test_issue3521(en_tokenizer, word):
+    tok = en_tokenizer(word)[1]
+    # 'not' and 'would' should be stopwords, also in their abbreviated forms
     assert tok.is_stop
-

From 673c81bbb4c832c645768302415108327f9c878a Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Apr 2019 13:52:07 +0200
Subject: [PATCH 4/6] unicode string for python 2.7

---
 spacy/lang/en/stop_words.py              | 2 +-
 spacy/tests/regression/test_issue3521.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py
index bdc36bdd7..07d4ff34c 100644
--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@@ -70,6 +70,6 @@ yet you your yours yourself yourselves
 )
 
 for hyphen in ["'", "`", "‘", "´", "’"]:
-    for stopword in "n't 'd 'll 'm 're 's 've".split():
+    for stopword in u"n't 'd 'll 'm 're 's 've".split():
         STOP_WORDS.add(stopword.replace("'", hyphen))
 
diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py
index 2a4568a17..1fe5e00d7 100644
--- a/spacy/tests/regression/test_issue3521.py
+++ b/spacy/tests/regression/test_issue3521.py
@@ -4,10 +4,10 @@ import pytest
 @pytest.mark.parametrize(
     "word",
     [
-        "don't",
-        "don’t",
-        "I'd",
-        "I’d",
+        u"don't",
+        u"don’t",
+        u"I'd",
+        u"I’d",
     ],
 )
 def test_issue3521(en_tokenizer, word):

From 85b4319f33df36884b42179bca0242dbf6c00f9c Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Tue, 2 Apr 2019 15:05:31 +0200
Subject: [PATCH 5/6] specify encoding in files

---
 spacy/tests/regression/test_issue3449.py | 1 +
 spacy/tests/regression/test_issue3521.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py
index 40aa43bb7..9f670d5aa 100644
--- a/spacy/tests/regression/test_issue3449.py
+++ b/spacy/tests/regression/test_issue3449.py
@@ -1,3 +1,4 @@
+# coding: utf8
 import pytest
 
 from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py
index 1fe5e00d7..12f285099 100644
--- a/spacy/tests/regression/test_issue3521.py
+++ b/spacy/tests/regression/test_issue3521.py
@@ -1,3 +1,4 @@
+# coding: utf8
 import pytest
 
 

From 4ff786e1135846ceef761efb5706e90f551726ef Mon Sep 17 00:00:00 2001
From: svlandeg <sofie.vanlandeghem@gmail.com>
Date: Wed, 3 Apr 2019 13:50:33 +0200
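Subject: [PATCH 6/6] addressed all comments by Ines

Collect the contraction stop words in a single list that is added to
STOP_WORDS as-is, and generate extra variants only for the left and right
single quotation marks. Rename the loop variable from "hyphen" to
"apostrophe". Use unicode_literals in the regression tests instead of
u"" string prefixes.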

---
 spacy/lang/en/stop_words.py              |  9 ++++++---
 spacy/tests/regression/test_issue3449.py |  2 ++
 spacy/tests/regression/test_issue3521.py | 10 ++++++----
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py
index 07d4ff34c..aae3e5e01 100644
--- a/spacy/lang/en/stop_words.py
+++ b/spacy/lang/en/stop_words.py
@@ -69,7 +69,10 @@ yet you your yours yourself yourselves
 """.split()
 )
 
-for hyphen in ["'", "`", "‘", "´", "’"]:
-    for stopword in u"n't 'd 'll 'm 're 's 've".split():
-        STOP_WORDS.add(stopword.replace("'", hyphen))
+contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
+STOP_WORDS.update(contractions)
+
+for apostrophe in ["‘", "’"]:
+    for stopword in contractions:
+        STOP_WORDS.add(stopword.replace("'", apostrophe))
 
diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py
index 9f670d5aa..61a76334a 100644
--- a/spacy/tests/regression/test_issue3449.py
+++ b/spacy/tests/regression/test_issue3449.py
@@ -1,4 +1,6 @@
 # coding: utf8
+from __future__ import unicode_literals
+
 import pytest
 
 from spacy.lang.en import English
diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py
index 12f285099..6d841894a 100644
--- a/spacy/tests/regression/test_issue3521.py
+++ b/spacy/tests/regression/test_issue3521.py
@@ -1,14 +1,16 @@
 # coding: utf8
+from __future__ import unicode_literals
+
 import pytest
 
 
 @pytest.mark.parametrize(
     "word",
     [
-        u"don't",
-        u"don’t",
-        u"I'd",
-        u"I’d",
+        "don't",
+        "don’t",
+        "I'd",
+        "I’d",
     ],
 )
 def test_issue3521(en_tokenizer, word):