Merge pull request #3530 from svlandeg/fix/issue_3521

Allow English stopwords with any type of apostrophe
This commit is contained in:
Ines Montani 2019-04-03 14:14:03 +02:00 committed by GitHub
commit 4faf62d515
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 53 additions and 3 deletions

View File

@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
must my myself must my myself
name namely neither never nevertheless next nine no nobody none noone nor not name namely neither never nevertheless next nine no nobody none noone nor not
nothing now nowhere n't nothing now nowhere
of off often on once one only onto or other others otherwise our ours ourselves of off often on once one only onto or other others otherwise our ours ourselves
out over own out over own
@ -66,7 +66,13 @@ whereafter whereas whereby wherein whereupon wherever whether which while
whither who whoever whole whom whose why will with within without would whither who whoever whole whom whose why will with within without would
yet you your yours yourself yourselves yet you your yours yourself yourselves
'd 'll 'm 're 's 've
""".split() """.split()
) )
contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
STOP_WORDS.update(contractions)
for apostrophe in ["", ""]:
for stopword in contractions:
STOP_WORDS.add(stopword.replace("'", apostrophe))

View File

@ -0,0 +1,25 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from spacy.lang.en import English
@pytest.mark.xfail(reason="Current default suffix rules avoid one upper-case letter before a dot.")
def test_issue3449():
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
text1 = "He gave the ball to I. Do you want to go to the movies with I?"
text2 = "He gave the ball to I. Do you want to go to the movies with I?"
text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
t1 = nlp(text1)
t2 = nlp(text2)
t3 = nlp(text3)
assert t1[5].text == 'I'
assert t2[5].text == 'I'
assert t3[5].text == 'I'

View File

@ -0,0 +1,19 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
@pytest.mark.parametrize(
"word",
[
"don't",
"dont",
"I'd",
"Id",
],
)
def test_issue3521(en_tokenizer, word):
tok = en_tokenizer(word)[1]
# 'not' and 'would' should be stopwords, also in their abbreviated forms
assert tok.is_stop