mirror of https://github.com/explosion/spaCy.git
Merge pull request #3530 from svlandeg/fix/issue_3521
Allow English stopwords with any type of apostrophe
This commit is contained in:
commit
4faf62d515
|
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
|
|||
must my myself
|
||||
|
||||
name namely neither never nevertheless next nine no nobody none noone nor not
|
||||
nothing now nowhere n't
|
||||
nothing now nowhere
|
||||
|
||||
of off often on once one only onto or other others otherwise our ours ourselves
|
||||
out over own
|
||||
|
@ -66,7 +66,13 @@ whereafter whereas whereby wherein whereupon wherever whether which while
|
|||
whither who whoever whole whom whose why will with within without would
|
||||
|
||||
yet you your yours yourself yourselves
|
||||
|
||||
'd 'll 'm 're 's 've
|
||||
""".split()
|
||||
)
|
||||
|
||||
# Clitic forms written with a plain ASCII apostrophe.
contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]

# Register each clitic again with the typographic (curly) quote characters,
# so the same contraction is a stop word regardless of which apostrophe
# glyph appears in the text.
for apostrophe in ("‘", "’"):
    for stopword in contractions:
        STOP_WORDS.add(apostrophe.join(stopword.split("'")))

# The straight-apostrophe spellings themselves are stop words as well.
STOP_WORDS.update(contractions)
|
||||
|
||||
|
|
|
@ -0,0 +1,25 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
from spacy.lang.en import English
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="Current default suffix rules avoid one upper-case letter before a dot.")
def test_issue3449():
    """Regression test for issue 3449: a sentence-final "I." must yield "I" as its own token.

    Builds a blank English pipeline with a sentencizer and checks that the
    sixth token (index 5) of each input is exactly "I", i.e. that the
    trailing period has been split off. Marked xfail; see the reason above.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # The inputs differ only in the whitespace that follows "I.": a single
    # space, two spaces, and a newline. text2 used to be a byte-for-byte
    # duplicate of text1, which made its assertion redundant.
    # NOTE(review): the two-space variant is inferred from the pattern of the
    # other two inputs - confirm against the upstream test file.
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"

    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)

    assert t1[5].text == 'I'
    assert t2[5].text == 'I'
    assert t3[5].text == 'I'
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    """Regression test for issue 3521: contracted forms are stop words with either apostrophe.

    Each parametrized word is tokenized and its second token (index 1) is
    checked for the stop-word flag, for both the straight and the curly
    apostrophe spelling.
    """
    tokens = en_tokenizer(word)
    contraction = tokens[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert contraction.is_stop
|
Loading…
Reference in New Issue