mirror of https://github.com/explosion/spaCy.git
Merge pull request #3530 from svlandeg/fix/issue_3521
Allow English stopwords with any type of apostrophe
This commit is contained in:
commit
4faf62d515
|
@ -39,7 +39,7 @@ made make many may me meanwhile might mine more moreover most mostly move much
|
||||||
must my myself
|
must my myself
|
||||||
|
|
||||||
name namely neither never nevertheless next nine no nobody none noone nor not
|
name namely neither never nevertheless next nine no nobody none noone nor not
|
||||||
nothing now nowhere n't
|
nothing now nowhere
|
||||||
|
|
||||||
of off often on once one only onto or other others otherwise our ours ourselves
|
of off often on once one only onto or other others otherwise our ours ourselves
|
||||||
out over own
|
out over own
|
||||||
|
@ -66,7 +66,13 @@ whereafter whereas whereby wherein whereupon wherever whether which while
|
||||||
whither who whoever whole whom whose why will with within without would
|
whither who whoever whole whom whose why will with within without would
|
||||||
|
|
||||||
yet you your yours yourself yourselves
|
yet you your yours yourself yourselves
|
||||||
|
|
||||||
'd 'll 'm 're 's 've
|
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Contracted forms (e.g. the "n't" in "don't") count as stop words too.
contractions = ["n't", "'d", "'ll", "'m", "'re", "'s", "'ve"]
STOP_WORDS.update(contractions)

# Register every contraction again with each typographic apostrophe, so the
# stop-word lookup does not depend on which apostrophe character the text uses.
for apostrophe in ["‘", "’"]:
    STOP_WORDS.update(
        stopword.replace("'", apostrophe) for stopword in contractions
    )
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from spacy.lang.en import English
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail(reason="Current default suffix rules avoid one upper-case letter before a dot.")
def test_issue3449():
    """Check that the token at index 5 is a bare 'I' when "I." ends a sentence.

    Marked xfail: per the reason above, the default suffix rules do not split
    a single upper-case letter from a following dot.
    """
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # NOTE(review): the first two texts are identical as written here;
    # presumably they were meant to differ in the whitespace after "I." --
    # confirm against the upstream test before relying on this.
    texts = (
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I. Do you want to go to the movies with I?",
        "He gave the ball to I.\nDo you want to go to the movies with I?",
    )

    # Parse every text before asserting anything, then check them in order.
    docs = [nlp(text) for text in texts]
    for doc in docs:
        assert doc[5].text == 'I'
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("word", ["don't", "don’t", "I'd", "I’d"])
def test_issue3521(en_tokenizer, word):
    """Check that the contracted part of *word* is flagged as a stop word.

    Each word is given once with a straight apostrophe and once with a
    typographic one; the second token of the tokenized word must be a stop
    word in both spellings.
    """
    tokens = en_tokenizer(word)
    contraction = tokens[1]
    # 'not' and 'would' should be stopwords, also in their abbreviated forms
    assert contraction.is_stop
|
Loading…
Reference in New Issue