From 7ba4111554cd8637763947d21decbab25bd95369 Mon Sep 17 00:00:00 2001
From: Aaron Marquez
Date: Thu, 15 Feb 2018 12:46:22 -0800
Subject: [PATCH] Add test for issue-1959

---
Notes (free-form text in this area is not part of the applied change):
  * Adds one new file containing a regression test, test_issue1959, and a
    helper pipeline component, clean_component.
  * test_issue1959 loads 'en_core_web_sm', registers clean_component under
    the name 'cleaner' after 'ner', and asserts that nlp(text) equals the
    first item produced by nlp.pipe([text]).
  * clean_component returns a plain str (lowercased token texts joined by
    single spaces, skipping stop words and tokens tagged PUNCT or SYM), not
    the object it was given. NOTE(review): this looks intentional for
    reproducing the issue -- confirm before "fixing" the return type.

 spacy/tests/regression/test_issue1959.py | 25 ++++++++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 spacy/tests/regression/test_issue1959.py

diff --git a/spacy/tests/regression/test_issue1959.py b/spacy/tests/regression/test_issue1959.py
new file mode 100644
index 000000000..2605b0088
--- /dev/null
+++ b/spacy/tests/regression/test_issue1959.py
@@ -0,0 +1,25 @@
+# coding: utf8
+from __future__ import unicode_literals
+import pytest
+
+from ..util import load_test_model
+
+
+@pytest.mark.models('en')
+def test_issue1959():
+    texts = ['Apple is looking at buying U.K. startup for $1 billion.']
+    nlp = load_test_model('en_core_web_sm')
+    nlp.add_pipe(clean_component, name='cleaner', after='ner')
+    doc = nlp(texts[0])
+    doc_pipe = [doc_pipe for doc_pipe in nlp.pipe(texts)]
+    assert doc == doc_pipe[0]
+
+
+def clean_component(doc):
+    """ Clean up text. Make lowercase and remove punctuation and stopwords """
+    # Remove punctuation, symbols (#) and stopwords
+    doc = [tok.text.lower() for tok in doc if (not tok.is_stop
+                                               and tok.pos_ != 'PUNCT' and
+                                               tok.pos_ != 'SYM')]
+    doc = ' '.join(doc)
+    return doc