diff --git a/spacy/tests/matcher/test_matcher.py b/spacy/tests/matcher/test_matcher.py index 1b75f4f92..95ef930ea 100644 --- a/spacy/tests/matcher/test_matcher.py +++ b/spacy/tests/matcher/test_matcher.py @@ -1,15 +1,107 @@ # coding: utf-8 from __future__ import unicode_literals -from ...matcher import Matcher +from ...matcher import Matcher, PhraseMatcher from ..util import get_doc import pytest +@pytest.fixture +def matcher(en_vocab): + patterns = { + 'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], + 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], + 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]] + } + return Matcher(en_vocab, patterns) + + @pytest.mark.parametrize('words', [["Some", "words"]]) def test_matcher_init(en_vocab, words): matcher = Matcher(en_vocab) doc = get_doc(en_vocab, words) assert matcher.n_patterns == 0 assert matcher(doc) == [] + + +def test_matcher_no_match(matcher): + words = ["I", "like", "cheese", "."] + doc = get_doc(matcher.vocab, words) + assert matcher(doc) == [] + + +def test_matcher_compile(matcher): + assert matcher.n_patterns == 3 + + +def test_matcher_match_start(matcher): + words = ["JavaScript", "is", "good"] + doc = get_doc(matcher.vocab, words) + assert matcher(doc) == [(matcher.vocab.strings['JS'], + matcher.vocab.strings['PRODUCT'], 0, 1)] + + +def test_matcher_match_end(matcher): + words = ["I", "like", "java"] + doc = get_doc(matcher.vocab, words) + assert matcher(doc) == [(doc.vocab.strings['Java'], + doc.vocab.strings['PRODUCT'], 2, 3)] + + +def test_matcher_match_middle(matcher): + words = ["I", "like", "Google", "Now", "best"] + doc = get_doc(matcher.vocab, words) + assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], + doc.vocab.strings['PRODUCT'], 2, 4)] + + +def test_matcher_match_multi(matcher): + words = ["I", "like", "Google", "Now", "and", "java", "best"] + doc = get_doc(matcher.vocab, words) + assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], + doc.vocab.strings['PRODUCT'], 2, 4), + (doc.vocab.strings['Java'], + doc.vocab.strings['PRODUCT'], 5, 6)] + + +def test_matcher_phrase_matcher(en_vocab): + words = ["Google", "Now"] + doc = get_doc(en_vocab, words) + matcher = PhraseMatcher(en_vocab, [doc]) + words = ["I", "like", "Google", "Now", "best"] + doc = get_doc(en_vocab, words) + assert len(matcher(doc)) == 1 + + +def test_matcher_match_zero(matcher): + words1 = 'He said , " some words " ...'.split() + words2 = 'He said , " some three words " ...'.split() + pattern1 = [{'ORTH': '"'}, + {'OP': '!', 'IS_PUNCT': True}, + {'OP': '!', 'IS_PUNCT': True}, + {'ORTH': '"'}] + pattern2 = [{'ORTH': '"'}, + {'IS_PUNCT': True}, + {'IS_PUNCT': True}, + {'IS_PUNCT': True}, + {'ORTH': '"'}] + + matcher.add('Quote', '', {}, [pattern1]) + doc = get_doc(matcher.vocab, words1) + assert len(matcher(doc)) == 1 + + doc = get_doc(matcher.vocab, words2) + assert len(matcher(doc)) == 0 + matcher.add('Quote', '', {}, [pattern2]) + assert len(matcher(doc)) == 0 + + +def test_matcher_match_zero_plus(matcher): + words = 'He said , " some words " ...'.split() + pattern = [{'ORTH': '"'}, + {'OP': '*', 'IS_PUNCT': False}, + {'ORTH': '"'}] + matcher.add('Quote', '', {}, [pattern]) + doc = get_doc(matcher.vocab, words) + assert len(matcher(doc)) == 1 diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py deleted file mode 100644 index 86701bbca..000000000 --- a/spacy/tests/test_matcher.py +++ /dev/null @@ -1,94 +0,0 @@ -from __future__ import unicode_literals -import pytest - -from spacy.strings import StringStore -from spacy.matcher import * -from spacy.attrs import LOWER -from spacy.tokens.doc import Doc -from spacy.vocab import Vocab -from spacy.en import English - - -@pytest.fixture -def matcher(): - patterns = { - 'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], - 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], - 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]], - } - return Matcher(Vocab(lex_attr_getters=English.Defaults.lex_attr_getters), patterns) - - -def test_compile(matcher): - assert matcher.n_patterns == 3 - - -def test_no_match(matcher): - doc = Doc(matcher.vocab, words=['I', 'like', 'cheese', '.']) - assert matcher(doc) == [] - - -def test_match_start(matcher): - doc = Doc(matcher.vocab, words=['JavaScript', 'is', 'good']) - assert matcher(doc) == [(matcher.vocab.strings['JS'], - matcher.vocab.strings['PRODUCT'], 0, 1)] - - -def test_match_end(matcher): - doc = Doc(matcher.vocab, words=['I', 'like', 'java']) - assert matcher(doc) == [(doc.vocab.strings['Java'], - doc.vocab.strings['PRODUCT'], 2, 3)] - - -def test_match_middle(matcher): - doc = Doc(matcher.vocab, words=['I', 'like', 'Google', 'Now', 'best']) - assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], - doc.vocab.strings['PRODUCT'], 2, 4)] - - -def test_match_multi(matcher): - doc = Doc(matcher.vocab, words='I like Google Now and java best'.split()) - assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], - doc.vocab.strings['PRODUCT'], 2, 4), - (doc.vocab.strings['Java'], - doc.vocab.strings['PRODUCT'], 5, 6)] - -def test_match_zero(matcher): - matcher.add('Quote', '', {}, [ - [ - {'ORTH': '"'}, - {'OP': '!', 'IS_PUNCT': True}, - {'OP': '!', 'IS_PUNCT': True}, - {'ORTH': '"'} - ]]) - doc = Doc(matcher.vocab, words='He said , " some words " ...'.split()) - assert len(matcher(doc)) == 1 - doc = Doc(matcher.vocab, words='He said , " some three words " ...'.split()) - assert len(matcher(doc)) == 0 - matcher.add('Quote', '', {}, [ - [ - {'ORTH': '"'}, - {'IS_PUNCT': True}, - {'IS_PUNCT': True}, - {'IS_PUNCT': True}, - {'ORTH': '"'} - ]]) - assert len(matcher(doc)) == 0 - - -def test_match_zero_plus(matcher): - matcher.add('Quote', '', {}, [ - [ - {'ORTH': '"'}, - {'OP': '*', 'IS_PUNCT': False}, - {'ORTH': '"'} - ]]) - doc = Doc(matcher.vocab, words='He said , " some words " ...'.split()) - assert len(matcher(doc)) == 1 - - -def test_phrase_matcher(): - vocab = Vocab(lex_attr_getters=English.Defaults.lex_attr_getters) - matcher = PhraseMatcher(vocab, [Doc(vocab, words='Google Now'.split())]) - doc = Doc(vocab, words=['I', 'like', 'Google', 'Now', 'best']) - assert len(matcher(doc)) == 1