from __future__ import unicode_literals

import pytest

from spacy.matcher import Matcher


@pytest.fixture
def matcher(EN):
    # Each entry maps an entity key to [entity type, attrs, list of
    # token-spec lists]; the specs sit inside an outer list because a
    # single entity may have several alternative patterns.
    patterns = {
        'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]],
        'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
        'Java': ['PRODUCT', {}, [[{'ORTH': 'Java'}]]],
    }
    return Matcher(EN.vocab, patterns)


def test_compile(matcher):
    assert matcher.n_patterns == 3


def test_no_match(matcher, EN):
    tokens = EN('I like cheese')
    assert matcher(tokens) == []


def test_match_start(matcher, EN):
    tokens = EN('JavaScript is good')
    assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 0, 1)]


def test_match_end(matcher, EN):
    tokens = EN('I like Java')
    assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)]


def test_match_middle(matcher, EN):
    tokens = EN('I like Google Now best')
    assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4)]


def test_match_multi(matcher, EN):
    tokens = EN('I like Google Now and Java best')
    assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4),
                               (EN.vocab.strings['PRODUCT'], 5, 6)]


def test_match_preserved(matcher, EN):
    # Without the matcher, the statistical NER does not recognise 'Java' here.
    doc = EN.tokenizer('I like Java')
    EN.tagger(doc)
    EN.entity(doc)
    assert len(doc.ents) == 0
    # Entities set by the matcher should survive a subsequent NER pass.
    doc = EN.tokenizer('I like Java')
    matcher(doc)
    assert len(doc.ents) == 1
    EN.tagger(doc)
    EN.entity(doc)
    assert len(doc.ents) == 1
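

# Hedged addition (not part of the original suite): ORTH patterns compare the
# exact verbatim token text, so a lowercased variant should not match. A
# minimal sketch that reuses only the fixture and call pattern shown above.
def test_no_match_lowercase(matcher, EN):
    tokens = EN('I like javascript')
    assert matcher(tokens) == []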