From 7d446e509418fcff7e216bd008714b886923dc7e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 17 Oct 2016 16:49:49 +0200 Subject: [PATCH] Revert "Update matcher test, to reflect character offset return instead of token offset." This reverts commit f8d3e3bcfe4cd6543a30c7e2083e97d8de9c0f10. --- spacy/tests/test_matcher.py | 49 +++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 2ee94cb87..9f04a7a0d 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -31,32 +31,27 @@ def test_no_match(matcher): def test_match_start(matcher): doc = Doc(matcher.vocab, words=['JavaScript', 'is', 'good']) assert matcher(doc) == [(matcher.vocab.strings['JS'], - matcher.vocab.strings['PRODUCT'], 0, len('JavaScript'))] + matcher.vocab.strings['PRODUCT'], 0, 1)] def test_match_end(matcher): doc = Doc(matcher.vocab, words=['I', 'like', 'java']) assert matcher(doc) == [(doc.vocab.strings['Java'], - doc.vocab.strings['PRODUCT'], - len('I like '), len('I like java'))] + doc.vocab.strings['PRODUCT'], 2, 3)] def test_match_middle(matcher): doc = Doc(matcher.vocab, words=['I', 'like', 'Google', 'Now', 'best']) assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], - doc.vocab.strings['PRODUCT'], len('I like '), - len('I like Google Now'))] + doc.vocab.strings['PRODUCT'], 2, 4)] def test_match_multi(matcher): doc = Doc(matcher.vocab, words='I like Google Now and java best'.split()) assert matcher(doc) == [(doc.vocab.strings['GoogleNow'], - doc.vocab.strings['PRODUCT'], len('I like '), - len('I like Google Now')), + doc.vocab.strings['PRODUCT'], 2, 4), (doc.vocab.strings['Java'], - doc.vocab.strings['PRODUCT'], - len('I like Google Now and '), - len('I like Google Now and java'))] + doc.vocab.strings['PRODUCT'], 5, 6)] def test_match_zero(matcher): matcher.add('Quote', '', {}, [ @@ -92,20 +87,20 @@ def test_match_zero_plus(matcher): assert len(matcher(doc)) == 1 -@pytest.mark.models -def test_match_preserved(EN): - patterns = { - 'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], - 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], - 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]], - } - matcher = Matcher(EN.vocab, patterns) - doc = EN.tokenizer('I like java.') - EN.tagger(doc) - assert len(doc.ents) == 0 - doc = EN.tokenizer('I like java.') - doc.ents += tuple(matcher(doc)) - assert len(doc.ents) == 1 - EN.tagger(doc) - EN.entity(doc) - assert len(doc.ents) == 1 +#@pytest.mark.models +#def test_match_preserved(EN): +# patterns = { +# 'JS': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], +# 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], +# 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]], +# } +# matcher = Matcher(EN.vocab, patterns) +# doc = EN.tokenizer('I like java.') +# EN.tagger(doc) +# assert len(doc.ents) == 0 +# doc = EN.tokenizer('I like java.') +# doc.ents += tuple(matcher(doc)) +# assert len(doc.ents) == 1 +# EN.tagger(doc) +# EN.entity(doc) +# assert len(doc.ents) == 1