# Patch metadata (this text came from a git patch whose line breaks were lost):
#   From 4c87a696b3229bd41a111e607ced9495b5eef604 Mon Sep 17 00:00:00 2001
#   From: Matthew Honnibal
#   Date: Tue, 4 Aug 2015 15:55:28 +0200
#   Subject: [PATCH] * Add draft dfa matcher, in Python. Passing tests.
#   spacy/matcher.pyx     | 52 ++++ (new file, index 000000000..c19fd8919)
#   tests/test_matcher.py | 52 ++++ (new file)
#   2 files changed, 104 insertions(+)
#
# ---- spacy/matcher.pyx ----


class MatchState(object):
    """A non-final state of a compiled pattern: a test applied to one token.

    ``token_spec`` is an iterable of ``(attribute_name, expected_value)``
    pairs.  ``ext`` is the state reached once a token passes the test; it is
    either another ``MatchState`` or the pattern's ``EndState``.
    """

    def __init__(self, token_spec, ext):
        self.token_spec = token_spec
        self.ext = ext
        self.is_final = False

    def match(self, token):
        """Return True if no attribute in the spec differs from the token's.

        Each attribute is read with ``getattr(token, name)``.  An empty spec
        therefore accepts every token.
        """
        return not any(getattr(token, attr) != value
                       for attr, value in self.token_spec)

    def __repr__(self):
        # Pass a 1-tuple so that a tuple-valued spec is formatted as a single
        # value instead of being unpacked into several format arguments.
        return '<MatchState %r>' % (self.token_spec,)


class EndState(object):
    """The accepting state of a pattern.

    Stores the label to report (``entity_type``) and the number of tokens the
    pattern spans (``length``).
    """

    def __init__(self, entity_type, length):
        self.entity_type = entity_type
        self.length = length
        self.is_final = True

    def __call__(self, token):
        """Build the match tuple for a pattern whose last token is ``token``.

        Returns ``(entity_type, start, end)`` where ``end`` is ``token.i + 1``
        (one past the last matched token) and ``start`` is ``end - length``.
        """
        end = token.i + 1
        return (self.entity_type, end - self.length, end)

    def __repr__(self):
        # 1-tuple argument: a plain ``%`` with a str or tuple entity_type
        # would otherwise be misread as the format-argument list.
        return '<EndState %r>' % (self.entity_type,)


class Matcher(object):
    """Find occurrences of fixed token patterns in a token sequence.

    ``patterns`` is an iterable of ``(token_specs, entity_type)`` pairs, where
    ``token_specs`` is a sequence (it is passed to ``len`` and ``reversed``)
    holding one token spec per token of the pattern.  Each pattern is compiled
    into a chain ``MatchState -> ... -> MatchState -> EndState`` and the head
    of the chain is stored in ``start_states``.

    NOTE: a pattern with zero token specs compiles to a bare ``EndState``,
    which has no ``match`` method, so calling the matcher would fail on it.
    """

    def __init__(self, patterns):
        self.start_states = []
        for token_specs, entity_type in patterns:
            # Build the chain back to front so each state can point at its
            # successor at construction time.
            state = EndState(entity_type, len(token_specs))
            for spec in reversed(token_specs):
                state = MatchState(spec, state)
            self.start_states.append(state)

    def __call__(self, tokens):
        """Return a list of ``(entity_type, start, end)`` tuples for ``tokens``.

        Tokens must expose an ``i`` attribute (read when a match is emitted)
        and whatever attributes the pattern specs name.  Matches are appended
        in the order their final token is encountered.
        """
        active = list(self.start_states)
        matches = []
        for token in tokens:
            # Every pattern may begin at the next token, so the next round
            # always starts from a fresh copy of the start states.
            upcoming = list(self.start_states)
            for state in active:
                if not state.match(token):
                    continue
                successor = state.ext
                if successor.is_final:
                    matches.append(successor(token))
                else:
                    upcoming.append(successor)
            active = upcoming
        return matches


# Next file in the patch:
#   diff --git a/tests/test_matcher.py b/tests/test_matcher.py
#   new file mode 100644, index 000000000..391d9526c
# ---- tests/test_matcher.py (patch hunk: b/tests/test_matcher.py @@ -0,0 +1,52 @@) ----
"""Tests for the draft token-pattern ``Matcher`` in ``spacy.matcher``."""
from __future__ import unicode_literals

import pytest

# Explicit import: Matcher is the only name this module uses.
from spacy.matcher import Matcher


class MockToken(object):
    """Minimal token stand-in carrying an index ``i`` and a string ``orth_``."""

    def __init__(self, i, string):
        self.i = i
        self.orth_ = string


def make_tokens(string):
    """Split ``string`` on whitespace into ``MockToken``s indexed from 0."""
    return [MockToken(i, s) for i, s in enumerate(string.split())]


@pytest.fixture
def matcher():
    """A Matcher with three 'product' patterns, one of them two tokens long."""
    specs = []
    for string in ['JavaScript', 'Google Now', 'Java']:
        # One token spec per word; each spec is a list of (attr, value) pairs.
        spec = tuple([('orth_', orth)] for orth in string.split())
        specs.append((spec, 'product'))
    return Matcher(specs)


def test_compile(matcher):
    """Each pattern contributes exactly one start state."""
    assert len(matcher.start_states) == 3


def test_no_match(matcher):
    """Text containing none of the patterns yields no matches."""
    tokens = make_tokens('I like cheese')
    assert matcher(tokens) == []


def test_match_start(matcher):
    """A single-token pattern at the first position is found."""
    tokens = make_tokens('JavaScript is good')
    assert matcher(tokens) == [('product', 0, 1)]


def test_match_end(matcher):
    """A single-token pattern at the last position is found."""
    tokens = make_tokens('I like Java')
    assert matcher(tokens) == [('product', 2, 3)]


def test_match_middle(matcher):
    """A two-token pattern in the middle is reported as one span."""
    tokens = make_tokens('I like Google Now best')
    assert matcher(tokens) == [('product', 2, 4)]


def test_match_multi(matcher):
    """Several patterns in one text are all reported, in order of occurrence."""
    tokens = make_tokens('I like Google Now and Java best')
    assert matcher(tokens) == [('product', 2, 4), ('product', 5, 6)]