mirror of https://github.com/explosion/spaCy.git
* Add draft dfa matcher, in Python. Passing tests.
This commit is contained in:
parent
eb7138c761
commit
4c87a696b3
|
@ -0,0 +1,52 @@
|
||||||
|
class MatchState(object):
    """A non-final state in a pattern chain.

    Tests one token against a sequence of (attribute, value) constraints;
    ``ext`` points at the next state to advance to on a successful match.
    """
    def __init__(self, token_spec, ext):
        self.token_spec = token_spec
        self.ext = ext
        # Non-final: matching here never emits an entity by itself.
        self.is_final = False

    def match(self, token):
        # The token matches only when every constraint holds.
        return all(getattr(token, name) == expected
                   for name, expected in self.token_spec)

    def __repr__(self):
        return '<spec %s>' % (self.token_spec)
|
||||||
|
|
||||||
|
|
||||||
|
class EndState(object):
    """Final state of a pattern chain: emits the matched span.

    Calling the state with the last matched token produces a
    ``(entity_type, start, end)`` triple, where ``end`` is exclusive.
    """
    def __init__(self, entity_type, length):
        self.entity_type = entity_type
        # Number of tokens the whole pattern consumed; used to find the start.
        self.length = length
        self.is_final = True

    def __call__(self, token):
        # token.i is the index of the pattern's final token.
        end = token.i + 1
        return (self.entity_type, end - self.length, end)

    def __repr__(self):
        return '<end %s>' % (self.entity_type)
|
||||||
|
|
||||||
|
|
||||||
|
class Matcher(object):
    """Match token patterns with a simple state-machine sweep.

    Each pattern compiles to a linked chain of MatchState nodes ending in
    an EndState.  Matching advances every live state over the tokens in
    lock-step, restarting all patterns at each position.
    """
    def __init__(self, patterns):
        self.start_states = []
        for token_specs, entity_type in patterns:
            # Build the chain back-to-front so each node links forward.
            node = EndState(entity_type, len(token_specs))
            for spec in reversed(token_specs):
                node = MatchState(spec, node)
            self.start_states.append(node)

    def __call__(self, tokens):
        matches = []
        active = list(self.start_states)
        for token in tokens:
            # Every pattern may also begin at this position.
            survivors = list(self.start_states)
            for state in active:
                if not state.match(token):
                    continue
                nxt = state.ext
                if nxt.is_final:
                    matches.append(nxt(token))
                else:
                    survivors.append(nxt)
            active = survivors
        return matches
|
|
@ -0,0 +1,52 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from spacy.matcher import *
|
||||||
|
|
||||||
|
|
||||||
|
class MockToken(object):
    """Minimal stand-in for a spaCy token: a position and a surface form."""
    def __init__(self, i, string):
        # Mirror the attributes the matcher reads: .orth_ and .i
        self.orth_ = string
        self.i = i
|
||||||
|
|
||||||
|
|
||||||
|
def make_tokens(string):
    """Split *string* on whitespace and wrap each word in a MockToken."""
    tokens = []
    for position, word in enumerate(string.split()):
        tokens.append(MockToken(position, word))
    return tokens
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def matcher():
    """Build a Matcher over three product-name patterns."""
    patterns = []
    for name in ['JavaScript', 'Google Now', 'Java']:
        # One single-constraint spec per word in the pattern.
        token_specs = tuple([('orth_', word)] for word in name.split())
        patterns.append((token_specs, 'product'))
    return Matcher(patterns)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compile(matcher):
    """All three patterns compile to start states."""
    assert len(matcher.start_states) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_match(matcher):
    """A sentence containing no pattern yields no matches."""
    doc = make_tokens('I like cheese')
    assert matcher(doc) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_start(matcher):
    """A pattern at position 0 is found."""
    doc = make_tokens('JavaScript is good')
    assert matcher(doc) == [('product', 0, 1)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_end(matcher):
    """A pattern ending on the final token is found."""
    doc = make_tokens('I like Java')
    assert matcher(doc) == [('product', 2, 3)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_middle(matcher):
    """A multi-token pattern in mid-sentence is found with correct bounds."""
    doc = make_tokens('I like Google Now best')
    assert matcher(doc) == [('product', 2, 4)]
|
||||||
|
|
||||||
|
|
||||||
|
def test_match_multi(matcher):
    """Several patterns can match in one sentence, in document order."""
    doc = make_tokens('I like Google Now and Java best')
    assert matcher(doc) == [('product', 2, 4), ('product', 5, 6)]
|
Loading…
Reference in New Issue