From 049c9375404bb333f8160a26a4e41959a613e7d3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 17 Oct 2016 15:58:57 +0200 Subject: [PATCH] Have the matcher return character offsets, to handle the match better. --- spacy/matcher.pyx | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index b6c2def8e..98772e271 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -291,8 +291,8 @@ cdef class Matcher: elif action == ACCEPT: # TODO: What to do about patterns starting with ZERO? Need to # adjust the start position. - start = state.first - end = token_i+1 + start = doc.c[state.first].idx + end = doc.c[token_i].idx + doc.c[token_i].lex.length ent_id = state.second[1].attrs[0].value label = state.second[1].attrs[1].value acceptor = self._acceptors.get(ent_id) @@ -319,12 +319,16 @@ cdef class Matcher: state.second = pattern + 1 partials.push_back(state) elif action == ACCEPT: - start = token_i - end = token_i+1 + start = token.idx + end = token.idx + token.lex.length ent_id = pattern[1].attrs[0].value label = pattern[1].attrs[1].value - if acceptor is None or acceptor(doc, ent_id, label, start, end): - matches.append((ent_id, label, start, end)) + acceptor = self._acceptors.get(ent_id) + if acceptor is not None: + match = acceptor(doc, ent_id, label, start, end) + if match: + ent_id, label, start, end = match + matches.append((ent_id, label, start, end)) for i, (ent_id, label, start, end) in enumerate(matches): on_match = self._callbacks.get(ent_id) if on_match is not None: