# spaCy/spacy/matcher.pyx

# cython: profile=True
# cython: infer_types=True
# coding: utf8
from __future__ import unicode_literals

import ujson

from .typedefs cimport attr_t
from .typedefs cimport hash_t
from .attrs cimport attr_id_t
from .structs cimport TokenC

from cymem.cymem cimport Pool
from preshed.maps cimport PreshMap
from libcpp.vector cimport vector
from libcpp.pair cimport pair
from murmurhash.mrmr cimport hash64
from libc.stdint cimport int32_t

from .attrs cimport ID, ENT_TYPE
from . import attrs
from .tokens.doc cimport get_token_attr
from .tokens.doc cimport Doc
from .vocab cimport Vocab

from .attrs import FLAG61 as U_ENT

from .attrs import FLAG60 as B2_ENT
from .attrs import FLAG59 as B3_ENT
from .attrs import FLAG58 as B4_ENT
from .attrs import FLAG57 as B5_ENT
from .attrs import FLAG56 as B6_ENT
from .attrs import FLAG55 as B7_ENT
from .attrs import FLAG54 as B8_ENT
from .attrs import FLAG53 as B9_ENT
from .attrs import FLAG52 as B10_ENT

from .attrs import FLAG51 as I3_ENT
from .attrs import FLAG50 as I4_ENT
from .attrs import FLAG49 as I5_ENT
from .attrs import FLAG48 as I6_ENT
from .attrs import FLAG47 as I7_ENT
from .attrs import FLAG46 as I8_ENT
from .attrs import FLAG45 as I9_ENT
from .attrs import FLAG44 as I10_ENT

from .attrs import FLAG43 as L2_ENT
from .attrs import FLAG42 as L3_ENT
from .attrs import FLAG41 as L4_ENT
from .attrs import FLAG40 as L5_ENT
from .attrs import FLAG39 as L6_ENT
from .attrs import FLAG38 as L7_ENT
from .attrs import FLAG37 as L8_ENT
from .attrs import FLAG36 as L9_ENT
from .attrs import FLAG35 as L10_ENT


# Quantifier attached to every token pattern (TokenPatternC.quantifier).
# _convert_strings maps the user-facing 'OP' strings onto these values:
# '!' -> ZERO, '*' -> ZERO_PLUS, '?' -> ZERO_ONE, '1' -> ONE, and '+' is
# expanded into a ONE entry followed by a ZERO_PLUS entry.
# The member order fixes the integer values, so it must not be rearranged.
cpdef enum quantifier_t:
    _META  # never assigned explicitly in this file; presumably the value left on the terminal entry -- TODO confirm
    ONE  # token must match; a mismatch rejects the candidate (see get_action)
    ZERO  # token must NOT match; a matching token rejects the candidate
    ZERO_ONE  # a matching token is consumed; a mismatch skips this pattern
    ZERO_PLUS  # a matching token keeps the state on this pattern (REPEAT)


# Result codes returned by get_action and interpreted by Matcher.__call__.
cdef enum action_t:
    REJECT  # drop the candidate state
    ADVANCE  # token consumed; continue with the next token pattern
    REPEAT  # token consumed; stay on the same token pattern
    ACCEPT  # the pattern is complete; a match is recorded
    ADVANCE_ZERO  # skip this pattern and re-evaluate the next one on the same token
    PANIC  # quantifier value was not recognised; the caller raises an Exception


# One attribute constraint: get_action compares get_token_attr(token, attr)
# against `value` and treats any difference as a mismatch.
cdef struct AttrValueC:
    attr_id_t attr
    attr_t value


# One entry of a compiled pattern array built by init_pattern.
# Regular entries hold `nr_attr` constraints in `attrs` plus a quantifier.
# The array ends with a terminal entry that has nr_attr == 0 and whose
# attrs[0] stores (ID, key) for the rule the pattern belongs to.
cdef struct TokenPatternC:
    AttrValueC* attrs
    int32_t nr_attr
    quantifier_t quantifier


# Partial-match state used by Matcher.__call__: `first` is the index of the
# token where the candidate started, `second` points at the token pattern
# that has to be matched next.
ctypedef TokenPatternC* TokenPatternC_ptr
ctypedef pair[int, TokenPatternC_ptr] StateC


# Compile `token_specs` into a C array of TokenPatternC entries owned by `mem`.
#
# mem: pool that owns every allocation made here.
# entity_id: key of the rule; written into the terminal entry as (ID, key).
# token_specs: sequence of (quantifier, spec) pairs, where `spec` is a
#     sequence of (attr, value) pairs.
# Returns a pointer to len(token_specs) + 1 entries; the extra last entry is
# the terminal one, marked by nr_attr == 0.
# NOTE(review): a spec with no attributes also gets nr_attr == 0 and is then
# indistinguishable from the terminal entry for get_pattern_key/get_action.
cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
                                 object token_specs) except NULL:
    cdef int i
    cdef TokenPatternC* terminal
    n_specs = len(token_specs)
    # One slot per token spec plus one for the terminal entry.
    pattern = <TokenPatternC*>mem.alloc(n_specs + 1, sizeof(TokenPatternC))
    for i, (quantifier, spec) in enumerate(token_specs):
        n_attrs = len(spec)
        pattern[i].quantifier = quantifier
        pattern[i].nr_attr = n_attrs
        pattern[i].attrs = <AttrValueC*>mem.alloc(n_attrs, sizeof(AttrValueC))
        for attr_i, (attr, value) in enumerate(spec):
            pattern[i].attrs[attr_i].attr = attr
            pattern[i].attrs[attr_i].value = value
    # Terminal entry: two attribute slots are reserved, only the first
    # (the rule key) is filled in here.
    terminal = &pattern[n_specs]
    terminal.attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
    terminal.attrs[0].attr = ID
    terminal.attrs[0].value = entity_id
    terminal.nr_attr = 0
    return pattern


# Return the rule key stored on the terminal entry of a compiled pattern.
# Walks forward from `pattern` to the first entry with nr_attr == 0 and reads
# the (ID, key) pair that init_pattern wrote into its attrs[0].
cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
    cdef const TokenPatternC* cursor = pattern
    while cursor.nr_attr:
        cursor = cursor + 1
    terminal_attr = cursor.attrs[0]
    assert terminal_attr.attr == ID
    return terminal_attr.value


# Decide what the matcher should do with `token` given the current `pattern`.
# The token matches when every attribute constraint of the pattern equals the
# corresponding token attribute. The returned action_t depends on that result
# and on the pattern's quantifier; "last" below means that the following
# entry is the terminal one (nr_attr == 0).
cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
    cdef bint matched = True
    cdef int k
    for k in range(pattern.nr_attr):
        if get_token_attr(token, pattern.attrs[k].attr) != pattern.attrs[k].value:
            matched = False
            break

    if not matched:
        if pattern.quantifier == ONE:
            return REJECT
        if pattern.quantifier == ZERO:
            # A non-matching token satisfies a negated pattern.
            if (pattern+1).nr_attr == 0:
                return ACCEPT
            return ADVANCE
        if pattern.quantifier == ZERO_ONE or pattern.quantifier == ZERO_PLUS:
            # Optional pattern: skip it, or finish if it was the last one.
            if (pattern+1).nr_attr == 0:
                return ACCEPT
            return ADVANCE_ZERO
        return PANIC

    if pattern.quantifier == ZERO:
        return REJECT
    if pattern.quantifier == ONE or pattern.quantifier == ZERO_ONE:
        if (pattern+1).nr_attr == 0:
            return ACCEPT
        return ADVANCE
    if pattern.quantifier == ZERO_PLUS:
        return REPEAT
    return PANIC


def _convert_strings(token_specs, string_store):
    """Convert user-facing token specs into (quantifier, attrs) tuples.

    token_specs (iterable): Dicts mapping attribute names or IDs to values.
        The special key 'OP' (case-insensitive) selects a quantifier.
    string_store: Mapping used to convert string values to IDs via
        `string_store[value]`.
    RETURNS (list): One `(quantifier, [(attr_id, value), ...])` tuple per
        quantifier of each spec; the '+' operator yields two tuples
        (ONE, then ZERO_PLUS) that share the same attribute list.
    RAISES (KeyError): If an 'OP' value is not a known operator.
    """
    # Support 'syntactic sugar' operator '+', as combination of ONE, ZERO_PLUS
    operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
            '?': (ZERO_ONE,), '1': (ONE,)}
    tokens = []
    for spec in token_specs:
        token = []
        ops = (ONE,)
        for attr, value in spec.items():
            if isinstance(attr, basestring) and attr.upper() == 'OP':
                if value not in operators:
                    raise KeyError(
                        "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
                ops = operators[value]
                # The operator is control metadata, not a token attribute:
                # do not convert its value or emit it as a constraint.
                continue
            if isinstance(attr, basestring):
                attr = attrs.IDS.get(attr.upper())
            if isinstance(value, basestring):
                value = string_store[value]
            if isinstance(value, bool):
                value = int(value)
            # Attribute names missing from attrs.IDS are silently dropped.
            if attr is not None:
                token.append((attr, value))
        for op in ops:
            tokens.append((op, token))
    return tokens


def merge_phrase(matcher, doc, i, matches):
    """Callback that merges the span of match `i` on the document.

    matcher: The matcher that produced the matches (unused).
    doc: The document the matches refer to.
    i (int): Index of the match to merge.
    matches (list): `(ent_id, label, start, end)` tuples.
    """
    match = matches[i]
    ent_id, label, start, end = match
    doc[start:end].merge(ent_id=ent_id, ent_type=label)


cdef class Matcher:
    """Match sequences of tokens, based on pattern rules."""
    cdef Pool mem
    cdef vector[TokenPatternC*] patterns
    cdef readonly Vocab vocab
    cdef public object _patterns
    cdef public object _entities
    cdef public object _callbacks
    cdef public object _acceptors

    def __init__(self, vocab, patterns=None):
        """Create the Matcher.

        vocab (Vocab): The vocabulary object, which must be shared with the
            documents the matcher will operate on.
        patterns (dict): Optional, already converted rules in the format of
            `Matcher._patterns` (normalized key -> list of converted specs).
            This is what `__reduce__` passes when the matcher is unpickled.
            Rules restored this way have no `on_match` callback.
        RETURNS (Matcher): The newly constructed object.
        """
        self._patterns = {}
        self._entities = {}
        self._acceptors = {}
        self._callbacks = {}
        self.vocab = vocab
        self.mem = Pool()
        if patterns:
            for key, specs_list in patterns.items():
                self._patterns.setdefault(key, [])
                # remove() pops the callback entry, so it has to exist.
                self._callbacks.setdefault(key, None)
                for specs in specs_list:
                    self.patterns.push_back(init_pattern(self.mem, key, specs))
                    self._patterns[key].append(specs)

    def __reduce__(self):
        """Support pickling: rebuild from the vocab and the converted rules.

        Callbacks are not part of the pickled state.
        """
        return (self.__class__, (self.vocab, self._patterns), None, None)

    def __len__(self):
        """Get the number of rules added to the matcher.

        RETURNS (int): The number of rules.
        """
        return len(self._patterns)

    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

        key (unicode): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
        return self._normalize_key(key) in self._patterns

    def add(self, key, on_match, *patterns):
        """Add a match-rule to the matcher.
        A match-rule consists of: an ID key, an on_match callback, and one or
        more patterns. If the key exists, the patterns are appended to the
        previous ones, and the previous on_match callback is replaced. The
        `on_match` callback will receive the arguments `(matcher, doc, i,
        matches)`. You can also set `on_match` to `None` to not perform any
        actions. A pattern consists of one or more `token_specs`, where a
        `token_spec` is a dictionary mapping attribute IDs to values. Token
        descriptors can also include quantifiers. There are currently important
        known problems with the quantifiers – see the docs.

        RAISES (ValueError): If one of the patterns contains no tokens.
        """
        # Validate everything first, so a bad pattern leaves the matcher
        # unchanged.
        for pattern in patterns:
            if len(pattern) == 0:
                msg = ("Cannot add pattern for zero tokens to matcher.\n"
                       "key: {key}\n")
                raise ValueError(msg.format(key=key))
        key = self._normalize_key(key)
        self._patterns.setdefault(key, [])
        self._callbacks[key] = on_match

        for pattern in patterns:
            specs = _convert_strings(pattern, self.vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, key, specs))
            self._patterns[key].append(specs)

    def remove(self, key):
        """Remove a rule from the matcher. A KeyError is raised if the key does
        not exist.

        key (unicode): The ID of the match rule.
        """
        key = self._normalize_key(key)
        self._patterns.pop(key)
        self._callbacks.pop(key)
        cdef int i = 0
        # Erasing shifts the following elements down, so the index is only
        # advanced when nothing was erased.
        while i < self.patterns.size():
            pattern_key = get_pattern_key(self.patterns.at(i))
            if pattern_key == key:
                self.patterns.erase(self.patterns.begin()+i)
            else:
                i += 1

    def has_key(self, key):
        """Check whether the matcher has a rule with a given key.

        key (string or int): The key to check.
        RETURNS (bool): Whether the matcher has the rule.
        """
        key = self._normalize_key(key)
        return key in self._patterns

    def get(self, key, default=None):
        """Retrieve the pattern stored for an entity.

        key (unicode or int): The key to retrieve.
        default: Value returned if the key is unknown.
        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
        """
        key = self._normalize_key(key)
        if key not in self._patterns:
            return default
        return (self._callbacks[key], self._patterns[key])

    def pipe(self, docs, batch_size=1000, n_threads=2):
        """Match a stream of documents, yielding them in turn.

        docs (iterable): A stream of documents.
        batch_size (int): The number of documents to accumulate into a working set.
        n_threads (int): The number of threads with which to work on the buffer
            in parallel, if the `Matcher` implementation supports multi-threading.
        YIELDS (Doc): Documents, in order.
        """
        # batch_size and n_threads are accepted but not used here.
        for doc in docs:
            self(doc)
            yield doc

    def __call__(self, Doc doc):
        """Find all token sequences matching the supplied patterns on the `Doc`.

        doc (Doc): The document to match over.
        RETURNS (list): A list of `(key, label_id, start, end)` tuples,
            describing the matches. A match tuple describes a span
            `doc[start:end]`. The `label_id` and `key` are both integers.
        """
        cdef vector[StateC] partials
        cdef int n_partials = 0
        cdef int q = 0
        cdef int i, token_i
        cdef const TokenC* token
        cdef StateC state
        matches = []
        for token_i in range(doc.length):
            token = &doc.c[token_i]
            q = 0
            # Go over the open matches, extending or finalizing if able. Otherwise,
            # we over-write them (q doesn't advance)
            for state in partials:
                action = get_action(state.second, token)
                while action == ADVANCE_ZERO:
                    state.second += 1
                    action = get_action(state.second, token)
                # Checked after the skip loop, so a PANIC produced while
                # skipping optional patterns is not silently ignored.
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                if action == REPEAT:
                    # Keep the state in the queue (we want to greedily match
                    # more of the pattern). `state` is a copy, and q may lag
                    # behind its original slot after earlier rejections, so
                    # it has to be written back explicitly.
                    partials[q] = state
                    q += 1
                elif action == REJECT:
                    pass
                elif action == ADVANCE:
                    partials[q] = state
                    partials[q].second += 1
                    q += 1
                elif action == ACCEPT:
                    # TODO: What to do about patterns starting with ZERO? Need to
                    # adjust the start position.
                    start = state.first
                    end = token_i+1
                    # ACCEPT means the next entry is the terminal one, which
                    # carries the rule key in attrs[0].
                    ent_id = state.second[1].attrs[0].value
                    label = state.second[1].attrs[1].value
                    matches.append((ent_id, label, start, end))
            partials.resize(q)
            # Check whether we open any new patterns on this token
            for pattern in self.patterns:
                action = get_action(pattern, token)
                while action == ADVANCE_ZERO:
                    pattern += 1
                    action = get_action(pattern, token)
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                if action == REPEAT:
                    state.first = token_i
                    state.second = pattern
                    partials.push_back(state)
                elif action == ADVANCE:
                    # TODO: What to do about patterns starting with ZERO? Need to
                    # adjust the start position.
                    state.first = token_i
                    state.second = pattern + 1
                    partials.push_back(state)
                elif action == ACCEPT:
                    start = token_i
                    end = token_i+1
                    ent_id = pattern[1].attrs[0].value
                    label = pattern[1].attrs[1].value
                    matches.append((ent_id, label, start, end))
        # Look for open patterns that are actually satisfied
        for state in partials:
            while state.second.quantifier in (ZERO, ZERO_PLUS):
                state.second += 1
                if state.second.nr_attr == 0:
                    start = state.first
                    end = len(doc)
                    ent_id = state.second.attrs[0].value
                    label = state.second.attrs[1].value
                    matches.append((ent_id, label, start, end))
        for i, (ent_id, label, start, end) in enumerate(matches):
            on_match = self._callbacks.get(ent_id)
            if on_match is not None:
                on_match(self, doc, i, matches)
        # TODO: only return (match_id, start, end)
        return matches

    def _normalize_key(self, key):
        """Convert a string key to its ID in the vocab's string store.

        Non-string keys are returned unchanged.
        """
        if isinstance(key, basestring):
            return self.vocab.strings[key]
        else:
            return key


def get_bilou(length):
    """Return the flag IDs that tag the tokens of a phrase of `length` tokens.

    length (int): Number of tokens in the phrase (1 to 10).
    RETURNS (list): One flag per token: a single-token flag for length 1,
        otherwise a begin flag, `length - 2` inside flags and a last flag,
        all specific to that length.
    RAISES (ValueError): If `length` is not one of the supported lengths.
    """
    if length == 1:
        return [U_ENT]
    # (length, begin flag, inside flag, last flag); length 2 has no inside.
    flags_by_length = (
        (2, B2_ENT, None, L2_ENT),
        (3, B3_ENT, I3_ENT, L3_ENT),
        (4, B4_ENT, I4_ENT, L4_ENT),
        (5, B5_ENT, I5_ENT, L5_ENT),
        (6, B6_ENT, I6_ENT, L6_ENT),
        (7, B7_ENT, I7_ENT, L7_ENT),
        (8, B8_ENT, I8_ENT, L8_ENT),
        (9, B9_ENT, I9_ENT, L9_ENT),
        (10, B10_ENT, I10_ENT, L10_ENT),
    )
    for size, begin, inside, last in flags_by_length:
        if length == size:
            return [begin] + [inside] * (size - 2) + [last]
    raise ValueError("Max length currently 10 for phrase matching")


cdef class PhraseMatcher:
    """Match known multi-token phrases and merge them on the document.

    Each token position of every added phrase sets a length-specific flag on
    its lexeme (see get_bilou). An internal Matcher proposes every span whose
    tokens carry a complete flag sequence, and each candidate is then
    confirmed by hashing its token IDs and looking the hash up among the
    added phrases.
    """
    cdef Pool mem
    cdef Vocab vocab
    cdef Matcher matcher
    cdef PreshMap phrase_ids

    cdef int max_length
    # Scratch buffer of `max_length` slots used to compute phrase hashes.
    cdef attr_t* _phrase_key

    def __init__(self, Vocab vocab, phrases, max_length=10):
        """Create the PhraseMatcher.

        vocab (Vocab): The vocabulary shared with the documents.
        phrases (iterable): Docs to match; phrases with `max_length` or more
            tokens are skipped.
        max_length (int): Exclusive upper bound on the phrase length.
        """
        self.mem = Pool()
        self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
        self.max_length = max_length
        self.vocab = vocab
        self.matcher = Matcher(self.vocab)
        self.phrase_ids = PreshMap()
        for phrase in phrases:
            if len(phrase) < max_length:
                self.add(phrase)

        abstract_patterns = []
        for length in range(1, max_length):
            abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
        # Matcher.add takes (key, on_match, *patterns). Candidates are
        # filtered in __call__, so no callback is registered.
        self.matcher.add('Candidate', None, *abstract_patterns)

    def add(self, Doc tokens):
        """Register a phrase.

        tokens (Doc): The phrase; must have fewer than `max_length` tokens.
        """
        cdef int length = tokens.length
        # Guards the writes into the fixed-size _phrase_key buffer below.
        assert length < self.max_length
        tags = get_bilou(length)
        assert len(tags) == length, length

        cdef int i
        for i in range(self.max_length):
            self._phrase_key[i] = 0
        for i, tag in enumerate(tags):
            lexeme = self.vocab[tokens.c[i].lex.orth]
            lexeme.set_flag(tag, True)
            self._phrase_key[i] = lexeme.orth
        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
        self.phrase_ids[key] = True

    def __call__(self, Doc doc):
        """Find the added phrases in `doc` and merge each of them.

        doc (Doc): The document to match over.
        RETURNS (list): `(start_char, end_char, tag, text, 'MWE')` tuples,
            one per confirmed phrase, as passed to `doc.merge`.
        """
        matches = []
        for candidate in self.matcher(doc):
            # Only the token span is needed; index from the end so both
            # (key, start, end) and (key, label, start, end) tuples work.
            start = candidate[-2]
            end = candidate[-1]
            if not self._is_known_phrase(doc, start, end):
                continue
            cand = doc[start : end]
            start_char = cand[0].idx
            end_char = cand[-1].idx + len(cand[-1])
            matches.append((start_char, end_char, cand.root.tag_, cand.text, 'MWE'))
        for match in matches:
            doc.merge(*match)
        return matches

    def pipe(self, stream, batch_size=1000, n_threads=2):
        """Match a stream of documents, yielding them in turn.

        batch_size and n_threads are accepted but not used here.
        """
        for doc in stream:
            self(doc)
            yield doc

    def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
        """Confirm that `doc[start:end]` is one of the added phrases.

        RETURNS: `(ent_id, label, start, end)` if the span is a known phrase,
            otherwise False.
        """
        if self._is_known_phrase(doc, start, end):
            return (ent_id, label, start, end)
        else:
            return False

    def _is_known_phrase(self, Doc doc, int start, int end):
        """Hash the token IDs of `doc[start:end]` and look the hash up."""
        # Guards the writes into the fixed-size _phrase_key buffer below.
        assert (end - start) < self.max_length
        cdef int i, j
        for i in range(self.max_length):
            self._phrase_key[i] = 0
        for i, j in enumerate(range(start, end)):
            self._phrase_key[i] = doc.c[j].lex.orth
        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
        return bool(self.phrase_ids.get(key))
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								# cython: profile=True
-												Fix Issue #587: Segfault in Matcher, due to simple error in the state machine.

											
										
										
											2016-10-28 15:42:00 +00:00
+								# cython: infer_types=True
-												Clean up imports, unused code, whitespace, docstrings

											
										
										
											2017-04-15 10:05:47 +00:00
+								# coding: utf8
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								from __future__ import unicode_literals
-												Fix json imports and use ujson

											
										
										
											2017-04-15 10:13:34 +00:00
+								import ujson
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								from .typedefs cimport attr_t
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								from .typedefs cimport hash_t
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								from .attrs cimport attr_id_t
-												Remove unused import statements


											
										
										
											2017-03-21 20:08:54 +00:00
+								from .structs cimport TokenC
-												* Add draft dfa matcher, in Python. Passing tests.

											
										
										
											2015-08-04 13:55:28 +00:00
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								from cymem.cymem cimport Pool
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								from preshed.maps cimport PreshMap
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								from libcpp.vector cimport vector
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								from libcpp.pair cimport pair
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								from murmurhash.mrmr cimport hash64
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								from libc.stdint cimport int32_t
-												* Add draft dfa matcher, in Python. Passing tests.

											
										
										
											2015-08-04 13:55:28 +00:00
-												Remove unused import statements


											
										
										
											2017-03-21 20:08:54 +00:00
+								from .attrs cimport ID, ENT_TYPE
-												* Make Matcher use attrs from the attrs.pyx file, rather than having an incomplete function doing the mapping.

											
										
										
											2016-04-14 08:37:39 +00:00
+								from . import attrs
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								from .tokens.doc cimport get_token_attr
 								from .tokens.doc cimport Doc
 								from .vocab cimport Vocab
-												* Add draft dfa matcher, in Python. Passing tests.

											
										
										
											2015-08-04 13:55:28 +00:00
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								from .attrs import FLAG61 as U_ENT
 								from .attrs import FLAG60 as B2_ENT
 								from .attrs import FLAG59 as B3_ENT
 								from .attrs import FLAG58 as B4_ENT
 								from .attrs import FLAG57 as B5_ENT
 								from .attrs import FLAG56 as B6_ENT
 								from .attrs import FLAG55 as B7_ENT
 								from .attrs import FLAG54 as B8_ENT
 								from .attrs import FLAG53 as B9_ENT
 								from .attrs import FLAG52 as B10_ENT
 								from .attrs import FLAG51 as I3_ENT
 								from .attrs import FLAG50 as I4_ENT
 								from .attrs import FLAG49 as I5_ENT
 								from .attrs import FLAG48 as I6_ENT
 								from .attrs import FLAG47 as I7_ENT
 								from .attrs import FLAG46 as I8_ENT
 								from .attrs import FLAG45 as I9_ENT
 								from .attrs import FLAG44 as I10_ENT
 								from .attrs import FLAG43 as L2_ENT
 								from .attrs import FLAG42 as L3_ENT
 								from .attrs import FLAG41 as L4_ENT
 								from .attrs import FLAG40 as L5_ENT
 								from .attrs import FLAG39 as L6_ENT
 								from .attrs import FLAG38 as L7_ENT
 								from .attrs import FLAG37 as L8_ENT
 								from .attrs import FLAG36 as L9_ENT
 								from .attrs import FLAG35 as L10_ENT
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								cpdef enum quantifier_t:
 								    _META
 								    ONE
 								    ZERO
 								    ZERO_ONE
 								    ZERO_PLUS
 								cdef enum action_t:
 								    REJECT
 								    ADVANCE
 								    REPEAT
 								    ACCEPT
 								    ADVANCE_ZERO
 								    PANIC
 								cdef struct AttrValueC:
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								    attr_id_t attr
 								    attr_t value
-												* Add draft dfa matcher, in Python. Passing tests.

											
										
										
											2015-08-04 13:55:28 +00:00
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								cdef struct TokenPatternC:
 								    AttrValueC* attrs
 								    int32_t nr_attr
 								    quantifier_t quantifier
-												* Add draft dfa matcher, in Python. Passing tests.

											
										
										
											2015-08-04 13:55:28 +00:00
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								ctypedef TokenPatternC* TokenPatternC_ptr
 								ctypedef pair[int, TokenPatternC_ptr] StateC
-												Update Matcher API

											
										
										
											2017-05-20 11:54:53 +00:00
+								cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id,
-												Lots of updates to Matcher, to make entity handling sane.

											
										
										
											2016-10-17 13:23:31 +00:00
+								                                 object token_specs) except NULL:
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								    pattern = <TokenPatternC*>mem.alloc(len(token_specs) + 1, sizeof(TokenPatternC))
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								    cdef int i
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								    for i, (quantifier, spec) in enumerate(token_specs):
 								        pattern[i].quantifier = quantifier
 								        pattern[i].attrs = <AttrValueC*>mem.alloc(len(spec), sizeof(AttrValueC))
 								        pattern[i].nr_attr = len(spec)
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								        for j, (attr, value) in enumerate(spec):
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								            pattern[i].attrs[j].attr = attr
 								            pattern[i].attrs[j].value = value
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								    i = len(token_specs)
-												Update Matcher API

											
										
										
											2017-05-20 11:54:53 +00:00
+								    pattern[i].attrs = <AttrValueC*>mem.alloc(2, sizeof(AttrValueC))
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								    pattern[i].attrs[0].attr = ID
 								    pattern[i].attrs[0].value = entity_id
 								    pattern[i].nr_attr = 0
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
+								    return pattern
-												Update Matcher API

											
										
										
											2017-05-20 11:54:53 +00:00
+								cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
 								    while pattern.nr_attr != 0:
 								        pattern += 1
 								    id_attr = pattern[0].attrs[0]
 								    assert id_attr.attr == ID
 								    return id_attr.value
-												Initial, limited support for quantified patterns in Matcher, and tracking of ent_id attribute in Token and Span. The quantifiers need a lot more testing, and there are some known problems. The main known problem is that the zero-plus and one-plus quantifiers won't work if a token can match both the quantified pattern expression AND the tail of the match.

											
										
										
											2016-09-21 12:54:55 +00:00
+								cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
 								    for attr in pattern.attrs[:pattern.nr_attr]:
 								        if get_token_attr(token, attr.attr) != attr.value:
 								            if pattern.quantifier == ONE:
 								                return REJECT
 								            elif pattern.quantifier == ZERO:
 								                return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
 								            elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
 								                return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE_ZERO
 								            else:
 								                return PANIC
 								    if pattern.quantifier == ZERO:
 								        return REJECT
 								    elif pattern.quantifier in (ONE, ZERO_ONE):
 								        return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
 								    elif pattern.quantifier == ZERO_PLUS:
 								        return REPEAT
 								    else:
 								        return PANIC
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
-												* Work on gazetteer matching

											
										
										
											2015-08-06 12:33:21 +00:00
def _convert_strings(token_specs, string_store):
    """Normalise user-facing token specs into (quantifier, constraints) pairs.

    token_specs (iterable): One dict per token, mapping attribute names or
        IDs to values. A key spelled 'OP' (any case) selects the quantifier.
    string_store: Mapping used to replace string values via
        `string_store[value]`.
    RETURNS (list): `(quantifier, [(attr, value), ...])` tuples. A spec whose
        operator expands to several quantifiers contributes one tuple per
        quantifier, all sharing the same constraint list.
    RAISES (KeyError): If an 'OP' value is not a known operator.
    """
    # '+' is syntactic sugar: one mandatory match followed by zero-or-more.
    operators = {'!': (ZERO,), '*': (ZERO_PLUS,), '+': (ONE, ZERO_PLUS),
                 '?': (ZERO_ONE,), '1': (ONE,)}
    converted = []
    for spec in token_specs:
        constraints = []
        quantifiers = (ONE,)
        for attr, value in spec.items():
            named = isinstance(attr, basestring)
            if named and attr.upper() == 'OP':
                if value not in operators:
                    raise KeyError(
                        "Unknown operator '%s'. Options: %s" % (value, ', '.join(operators.keys())))
                quantifiers = operators[value]
            if named:
                # Names are resolved case-insensitively; names missing from
                # attrs.IDS come back as None and are skipped further down.
                attr = attrs.IDS.get(attr.upper())
            if isinstance(value, basestring):
                value = string_store[value]
            if isinstance(value, bool):
                value = int(value)
            if attr is None:
                continue
            constraints.append((attr, value))
        converted.extend([(quantifier, constraints) for quantifier in quantifiers])
    return converted
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
-												Add merge_phrase callback in matcher.pyx

											
										
										
											2017-03-31 11:58:59 +00:00
def merge_phrase(matcher, doc, i, matches):
    """Callback to merge a phrase on match.

    matcher: The matcher that produced the matches (not used here).
    doc: The document the matches refer to.
    i (int): Index into `matches` of the match to merge.
    matches (list): `(ent_id, label, start, end)` tuples.
    """
    match_id, ent_label, first, last = matches[i]
    doc[first : last].merge(ent_type=ent_label, ent_id=match_id)
-												* Reimplement matching in Cython, instead of Python.

											
										
										
											2015-08-04 23:05:54 +00:00
cdef class Matcher:
    """Match sequences of tokens, based on pattern rules."""
    # Memory pool handed to init_pattern() when compiling patterns.
    cdef Pool mem
    # Compiled patterns, one TokenPatternC array per added pattern.
    cdef vector[TokenPatternC*] patterns
    cdef readonly Vocab vocab
    # key -> list of converted token specs (output of _convert_strings).
    cdef public object _patterns
    # Initialised to empty dicts; not otherwise used by this class.
    cdef public object _entities
    # key -> on_match callback (or None).
    cdef public object _callbacks
    cdef public object _acceptors

    def __init__(self, vocab):
        """Create the Matcher.

        vocab (Vocab): The vocabulary object, which must be shared with the
            documents the matcher will operate on.
        RETURNS (Matcher): The newly constructed object.
        """
        self._patterns = {}
        self._entities = {}
        self._acceptors = {}
        self._callbacks = {}
        self.vocab = vocab
        self.mem = Pool()

    def __reduce__(self):
        """Support pickling.

        `__init__` only accepts the vocab, so the rules are shipped as pickle
        state and re-installed by `__setstate__` instead of being passed as a
        second constructor argument (which would raise a TypeError).
        """
        state = (self._patterns, self._callbacks)
        return (self.__class__, (self.vocab,), state, None)

    def __setstate__(self, state):
        """Re-install the rules captured by `__reduce__`.

        state (tuple): `(patterns, callbacks)` as stored in `_patterns` and
            `_callbacks`. The specs are already converted, so they are
            compiled directly rather than going through `add()`.
        """
        patterns, callbacks = state
        for key, rule_specs in patterns.items():
            self._patterns[key] = []
            for specs in rule_specs:
                self.patterns.push_back(init_pattern(self.mem, key, specs))
                self._patterns[key].append(specs)
        self._callbacks.update(callbacks)

    def __len__(self):
        """Get the number of rules added to the matcher.

        RETURNS (int): The number of rules.
        """
        return len(self._patterns)

    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

        key (unicode): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
        # Keys are stored normalised, so normalise before the lookup.
        return self._normalize_key(key) in self._patterns

    def add(self, key, on_match, *patterns):
        """Add a match-rule to the matcher.

        A match-rule consists of: an ID key, an on_match callback, and one or
        more patterns. If the key exists, the patterns are appended to the
        previous ones, and the previous on_match callback is replaced. The
        `on_match` callback will receive the arguments `(matcher, doc, i,
        matches)`. You can also set `on_match` to `None` to not perform any
        actions. A pattern consists of one or more `token_specs`, where a
        `token_spec` is a dictionary mapping attribute IDs to values. Token
        descriptors can also include quantifiers. There are currently important
        known problems with the quantifiers – see the docs.

        key (unicode or int): The match ID.
        on_match (callable or None): Callback invoked for each match.
        *patterns (list): One list of token specs per pattern.
        RAISES (ValueError): If any pattern is empty. Nothing is added in
            that case, because validation runs before any state is changed.
        """
        for pattern in patterns:
            if len(pattern) == 0:
                msg = ("Cannot add pattern for zero tokens to matcher.\n"
                       "key: {key}\n")
                raise ValueError(msg.format(key=key))
        key = self._normalize_key(key)
        self._patterns.setdefault(key, [])
        self._callbacks[key] = on_match
        for pattern in patterns:
            specs = _convert_strings(pattern, self.vocab.strings)
            self.patterns.push_back(init_pattern(self.mem, key, specs))
            self._patterns[key].append(specs)

    def remove(self, key):
        """Remove a rule from the matcher. A KeyError is raised if the key does
        not exist.

        key (unicode): The ID of the match rule.
        """
        key = self._normalize_key(key)
        self._patterns.pop(key)
        self._callbacks.pop(key)
        cdef int i = 0
        # Erasing shifts later entries down, so only advance when nothing was
        # removed at the current index.
        while i < self.patterns.size():
            pattern_key = get_pattern_key(self.patterns.at(i))
            if pattern_key == key:
                self.patterns.erase(self.patterns.begin()+i)
            else:
                i += 1

    def has_key(self, key):
        """Check whether the matcher has a rule with a given key.

        key (string or int): The key to check.
        RETURNS (bool): Whether the matcher has the rule.
        """
        key = self._normalize_key(key)
        return key in self._patterns

    def get(self, key, default=None):
        """Retrieve the pattern stored for an entity.

        key (unicode or int): The key to retrieve.
        default: Value returned when the key is unknown.
        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
        """
        key = self._normalize_key(key)
        if key not in self._patterns:
            return default
        return (self._callbacks[key], self._patterns[key])

    def pipe(self, docs, batch_size=1000, n_threads=2):
        """Match a stream of documents, yielding them in turn.

        docs (iterable): A stream of documents.
        batch_size (int): The number of documents to accumulate into a working set.
        n_threads (int): The number of threads with which to work on the buffer
            in parallel, if the `Matcher` implementation supports multi-threading.
        YIELDS (Doc): Documents, in order.
        """
        # This implementation handles one document at a time; batch_size and
        # n_threads are accepted but not used.
        for doc in docs:
            self(doc)
            yield doc

    def __call__(self, Doc doc):
        """Find all token sequences matching the supplied patterns on the `Doc`.

        doc (Doc): The document to match over.
        RETURNS (list): A list of `(key, label_id, start, end)` tuples,
            describing the matches. A match tuple describes a span
            `doc[start:end]`. The `label_id` and `key` are both integers.
        """
        cdef vector[StateC] partials
        cdef int q = 0
        cdef int i, token_i
        cdef const TokenC* token
        cdef StateC state
        matches = []
        for token_i in range(doc.length):
            token = &doc.c[token_i]
            q = 0
            # Go over the open matches, extending or finalizing if able. Otherwise,
            # we over-write them (q doesn't advance)
            for state in partials:
                action = get_action(state.second, token)
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                while action == ADVANCE_ZERO:
                    state.second += 1
                    action = get_action(state.second, token)
                if action == REPEAT:
                    # Leave the state in the queue, and advance to next slot
                    # (i.e. we don't overwrite -- we want to greedily match more
                    # pattern.
                    # NOTE(review): if an earlier partial was rejected on this
                    # token, slot q still holds that earlier state rather than
                    # this one, so the wrong state survives the resize below.
                    q += 1
                elif action == REJECT:
                    pass
                elif action == ADVANCE:
                    partials[q] = state
                    partials[q].second += 1
                    q += 1
                elif action == ACCEPT:
                    # TODO: What to do about patterns starting with ZERO? Need to
                    # adjust the start position.
                    start = state.first
                    end = token_i+1
                    # The entry after the current one carries the match ID in
                    # attrs[0] and the label in attrs[1].
                    ent_id = state.second[1].attrs[0].value
                    label = state.second[1].attrs[1].value
                    # Emit 4-tuples: the callback loop below, merge_phrase and
                    # PhraseMatcher all unpack (ent_id, label, start, end).
                    matches.append((ent_id, label, start, end))
            partials.resize(q)
            # Check whether we open any new patterns on this token
            for pattern in self.patterns:
                action = get_action(pattern, token)
                if action == PANIC:
                    raise Exception("Error selecting action in matcher")
                while action == ADVANCE_ZERO:
                    pattern += 1
                    action = get_action(pattern, token)
                if action == REPEAT:
                    state.first = token_i
                    state.second = pattern
                    partials.push_back(state)
                elif action == ADVANCE:
                    # TODO: What to do about patterns starting with ZERO? Need to
                    # adjust the start position.
                    state.first = token_i
                    state.second = pattern + 1
                    partials.push_back(state)
                elif action == ACCEPT:
                    start = token_i
                    end = token_i+1
                    ent_id = pattern[1].attrs[0].value
                    label = pattern[1].attrs[1].value
                    matches.append((ent_id, label, start, end))
        # Look for open patterns that are actually satisfied
        for state in partials:
            while state.second.quantifier in (ZERO, ZERO_PLUS):
                state.second += 1
                if state.second.nr_attr == 0:
                    start = state.first
                    end = len(doc)
                    ent_id = state.second.attrs[0].value
                    # Read the label from attrs[1], as the two accept sites
                    # above do (this site previously re-read attrs[0]).
                    label = state.second.attrs[1].value
                    matches.append((ent_id, label, start, end))
        for i, (ent_id, label, start, end) in enumerate(matches):
            on_match = self._callbacks.get(ent_id)
            if on_match is not None:
                on_match(self, doc, i, matches)
        # TODO: only return (match_id, start, end)
        return matches

    def _normalize_key(self, key):
        """Map a string key to its ID in the vocab's string store.

        key (unicode or int): The key as supplied by the caller.
        RETURNS: `self.vocab.strings[key]` for strings; other keys unchanged.
        """
        if isinstance(key, basestring):
            return self.vocab.strings[key]
        else:
            return key
-												* Add a .pipe method, that takes a stream of input, operates on it, and streams the output. Internally, the stream may be buffered, to allow multi-threading.

											
										
										
											2016-02-03 01:04:55 +00:00
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
-												Lots of updates to Matcher, to make entity handling sane.

											
										
										
											2016-10-17 13:23:31 +00:00
def get_bilou(length):
    """Return the per-position flag IDs for a phrase of `length` tokens.

    length (int): Number of tokens in the phrase, from 1 to 10.
    RETURNS (list): One flag per token: a single U flag for length 1,
        otherwise a B flag, `length - 2` I flags and an L flag, each specific
        to that length.
    RAISES (ValueError): If `length` is not one of the supported lengths.
    """
    layouts = (
        (U_ENT,),
        (B2_ENT, L2_ENT),
        (B3_ENT, I3_ENT, L3_ENT),
        (B4_ENT,) + (I4_ENT,) * 2 + (L4_ENT,),
        (B5_ENT,) + (I5_ENT,) * 3 + (L5_ENT,),
        (B6_ENT,) + (I6_ENT,) * 4 + (L6_ENT,),
        (B7_ENT,) + (I7_ENT,) * 5 + (L7_ENT,),
        (B8_ENT,) + (I8_ENT,) * 6 + (L8_ENT,),
        (B9_ENT,) + (I9_ENT,) * 7 + (L9_ENT,),
        (B10_ENT,) + (I10_ENT,) * 8 + (L10_ENT,),
    )
    # Each layout has as many entries as the phrase length it describes.
    for tags in layouts:
        if length == len(tags):
            return list(tags)
    raise ValueError("Max length currently 10 for phrase matching")
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
cdef class PhraseMatcher:
    """Match known multi-token phrases and merge each hit into one token.

    Phrases are registered by flagging their lexemes with length-specific
    positional flags (see `get_bilou`) and by recording a hash of the phrase's
    orth IDs. The wrapped `Matcher` proposes candidate spans from the flags;
    `accept_match` then keeps only spans whose hash was registered.
    """
    cdef Pool mem
    cdef Vocab vocab
    cdef Matcher matcher
    # Hash of a phrase's orth IDs -> True, filled by add().
    cdef PreshMap phrase_ids
    cdef int max_length
    # Scratch buffer of max_length orth IDs, hashed to build phrase keys.
    cdef attr_t* _phrase_key

    def __init__(self, Vocab vocab, phrases, max_length=10):
        """Create the PhraseMatcher.

        vocab (Vocab): The vocabulary shared with the documents to match.
        phrases (iterable): `Doc` objects to register; phrases of
            `max_length` tokens or more are skipped.
        max_length (int): Exclusive upper bound on phrase length.
        """
        self.mem = Pool()
        self._phrase_key = <attr_t*>self.mem.alloc(max_length, sizeof(attr_t))
        self.max_length = max_length
        self.vocab = vocab
        # Matcher.__init__ takes only the vocab.
        self.matcher = Matcher(self.vocab)
        self.phrase_ids = PreshMap()
        for phrase in phrases:
            if len(phrase) < max_length:
                self.add(phrase)
        abstract_patterns = []
        for length in range(1, max_length):
            abstract_patterns.append([{tag: True} for tag in get_bilou(length)])
        # Matcher.add is (key, on_match, *patterns): no callback is needed,
        # and each abstract pattern is passed as its own argument. Candidate
        # filtering happens in __call__ via accept_match.
        self.matcher.add('Candidate', None, *abstract_patterns)

    def add(self, Doc tokens):
        """Register a phrase.

        tokens (Doc): The phrase; must be shorter than `max_length` tokens.
        """
        cdef int length = tokens.length
        assert length < self.max_length
        tags = get_bilou(length)
        assert len(tags) == length, length
        cdef int i
        # Zero the whole buffer so shorter phrases hash with trailing zeros.
        for i in range(self.max_length):
            self._phrase_key[i] = 0
        for i, tag in enumerate(tags):
            lexeme = self.vocab[tokens.c[i].lex.orth]
            lexeme.set_flag(tag, True)
            self._phrase_key[i] = lexeme.orth
        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
        self.phrase_ids[key] = True

    def accept_match(self, Doc doc, int start, int end):
        """Check whether `doc[start:end]` is one of the registered phrases.

        The key is rebuilt exactly as in `add()`: a zero-padded buffer of orth
        IDs hashed with `hash64`.

        doc (Doc): The document being matched.
        start (int): Index of the first token of the candidate span.
        end (int): Index one past the last token of the candidate span.
        RETURNS (bool): Whether the span's key was registered.
        """
        cdef int i
        cdef int length = end - start
        # add() only registers phrases shorter than max_length, and the
        # scratch buffer holds max_length entries, so longer spans can be
        # rejected without hashing.
        if length <= 0 or length >= self.max_length:
            return False
        for i in range(self.max_length):
            self._phrase_key[i] = 0
        for i in range(length):
            self._phrase_key[i] = doc.c[start + i].lex.orth
        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
        return self.phrase_ids.get(key) != NULL

    def __call__(self, Doc doc):
        """Find registered phrases in `doc` and merge each into one token.

        doc (Doc): The document to match over; modified in place.
        RETURNS (list): `(start_char, end_char, tag, text, 'MWE')` tuples, one
            per merged phrase.
        """
        matches = []
        for match in self.matcher(doc):
            # Only the span bounds are needed; they are the last two fields
            # of the match tuple.
            start, end = match[-2], match[-1]
            if not self.accept_match(doc, start, end):
                continue
            cand = doc[start : end]
            start = cand[0].idx
            end = cand[-1].idx + len(cand[-1])
            matches.append((start, end, cand.root.tag_, cand.text, 'MWE'))
        # Merge only after all spans are collected, so token indices stay
        # valid while the candidates are being read.
        for match in matches:
            doc.merge(*match)
        return matches

    def pipe(self, stream, batch_size=1000, n_threads=2):
        """Apply the matcher to a stream of documents, yielding them in turn.

        stream (iterable): A stream of documents.
        batch_size (int): Accepted but not used by this implementation.
        n_threads (int): Accepted but not used by this implementation.
        YIELDS (Doc): Documents, in order.
        """
        for doc in stream:
            self(doc)
            yield doc
-												Fix PhraseMatcher to work with updated Matcher

#613

											
										
										
											2016-11-08 16:14:26 +00:00
+								    def accept_match(self, Doc doc, int ent_id, int label, int start, int end):
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								        assert (end - start) < self.max_length
 								        cdef int i, j
 								        for i in range(self.max_length):
 								            self._phrase_key[i] = 0
 								        for i, j in enumerate(range(start, end)):
-												* Rename Doc.data to Doc.c

											
										
										
											2015-11-03 13:15:14 +00:00
+								            self._phrase_key[i] = doc.c[j].lex.orth
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								        cdef hash_t key = hash64(self._phrase_key, self.max_length * sizeof(attr_t), 0)
 								        if self.phrase_ids.get(key):
-												Fix PhraseMatcher to work with updated Matcher

#613

											
										
										
											2016-11-08 16:14:26 +00:00
+								            return (ent_id, label, start, end)
-												* Fix phrase matcher

											
										
										
											2015-10-08 15:00:45 +00:00
+								        else:
 								            return False