spaCy/website/docs/api/matcher.jade

//- 💫 DOCS > API > MATCHER

include ../../_includes/_mixins

p Match sequences of tokens, based on pattern rules.

+infobox("⚠️ Deprecation note")
    |  As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
    |  are deprecated and have been replaced with a simpler
    |  #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
    |  patterns and a callback for a given match ID. #[code Matcher.get_entity]
    |  is now called #[+api("matcher#get") #[code Matcher.get]].
    |  #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
    |  and #[code Matcher.has_entity] (now redundant) have been removed.

+h(2, "init") Matcher.__init__
    +tag method

p Create the rule-based #[code Matcher].

+aside-code("Example").
    from spacy.matcher import Matcher
    from spacy.attrs import LOWER

    patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
    matcher = Matcher(nlp.vocab)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell
            |  The vocabulary object, which must be shared with the documents
            |  the matcher will operate on.

    +row
        +cell #[code patterns]
        +cell dict
        +cell Patterns to add to the matcher, keyed by ID.

    +footrow
        +cell returns
        +cell #[code Matcher]
        +cell The newly constructed object.

+h(2, "call") Matcher.__call__
    +tag method

p Find all token sequences matching the supplied patterns on the #[code Doc].

+aside-code("Example").
    from spacy.matcher import Matcher
    from spacy.attrs import LOWER

    matcher = Matcher(nlp.vocab)
    pattern = [{LOWER: "hello"}, {LOWER: "world"}]
    matcher.add("HelloWorld", None, pattern)
    doc = nlp(u'hello world!')
    matches = matcher(doc)

+infobox("Important note")
    |  By default, the matcher #[strong does not perform any action] on matches,
    |  like tagging matched phrases with entity types. Instead, actions need to
    |  be specified when #[strong adding patterns or entities], by
    |  passing in a callback function as the #[code on_match] argument on
    |  #[+api("matcher#add") #[code add]]. This allows you to define custom
    |  actions per pattern within the same matcher. For example, you might only
    |  want to merge some entity types, and set custom flags for other matched
    |  patterns. For more details and examples, see the usage workflow on
    |  #[+a("/docs/usage/rule-based-matching") rule-based matching].

+table(["Name", "Type", "Description"])
    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell The document to match over.

    +footrow
        +cell returns
        +cell list
        +cell
            |  A list of #[code (match_id, start, end)] tuples, describing the
            |  matches. A match tuple describes a span #[code doc[start:end]].
            |  The #[code match_id] is the ID of the added match pattern.

+h(2, "pipe") Matcher.pipe
    +tag method

p Match a stream of documents, yielding them in turn.

+aside-code("Example").
    from spacy.matcher import Matcher
    matcher = Matcher(nlp.vocab)
    for doc in matcher.pipe(texts, batch_size=50, n_threads=4):
        pass

+table(["Name", "Type", "Description"])
    +row
        +cell #[code docs]
        +cell iterable
        +cell A stream of documents.

    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of documents to accumulate into a working set.

    +row
        +cell #[code n_threads]
        +cell int
        +cell
            |  The number of threads with which to work on the buffer in
            |  parallel, if the #[code Matcher] implementation supports
            |  multi-threading.

    +footrow
        +cell yields
        +cell #[code Doc]
        +cell Documents, in order.

+h(2, "len") Matcher.__len__
    +tag method

p
    |  Get the number of rules added to the matcher. Note that this only returns
    |  the number of rules (identical with the number of IDs), not the number
    |  of individual patterns.

+aside-code("Example").
    matcher = Matcher(nlp.vocab)
    assert len(matcher) == 0
    matcher.add('Rule', None, [{ORTH: 'test'}])
    assert len(matcher) == 1

+table(["Name", "Type", "Description"])
    +footrow
        +cell returns
        +cell int
        +cell The number of rules.

+h(2, "contains") Matcher.__contains__
    +tag method

p Check whether the matcher contains rules for a match ID.

+aside-code("Example").
    matcher = Matcher(nlp.vocab)
    assert 'Rule' not in matcher
    matcher.add('Rule', None, [{ORTH: 'test'}])
    assert 'Rule' in matcher

+table(["Name", "Type", "Description"])
    +row
        +cell #[code key]
        +cell unicode
        +cell The match ID.
    +footrow
        +cell returns
        +cell bool
        +cell Whether the matcher contains rules for this match ID.

+h(2, "add") Matcher.add
    +tag method

p
    |  Add a rule to the matcher, consisting of an ID key, one or more patterns, and
    |  a callback function to act on the matches. The callback function will
    |  receive the arguments #[code matcher], #[code doc], #[code i] and
    |  #[code matches]. If a pattern already exists for the given ID, the
    |  patterns will be extended. An #[code on_match] callback will be
    |  overwritten.

+aside-code("Example").
    def on_match(matcher, doc, i, matches):
        print('Matched!', matches)

    matcher = Matcher(nlp.vocab)
    matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])
    matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])
    doc = nlp(u'HELLO WORLD on Google Maps.')
    matches = matcher(doc)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code match_id]
        +cell unicode
        +cell An ID for the thing you're matching.

    +row
        +cell #[code on_match]
        +cell function or #[code None]
        +cell
            |  Callback function to act on matches. Takes the arguments
            |  #[code matcher], #[code doc], #[code i] and #[code matches].

    +row
        +cell #[code *patterns]
        +cell list
        +cell
            |  Match pattern. A pattern consists of a list of dicts, where each
            |  dict describes a token.

+h(2, "remove") Matcher.remove
    +tag method

p
    |  Remove a rule from the matcher. A #[code KeyError] is raised if the match
    |  ID does not exist.

+aside-code("Example").
    matcher.add('Rule', None, [{ORTH: 'test'}])
    assert 'Rule' in matcher
    matcher.remove('Rule')
    assert 'Rule' not in matcher

+table(["Name", "Type", "Description"])
    +row
        +cell #[code key]
        +cell unicode
        +cell The ID of the match rule.

+h(2, "get") Matcher.get
    +tag method

p
    |  Retrieve the pattern stored for a key. Returns the rule as an
    |  #[code (on_match, patterns)] tuple containing the callback and available
    |  patterns.

+aside-code("Example").
    pattern = [{ORTH: 'test'}]
    matcher.add('Rule', None, pattern)
    (on_match, patterns) = matcher.get('Rule')
    assert patterns == [pattern]

+table(["Name", "Type", "Description"])
    +row
        +cell #[code key]
        +cell unicode
        +cell The ID of the match rule.

    +footrow
        +cell returns
        +cell tuple
        +cell The rule, as an #[code (on_match, patterns)] tuple.
Update to new website 2016-10-31 18:04:15 +00:00			`//- 💫 DOCS > API > MATCHER`

			`include ../../_includes/_mixins`

			`p Match sequences of tokens, based on pattern rules.`

Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`+infobox("⚠️ Deprecation note")`
			`\| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]`
			`\| are deprecated and have been replaced with a simpler`
			`\| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of`
Update docstrings and API docs for Matcher 2017-05-20 12:43:10 +00:00			`\| patterns and a callback for a given match ID. #[code Matcher.get_entity]`
			`\| is now called #[+api("matcher#get") #[code matcher.get]].`
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`\| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),`
Update docstrings and API docs for Matcher 2017-05-20 12:43:10 +00:00			`\| and #[code Matcher.has_entity] (now redundant) have been removed.`
Update to new website 2016-10-31 18:04:15 +00:00
			`+h(2, "init") Matcher.__init__`
			`+tag method`

Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`p Create the rule-based #[code Matcher].`

			`+aside-code("Example").`
			`from spacy.matcher import Matcher`
			`from spacy.attrs import LOWER`

			`patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}`
			`matcher = Matcher(nlp.vocab)`
Update to new website 2016-10-31 18:04:15 +00:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code vocab]`
			`+cell #[code Vocab]`
			`+cell`
			`\| The vocabulary object, which must be shared with the documents`
			`\| the matcher will operate on.`

			`+row`
			`+cell #[code patterns]`
			`+cell dict`
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`+cell Patterns to add to the matcher, keyed by ID.`
Update to new website 2016-10-31 18:04:15 +00:00
			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell returns`
Update to new website 2016-10-31 18:04:15 +00:00			`+cell #[code Matcher]`
			`+cell The newly constructed object.`

			`+h(2, "call") Matcher.__call__`
			`+tag method`

Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`p Find all token sequences matching the supplied patterns on the #[code Doc].`

			`+aside-code("Example").`
			`from spacy.matcher import Matcher`
			`from spacy.attrs import LOWER`

			`matcher = Matcher(nlp.vocab)`
			`pattern = [{LOWER: "hello"}, {LOWER: "world"}]`
Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00			`matcher.add("HelloWorld", on_match=None, pattern)`
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`doc = nlp(u'hello world!')`
			`matches = matcher(doc)`

			`+infobox("Important note")`
			`\| By default, the matcher #[strong does not perform any action] on matches,`
			`\| like tagging matched phrases with entity types. Instead, actions need to`
			`\| be specified when #[strong adding patterns or entities], by`
			`\| passing in a callback function as the #[code on_match] argument on`
			`\| #[+api("matcher#add") #[code add]]. This allows you to define custom`
			`\| actions per pattern within the same matcher. For example, you might only`
			`\| want to merge some entity types, and set custom flags for other matched`
			`\| patterns. For more details and examples, see the usage workflow on`
			`\| #[+a("/docs/usage/rule-based-matching") rule-based matching].`
Update to new website 2016-10-31 18:04:15 +00:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code doc]`
			`+cell #[code Doc]`
			`+cell The document to match over.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell returns`
Update to new website 2016-10-31 18:04:15 +00:00			`+cell list`
			`+cell`
Update Matcher API docs 2017-05-19 23:38:34 +00:00			`\| A list of #[code (match_id, start, end)] tuples, describing the`
			`\| matches. A match tuple describes a span #[code doc[start:end]].`
			`\| The #[code match_id] is the ID of the added match pattern.`
Update to new website 2016-10-31 18:04:15 +00:00
			`+h(2, "pipe") Matcher.pipe`
			`+tag method`

			`p Match a stream of documents, yielding them in turn.`

Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00			`+aside-code("Example").`
			`from spacy.matcher import Matcher`
			`matcher = Matcher(nlp.vocab)`
			`for doc in matcher.pipe(texts, batch_size=50, n_threads=4):`
			`pass`

Update to new website 2016-10-31 18:04:15 +00:00			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code docs]`
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`+cell iterable`
Update to new website 2016-10-31 18:04:15 +00:00			`+cell A stream of documents.`

			`+row`
			`+cell #[code batch_size]`
			`+cell int`
			`+cell The number of documents to accumulate into a working set.`

			`+row`
			`+cell #[code n_threads]`
			`+cell int`
			`+cell`
			`\| The number of threads with which to work on the buffer in`
			`\| parallel, if the #[code Matcher] implementation supports`
			`\| multi-threading.`

			`+footrow`
Use returns/yields instead of return/yield 2017-05-18 22:02:34 +00:00			`+cell yields`
Update to new website 2016-10-31 18:04:15 +00:00			`+cell #[code Doc]`
			`+cell Documents, in order.`

Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00			`+h(2, "len") Matcher.__len__`
			`+tag method`

Update docstrings and API docs for Matcher 2017-05-20 12:32:34 +00:00			`p`
			`\| Get the number of rules added to the matcher. Note that this only returns`
			`\| the number of rules (identical with the number of IDs), not the number`
			`\| of individual patterns.`
Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00
			`+aside-code("Example").`
			`matcher = Matcher(nlp.vocab)`
			`assert len(matcher) == 0`
Update docstrings and API docs for Matcher 2017-05-20 12:32:34 +00:00			`matcher.add('Rule', None, [{ORTH: 'test'}])`
Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00			`assert len(matcher) == 1`

			`+table(["Name", "Type", "Description"])`
			`+footrow`
			`+cell returns`
			`+cell int`
			`+cell The number of rules.`

			`+h(2, "contains") Matcher.__contains__`
			`+tag method`

			`p Check whether the matcher contains rules for a match ID.`

			`+aside-code("Example").`
			`matcher = Matcher(nlp.vocab)`
Update docstrings and API docs for Matcher 2017-05-20 12:32:34 +00:00			`assert 'Rule' in matcher == False`
			`matcher.add('Rule', None, [{ORTH: 'test'}])`
			`assert 'Rule' in matcher == True`
Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code key]`
			`+cell unicode`
			`+cell The match ID.`
			`+footrow`
			`+cell returns`
			`+cell int`
			`+cell Whether the matcher contains rules for this match ID.`

			`+h(2, "add") Matcher.add`
Update to new website 2016-10-31 18:04:15 +00:00			`+tag method`

Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`p`
Update Matcher API 2017-05-20 11:54:53 +00:00			`\| Add a rule to the matcher, consisting of an ID key, one or more patterns, and`
Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00			`\| a callback function to act on the matches. The callback function will`
			`\| receive the arguments #[code matcher], #[code doc], #[code i] and`
			`\| #[code matches]. If a pattern already exists for the given ID, the`
			`\| patterns will be extended. An #[code on_match] callback will be`
			`\| overwritten.`
Update to new website 2016-10-31 18:04:15 +00:00
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`+aside-code("Example").`
			`def on_match(matcher, doc, id, matches):`
			`print('Matched!', matches)`
Update to new website 2016-10-31 18:04:15 +00:00
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`matcher = Matcher(nlp.vocab)`
Update Matcher API and workflow to reflect new API on_match is now the second positional argument, to easily allow a variable number of patterns while keeping the method clean and readable. 2017-05-20 10:59:03 +00:00			`matcher.add('HelloWorld', on_match, [{LOWER: "hello"}, {LOWER: "world"}])`
			`matcher.add('GoogleMaps', on_match, [{ORTH: "Google"}, {ORTH: "Maps"}])`
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`doc = nlp(u'HELLO WORLD on Google Maps.')`
			`matches = matcher(doc)`
Update to new website 2016-10-31 18:04:15 +00:00
			`+table(["Name", "Type", "Description"])`
			`+row`
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`+cell #[code match_id]`
			`+cell unicode`
			`+cell An ID for the thing you're matching.`
Update to new website 2016-10-31 18:04:15 +00:00
			`+row`
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`+cell #[code on_match]`
Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00			`+cell function or #[code None]`
Update Matcher docstrings and API docs 2017-05-19 19:47:06 +00:00			`+cell`
			`\| Callback function to act on matches. Takes the arguments`
Update Matcher API docs 2017-05-19 23:38:34 +00:00			`\| #[code matcher], #[code doc], #[code i] and #[code matches].`
Update Matcher API and workflow to reflect new API on_match is now the second positional argument, to easily allow a variable number of patterns while keeping the method clean and readable. 2017-05-20 10:59:03 +00:00
			`+row`
			`+cell #[code *patterns]`
			`+cell list`
			`+cell`
			`\| Match pattern. A pattern consists of a list of dicts, where each`
			`\| dict describes a token.`
Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00
			`+h(2, "remove") Matcher.remove`
			`+tag method`

			`p`
			`\| Remove a rule from the matcher. A #[code KeyError] is raised if the match`
			`\| ID does not exist.`

			`+aside-code("Example").`
Update docstrings and API docs for Matcher 2017-05-20 12:32:34 +00:00			`matcher.add('Rule', None, [{ORTH: 'test'}])`
			`assert 'Rule' in matcher == True`
			`matcher.remove('Rule')`
			`assert 'Rule' in matcher == False`
Update docstrings and API docs for Matcher 2017-05-20 12:26:10 +00:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code key]`
			`+cell unicode`
			`+cell The ID of the match rule.`
Update docstrings and API docs for Matcher 2017-05-20 12:43:10 +00:00
			`+h(2, "get") Matcher.get`
			`+tag method`

			`p`
			`\| Retrieve the pattern stored for a key. Returns the rule as an`
			`\| #[code (on_match, patterns)] tuple containing the callback and available`
			`\| patterns.`

			`+aside-code("Example").`
			`pattern = [{ORTH: 'test'}]`
			`matcher.add('Rule', None, pattern)`
			`(on_match, patterns) = matcher.get('Rule')`
			`assert patterns = [pattern]`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code key]`
			`+cell unicode`
			`+cell The ID of the match rule.`

			`+footrow`
			`+cell returns`
			`+cell tuple`
			`+cell The rule, as an #[code (on_match, patterns)] tuple.`