2016-10-31 18:04:15 +00:00
|
|
|
//- 💫 DOCS > API > MATCHER
|
|
|
|
|
|
|
|
include ../../_includes/_mixins
|
|
|
|
|
|
|
|
p Match sequences of tokens, based on pattern rules.
|
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
+infobox("⚠️ Deprecation note")
|
|
|
|
| As of spaCy 2.0, #[code Matcher.add_pattern] and #[code Matcher.add_entity]
|
|
|
|
| are deprecated and have been replaced with a simpler
|
|
|
|
| #[+api("matcher#add") #[code Matcher.add]] that lets you add a list of
|
|
|
|
| patterns and a callback for a given match ID.
|
|
|
|
| #[code Matcher.load] (not useful, as it didn't allow specifying callbacks),
|
|
|
|
| #[code Matcher.has_entity] and #[code Matcher.get_entity] (now redundant)
|
|
|
|
| have been removed.
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
+h(2, "init") Matcher.__init__
|
|
|
|
+tag method
|
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
p Create the rule-based #[code Matcher].
|
|
|
|
|
|
|
|
+aside-code("Example").
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
from spacy.attrs import LOWER
|
|
|
|
|
|
|
|
patterns = {"HelloWorld": [{LOWER: "hello"}, {LOWER: "world"}]}
|
|
|
|
matcher = Matcher(nlp.vocab)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code vocab]
|
|
|
|
+cell #[code Vocab]
|
|
|
|
+cell
|
|
|
|
| The vocabulary object, which must be shared with the documents
|
|
|
|
| the matcher will operate on.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code patterns]
|
|
|
|
+cell dict
|
2017-05-19 19:47:06 +00:00
|
|
|
+cell Patterns to add to the matcher, keyed by ID.
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell returns
|
2016-10-31 18:04:15 +00:00
|
|
|
+cell #[code Matcher]
|
|
|
|
+cell The newly constructed object.
|
|
|
|
|
|
|
|
+h(2, "call") Matcher.__call__
|
|
|
|
+tag method
|
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|
|
|
|
|
|
|
+aside-code("Example").
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
from spacy.attrs import LOWER
|
|
|
|
|
|
|
|
matcher = Matcher(nlp.vocab)
|
|
|
|
pattern = [{LOWER: "hello"}, {LOWER: "world"}]
|
|
|
|
matcher.add("HelloWorld", pattern, on_match=None)
|
|
|
|
doc = nlp(u'hello world!')
|
|
|
|
matches = matcher(doc)
|
|
|
|
|
|
|
|
+infobox("Important note")
|
|
|
|
| By default, the matcher #[strong does not perform any action] on matches,
|
|
|
|
| like tagging matched phrases with entity types. Instead, actions need to
|
|
|
|
| be specified when #[strong adding patterns or entities], by
|
|
|
|
| passing in a callback function as the #[code on_match] argument on
|
|
|
|
| #[+api("matcher#add") #[code add]]. This allows you to define custom
|
|
|
|
| actions per pattern within the same matcher. For example, you might only
|
|
|
|
| want to merge some entity types, and set custom flags for other matched
|
|
|
|
| patterns. For more details and examples, see the usage workflow on
|
|
|
|
| #[+a("/docs/usage/rule-based-matching") rule-based matching].
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code doc]
|
|
|
|
+cell #[code Doc]
|
|
|
|
+cell The document to match over.
|
|
|
|
|
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell returns
|
2016-10-31 18:04:15 +00:00
|
|
|
+cell list
|
|
|
|
+cell
|
|
|
|
| A list of #[code (entity_key, label_id, start, end)] tuples,
|
|
|
|
| describing the matches. A match tuple describes a
|
|
|
|
| span #[code doc[start:end]]. The #[code label_id] and
|
|
|
|
| #[code entity_key] are both integers.
|
|
|
|
|
|
|
|
+h(2, "pipe") Matcher.pipe
|
|
|
|
+tag method
|
|
|
|
|
|
|
|
p Match a stream of documents, yielding them in turn.
|
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
|
|
|
+cell #[code docs]
|
2017-05-19 19:47:06 +00:00
|
|
|
+cell iterable
|
2016-10-31 18:04:15 +00:00
|
|
|
+cell A stream of documents.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code batch_size]
|
|
|
|
+cell int
|
|
|
|
+cell The number of documents to accumulate into a working set.
|
|
|
|
|
|
|
|
+row
|
|
|
|
+cell #[code n_threads]
|
|
|
|
+cell int
|
|
|
|
+cell
|
|
|
|
| The number of threads with which to work on the buffer in
|
|
|
|
| parallel, if the #[code Matcher] implementation supports
|
|
|
|
| multi-threading.
|
|
|
|
|
|
|
|
+footrow
|
2017-05-18 22:02:34 +00:00
|
|
|
+cell yields
|
2016-10-31 18:04:15 +00:00
|
|
|
+cell #[code Doc]
|
|
|
|
+cell Documents, in order.
|
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
+h(2, "add") Matcher.add
|
2016-10-31 18:04:15 +00:00
|
|
|
+tag method
|
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
p
|
|
|
|
| Add one or more patterns to the matcher, along with a callback function
|
|
|
|
| to handle the matches. The callback function will receive the arguments
|
|
|
|
| #[code matcher], #[code doc], #[code id] and #[code matches].
|
2016-10-31 18:04:15 +00:00
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
+aside-code("Example").
|
|
|
|
from spacy.matcher import Matcher
|
|
|
|
from spacy.attrs import LOWER, ORTH
|
2016-10-31 18:04:15 +00:00
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
def on_match(matcher, doc, id, matches):
|
|
|
|
print('Matched!', matches)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
matcher = Matcher(nlp.vocab)
|
|
|
|
matcher.add('HelloWorld', [{LOWER: "hello"}, {LOWER: "world"}], on_match=on_match)
|
|
|
|
matcher.add('GoogleMaps', [{ORTH: "Google"}, {ORTH: "Maps"}], on_match=on_match)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
2017-05-19 19:47:06 +00:00
|
|
|
doc = nlp(u'HELLO WORLD on Google Maps.')
|
|
|
|
matches = matcher(doc)
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
|
|
+row
|
2017-05-19 19:47:06 +00:00
|
|
|
+cell #[code match_id]
|
|
|
|
+cell unicode
|
|
|
|
+cell An ID for the thing you're matching.
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
+row
|
2017-05-19 19:47:06 +00:00
|
|
|
+cell #[code *patterns]
|
|
|
|
+cell list
|
|
|
|
+cell
|
|
|
|
| Match pattern. A pattern consists of a list of dicts, where each
|
|
|
|
| dict describes a token.
|
2016-10-31 18:04:15 +00:00
|
|
|
|
|
|
|
+row
|
2017-05-19 19:47:06 +00:00
|
|
|
+cell #[code on_match]
|
|
|
|
+cell function
|
|
|
|
+cell
|
|
|
|
| Callback function to act on matches. Takes the arguments
|
|
|
|
| #[code matcher], #[code doc], #[code id] and #[code matches].
|