mirror of https://github.com/explosion/spaCy.git
182 lines
5.3 KiB
Plaintext
182 lines
5.3 KiB
Plaintext
//- 💫 DOCS > API > PHRASEMATCHER
|
|
|
|
include ../_includes/_mixins
|
|
|
|
p
|
|
| The #[code PhraseMatcher] lets you efficiently match large terminology
|
|
| lists. While the #[+api("matcher") #[code Matcher]] lets you match
|
|
| sequences based on lists of token descriptions, the #[code PhraseMatcher]
|
|
| accepts match patterns in the form of #[code Doc] objects.
|
|
|
|
+h(2, "init") PhraseMatcher.__init__
|
|
+tag method
|
|
|
|
p Create the rule-based #[code PhraseMatcher].
|
|
|
|
+aside-code("Example").
|
|
from spacy.matcher import PhraseMatcher
|
|
matcher = PhraseMatcher(nlp.vocab, max_length=6)
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code vocab]
|
|
+cell #[code Vocab]
|
|
+cell
|
|
| The vocabulary object, which must be shared with the documents
|
|
| the matcher will operate on.
|
|
|
|
+row
|
|
+cell #[code max_length]
|
|
+cell int
|
|
+cell Maximum length of a phrase pattern to add.
|
|
|
|
+row("foot")
|
|
+cell returns
|
|
+cell #[code PhraseMatcher]
|
|
+cell The newly constructed object.
|
|
|
|
+h(2, "call") PhraseMatcher.__call__
|
|
+tag method
|
|
|
|
p Find all token sequences matching the supplied patterns on the #[code Doc].
|
|
|
|
+aside-code("Example").
|
|
from spacy.matcher import PhraseMatcher
|
|
|
|
matcher = PhraseMatcher(nlp.vocab)
|
|
matcher.add('OBAMA', None, nlp(u"Barack Obama"))
|
|
doc = nlp(u"Barack Obama lifts America one last time in emotional farewell")
|
|
matches = matcher(doc)
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code doc]
|
|
+cell #[code Doc]
|
|
+cell The document to match over.
|
|
|
|
+row("foot")
|
|
+cell returns
|
|
+cell list
|
|
+cell
|
|
| A list of #[code (match_id, start, end)] tuples, describing the
|
|
| matches. A match tuple describes a span #[code doc[start:end]].
|
|
| The #[code match_id] is the ID of the added match pattern.
|
|
|
|
+h(2, "pipe") PhraseMatcher.pipe
|
|
+tag method
|
|
|
|
p Match a stream of documents, yielding them in turn.
|
|
|
|
+aside-code("Example").
|
|
from spacy.matcher import PhraseMatcher
|
|
matcher = PhraseMatcher(nlp.vocab)
|
|
for doc in matcher.pipe(docs, batch_size=50, n_threads=4):
|
|
pass
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code docs]
|
|
+cell iterable
|
|
+cell A stream of documents.
|
|
|
|
+row
|
|
+cell #[code batch_size]
|
|
+cell int
|
|
+cell The number of documents to accumulate into a working set.
|
|
|
|
+row
|
|
+cell #[code n_threads]
|
|
+cell int
|
|
+cell
|
|
| The number of threads with which to work on the buffer in
|
|
| parallel, if the #[code PhraseMatcher] implementation supports
|
|
| multi-threading.
|
|
|
|
+row("foot")
|
|
+cell yields
|
|
+cell #[code Doc]
|
|
+cell Documents, in order.
|
|
|
|
+h(2, "len") PhraseMatcher.__len__
|
|
+tag method
|
|
|
|
p
|
|
| Get the number of rules added to the matcher. Note that this only returns
|
|
| the number of rules (identical with the number of IDs), not the number
|
|
| of individual patterns.
|
|
|
|
+aside-code("Example").
|
|
matcher = PhraseMatcher(nlp.vocab)
|
|
assert len(matcher) == 0
|
|
matcher.add('OBAMA', None, nlp(u"Barack Obama"))
|
|
assert len(matcher) == 1
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row("foot")
|
|
+cell returns
|
|
+cell int
|
|
+cell The number of rules.
|
|
|
|
+h(2, "contains") PhraseMatcher.__contains__
|
|
+tag method
|
|
|
|
p Check whether the matcher contains rules for a match ID.
|
|
|
|
+aside-code("Example").
|
|
matcher = PhraseMatcher(nlp.vocab)
|
|
assert 'OBAMA' not in matcher
|
|
matcher.add('OBAMA', None, nlp(u"Barack Obama"))
|
|
assert 'OBAMA' in matcher
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code key]
|
|
+cell unicode
|
|
+cell The match ID.
|
|
|
|
+row("foot")
|
|
+cell returns
|
|
+cell bool
|
|
+cell Whether the matcher contains rules for this match ID.
|
|
|
|
+h(2, "add") PhraseMatcher.add
|
|
+tag method
|
|
|
|
p
|
|
| Add a rule to the matcher, consisting of an ID key, one or more patterns, and
|
|
| a callback function to act on the matches. The callback function will
|
|
| receive the arguments #[code matcher], #[code doc], #[code i] and
|
|
| #[code matches]. If a pattern already exists for the given ID, the
|
|
| patterns will be extended. An #[code on_match] callback will be
|
|
| overwritten.
|
|
|
|
+aside-code("Example").
|
|
def on_match(matcher, doc, i, matches):
|
|
print('Matched!', matches)
|
|
|
|
matcher = PhraseMatcher(nlp.vocab)
|
|
matcher.add('OBAMA', on_match, nlp(u"Barack Obama"))
|
|
matcher.add('HEALTH', on_match, nlp(u"health care reform"),
|
|
nlp(u"healthcare reform"))
|
|
doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms")
|
|
matches = matcher(doc)
|
|
|
|
+table(["Name", "Type", "Description"])
|
|
+row
|
|
+cell #[code match_id]
|
|
+cell unicode
|
|
+cell An ID for the thing you're matching.
|
|
|
|
+row
|
|
+cell #[code on_match]
|
|
+cell callable or #[code None]
|
|
+cell
|
|
| Callback function to act on matches. Takes the arguments
|
|
| #[code matcher], #[code doc], #[code i] and #[code matches].
|
|
|
|
+row
|
|
+cell #[code *docs]
|
|
+cell list
|
|
+cell
|
|
| #[code Doc] objects of the phrases to match.
|