spaCy/website/api/phrasematcher.jade

//- 💫 DOCS > API > PHRASEMATCHER

include ../_includes/_mixins

p
    |  The #[code PhraseMatcher] lets you efficiently match large terminology
    |  lists. While the #[+api("matcher") #[code Matcher]] lets you match
    |  sequences based on lists of token descriptions, the #[code PhraseMatcher]
    |  accepts match patterns in the form of #[code Doc] objects.

+h(2, "init") PhraseMatcher.__init__
    +tag method

p Create the rule-based #[code PhraseMatcher].

+aside-code("Example").
    from spacy.matcher import PhraseMatcher
    matcher = PhraseMatcher(nlp.vocab, max_length=6)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code vocab]
        +cell #[code Vocab]
        +cell
            |  The vocabulary object, which must be shared with the documents
            |  the matcher will operate on.

    +row
        +cell #[code max_length]
        +cell int
        +cell Maximum length of a phrase pattern to add.

    +row("foot")
        +cell returns
        +cell #[code PhraseMatcher]
        +cell The newly constructed object.

+h(2, "call") PhraseMatcher.__call__
    +tag method

p Find all token sequences matching the supplied patterns on the #[code Doc].

+aside-code("Example").
    from spacy.matcher import PhraseMatcher

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('OBAMA', None, nlp(u"Barack Obama"))
    doc = nlp(u"Barack Obama lifts America one last time in emotional farewell")
    matches = matcher(doc)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code doc]
        +cell #[code Doc]
        +cell The document to match over.

    +row("foot")
        +cell returns
        +cell list
        +cell
            |  A list of #[code (match_id, start, end)] tuples, describing the
            |  matches. A match tuple describes a span #[code doc[start:end]].
            |  The #[code match_id] is the ID of the added match pattern.

+h(2, "pipe") PhraseMatcher.pipe
    +tag method

p Match a stream of documents, yielding them in turn.

+aside-code("Example").
    from spacy.matcher import PhraseMatcher
    matcher = PhraseMatcher(nlp.vocab)
    for doc in matcher.pipe(docs, batch_size=50, n_threads=4):
        pass

+table(["Name", "Type", "Description"])
    +row
        +cell #[code docs]
        +cell iterable
        +cell A stream of documents.

    +row
        +cell #[code batch_size]
        +cell int
        +cell The number of documents to accumulate into a working set.

    +row
        +cell #[code n_threads]
        +cell int
        +cell
            |  The number of threads with which to work on the buffer in
            |  parallel, if the #[code PhraseMatcher] implementation supports
            |  multi-threading.

    +row("foot")
        +cell yields
        +cell #[code Doc]
        +cell Documents, in order.

+h(2, "len") PhraseMatcher.__len__
    +tag method

p
    |  Get the number of rules added to the matcher. Note that this only returns
    |  the number of rules (identical with the number of IDs), not the number
    |  of individual patterns.

+aside-code("Example").
    matcher = PhraseMatcher(nlp.vocab)
    assert len(matcher) == 0
    matcher.add('OBAMA', None, nlp(u"Barack Obama"))
    assert len(matcher) == 1

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell int
        +cell The number of rules.

+h(2, "contains") PhraseMatcher.__contains__
    +tag method

p Check whether the matcher contains rules for a match ID.

+aside-code("Example").
    matcher = PhraseMatcher(nlp.vocab)
    assert 'OBAMA' not in matcher
    matcher.add('OBAMA', None, nlp(u"Barack Obama"))
    assert 'OBAMA' in matcher

+table(["Name", "Type", "Description"])
    +row
        +cell #[code key]
        +cell unicode
        +cell The match ID.

    +row("foot")
        +cell returns
        +cell bool
        +cell Whether the matcher contains rules for this match ID.

+h(2, "add") PhraseMatcher.add
    +tag method

p
    |  Add a rule to the matcher, consisting of an ID key, one or more patterns, and
    |  a callback function to act on the matches. The callback function will
    |  receive the arguments #[code matcher], #[code doc], #[code i] and
    |  #[code matches]. If a pattern already exists for the given ID, the
    |  patterns will be extended. An #[code on_match] callback will be
    |  overwritten.

+aside-code("Example").
    def on_match(matcher, doc, i, matches):
        print('Matched!', matches)

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('OBAMA', on_match, nlp(u"Barack Obama"))
    matcher.add('HEALTH', on_match, nlp(u"health care reform"),
                                    nlp(u"healthcare reform"))
    doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms")
    matches = matcher(doc)

+table(["Name", "Type", "Description"])
    +row
        +cell #[code match_id]
        +cell unicode
        +cell An ID for the thing you're matching.

    +row
        +cell #[code on_match]
        +cell callable or #[code None]
        +cell
            |  Callback function to act on matches. Takes the arguments
            |  #[code matcher], #[code doc], #[code i] and #[code matches].

    +row
        +cell #[code *docs]
        +cell list
        +cell
            |  #[code Doc] objects of the phrases to match.
Update API documentation 2017-10-03 12:27:22 +00:00			`//- 💫 DOCS > API > PHRASEMATCHER`

			`include ../_includes/_mixins`

			`p`
			`\| The #[code PhraseMatcher] lets you efficiently match large terminology`
			`\| lists. While the #[+api("matcher") #[code Matcher]] lets you match`
			`\| sequences based on lists of token descriptions, the #[code PhraseMatcher]`
			`\| accepts match patterns in the form of #[code Doc] objects.`

			`+h(2, "init") PhraseMatcher.__init__`
			`+tag method`

			`p Create the rule-based #[code PhraseMatcher].`

			`+aside-code("Example").`
			`from spacy.matcher import PhraseMatcher`
Fix typos in PhraseMatcher docs 2017-10-04 14:12:06 +00:00			`matcher = PhraseMatcher(nlp.vocab, max_length=6)`
Update API documentation 2017-10-03 12:27:22 +00:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code vocab]`
			`+cell #[code Vocab]`
			`+cell`
			`\| The vocabulary object, which must be shared with the documents`
			`\| the matcher will operate on.`

			`+row`
			`+cell #[code max_length]`
			`+cell int`
Fixes typo in PhraseMatcher API docs 2018-01-11 16:35:59 +00:00			`+cell Maximum length of a phrase pattern to add.`
Update API documentation 2017-10-03 12:27:22 +00:00
			`+row("foot")`
			`+cell returns`
			`+cell #[code PhraseMatcher]`
			`+cell The newly constructed object.`

			`+h(2, "call") PhraseMatcher.__call__`
			`+tag method`

			`p Find all token sequences matching the supplied patterns on the #[code Doc].`

			`+aside-code("Example").`
Fix typos in PhraseMatcher docs 2017-10-04 14:12:06 +00:00			`from spacy.matcher import PhraseMatcher`
Update API documentation 2017-10-03 12:27:22 +00:00
Fix typos in PhraseMatcher docs 2017-10-04 14:12:06 +00:00			`matcher = PhraseMatcher(nlp.vocab)`
Update API documentation 2017-10-03 12:27:22 +00:00			`matcher.add('OBAMA', None, nlp(u"Barack Obama"))`
			`doc = nlp(u"Barack Obama lifts America one last time in emotional farewell")`
			`matches = matcher(doc)`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code doc]`
			`+cell #[code Doc]`
			`+cell The document to match over.`

			`+row("foot")`
			`+cell returns`
			`+cell list`
			`+cell`
			`\| A list of #[code (match_id, start, end)] tuples, describing the`
			`\| matches. A match tuple describes a span #[code doc[start:end]].`
			`\| The #[code match_id] is the ID of the added match pattern.`

			`+h(2, "pipe") PhraseMatcher.pipe`
			`+tag method`

			`p Match a stream of documents, yielding them in turn.`

			`+aside-code("Example").`
			`from spacy.matcher import PhraseMatcher`
			`matcher = PhraseMatcher(nlp.vocab)`
			`for doc in matcher.pipe(texts, batch_size=50, n_threads=4):`
			`pass`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code docs]`
			`+cell iterable`
			`+cell A stream of documents.`

			`+row`
			`+cell #[code batch_size]`
			`+cell int`
			`+cell The number of documents to accumulate into a working set.`

			`+row`
			`+cell #[code n_threads]`
			`+cell int`
			`+cell`
			`\| The number of threads with which to work on the buffer in`
			`\| parallel, if the #[code PhraseMatcher] implementation supports`
			`\| multi-threading.`

			`+row("foot")`
			`+cell yields`
			`+cell #[code Doc]`
			`+cell Documents, in order.`

			`+h(2, "len") PhraseMatcher.__len__`
			`+tag method`

			`p`
			`\| Get the number of rules added to the matcher. Note that this only returns`
			`\| the number of rules (identical with the number of IDs), not the number`
			`\| of individual patterns.`

			`+aside-code("Example").`
			`matcher = PhraseMatcher(nlp.vocab)`
			`assert len(matcher) == 0`
			`matcher.add('OBAMA', None, nlp(u"Barack Obama"))`
			`assert len(matcher) == 1`

			`+table(["Name", "Type", "Description"])`
			`+row("foot")`
			`+cell returns`
			`+cell int`
			`+cell The number of rules.`

			`+h(2, "contains") PhraseMatcher.__contains__`
			`+tag method`

			`p Check whether the matcher contains rules for a match ID.`

			`+aside-code("Example").`
			`matcher = PhraseMatcher(nlp.vocab)`
Fix PhraseMatcher example 2017-10-06 16:22:10 +00:00			`assert 'OBAMA' not in matcher`
Update API documentation 2017-10-03 12:27:22 +00:00			`matcher.add('OBAMA', None, nlp(u"Barack Obama"))`
Fix PhraseMatcher example 2017-10-06 16:22:10 +00:00			`assert 'OBAMA' in matcher`
Update API documentation 2017-10-03 12:27:22 +00:00
			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code key]`
			`+cell unicode`
			`+cell The match ID.`

			`+row("foot")`
			`+cell returns`
			`+cell bool`
			`+cell Whether the matcher contains rules for this match ID.`

			`+h(2, "add") PhraseMatcher.add`
			`+tag method`

			`p`
			`\| Add a rule to the matcher, consisting of an ID key, one or more patterns, and`
			`\| a callback function to act on the matches. The callback function will`
			`\| receive the arguments #[code matcher], #[code doc], #[code i] and`
			`\| #[code matches]. If a pattern already exists for the given ID, the`
			`\| patterns will be extended. An #[code on_match] callback will be`
			`\| overwritten.`

			`+aside-code("Example").`
			`def on_match(matcher, doc, id, matches):`
			`print('Matched!', matches)`

			`matcher = PhraseMatcher(nlp.vocab)`
			`matcher.add('OBAMA', on_match, nlp(u"Barack Obama"))`
			`matcher.add('HEALTH', on_match, nlp(u"health care reform"),`
			`nlp(u"healthcare reform"))`
			`doc = nlp(u"Barack Obama urges Congress to find courage to defend his healthcare reforms")`
			`matches = matcher(doc)`

			`+table(["Name", "Type", "Description"])`
			`+row`
			`+cell #[code match_id]`
			`+cell unicode`
			`+cell An ID for the thing you're matching.`

			`+row`
			`+cell #[code on_match]`
			`+cell callable or #[code None]`
			`+cell`
			`\| Callback function to act on matches. Takes the arguments`
			`\| #[code matcher], #[code doc], #[code i] and #[code matches].`

			`+row`
			`+cell #[code *docs]`
			`+cell list`
			`+cell`
			`\| #[code Doc] objects of the phrases to match.`