diff --git a/examples/matcher_example.py b/examples/matcher_example.py
new file mode 100644
index 000000000..d12fc8319
--- /dev/null
+++ b/examples/matcher_example.py
@@ -0,0 +1,133 @@
+from __future__ import unicode_literals, print_function
+
+import spacy.en
+import spacy.matcher
+from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
+
+import plac
+
+
+def main():
+    nlp = spacy.en.English()
+    example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works."
+    before = nlp(example)
+    print("Before")
+    for ent in before.ents:
+        print(ent.text, ent.label_, [w.tag_ for w in ent])
+    nlp.matcher.add(
+        "GoogleNow",  # Entity ID: Not really used at the moment.
+        "PRODUCT",  # Entity type: should be one of the types in the NER data
+        {"wiki_en": "Google_Now"},  # Arbitrary attributes. Currently unused.
+        [  # List of patterns that can be Surface Forms of the entity
+
+            # This Surface Form matches "Google Now", verbatim
+            [  # Each Surface Form is a list of Token Specifiers.
+                {  # This Token Specifier matches tokens whose orth field is "Google"
+                    ORTH: "Google"
+                },
+                {  # This Token Specifier matches tokens whose orth field is "Now"
+                    ORTH: "Now"
+                }
+            ],
+            [  # This Surface Form matches "google now", verbatim, and requires
+                # "google" to have the NNP tag. This helps prevent the pattern from
+                # matching cases like "I will google now to look up the time"
+                {
+                    ORTH: "google",
+                    TAG: "NNP"
+                },
+                {
+                    ORTH: "now"
+                }
+            ]
+        ]
+    )
+    after = nlp(example)
+    print("After")
+    for ent in after.ents:
+        print(ent.text, ent.label_, [w.tag_ for w in ent])
+    # You can customize attribute values in the lexicon, and then refer to the
+    # new attributes in your Token Specifiers.
+    # This is particularly good for word-set membership.
+    #
+    australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart',
+                           'Darwin', 'Adelaide', 'Perth']
+    # Internally, the tokenizer immediately maps each token to a pointer to a
+    # LexemeC struct. These structs hold various features, e.g. the integer IDs
+    # of the normalized string forms.
+    # For our purposes, the key attribute is a 64-bit integer, used as a bit field.
+    # spaCy currently only uses 12 of the bits for its built-in features, so
+    # the others are available for use. It's best to use the higher bits, as
+    # future versions of spaCy may add more flags. For instance, we might add
+    # a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user-field to
+    # FLAG63 here.
+    is_australian_capital = FLAG63
+    # Now we need to set the flag value. It's False on all tokens by default,
+    # so we just need to set it to True for the tokens we want.
+    # Here we iterate over the strings, and set it on only the literal matches.
+    for string in australian_capitals:
+        lexeme = nlp.vocab[string]
+        lexeme.set_flag(is_australian_capital, True)
+    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
+    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
+    # If we want case-insensitive matching, we have to be a little bit more
+    # round-about, as there's no case-insensitive index to the vocabulary. So
+    # we have to iterate over the vocabulary.
+    # We'll be looking up attribute IDs in this set a lot, so it's good to pre-build it
+    target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals}
+    for lexeme in nlp.vocab:
+        if lexeme.lower in target_ids:
+            lexeme.set_flag(is_australian_capital, True)
+    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
+    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
+    print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital))
+    # Now, let's use this in a pattern
+    nlp.matcher.add("AuCitySportsTeam", "ORG", {},
+        [
+            [
+                {LOWER: "the"},
+                {is_australian_capital: True},
+                {TAG: "NNS"}
+            ],
+            [
+                {LOWER: "the"},
+                {is_australian_capital: True},
+                {TAG: "NNPS"}
+            ],
+            [
+                {LOWER: "the"},
+                {IS_ALPHA: True},  # Allow a word in between, e.g. The Western Sydney
+                {is_australian_capital: True},
+                {TAG: "NNS"}
+            ],
+            [
+                {LOWER: "the"},
+                {IS_ALPHA: True},  # Allow a word in between, e.g. The Western Sydney
+                {is_australian_capital: True},
+                {TAG: "NNPS"}
+            ]
+        ])
+    doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders')
+    for ent in doc.ents:
+        print(ent.text, ent.label_)
+
+# Output
+# Before
+# Google ORG [u'NNP']
+# google ORG [u'VB']
+# google ORG [u'NNP']
+# After
+# Google Now PRODUCT [u'NNP', u'RB']
+# google ORG [u'VB']
+# google now PRODUCT [u'NNP', u'RB']
+# Sydney True
+# sydney False
+# Sydney True
+# sydney True
+# SYDNEY True
+# the Brisbane Broncos ORG
+# the South Darwin Spiders ORG
+
+if __name__ == '__main__':
+    main()
+