* Add examples for Matcher, to answer Issue #105. TODO: Integrate into docs properly.

2015-09-27 18:08:00 +10:00 · 2015-09-27 18:08:00 +10:00 · c17e2f2f20
parent 60fbbfcaa2
commit c17e2f2f20
1 changed files with 133 additions and 0 deletions
--- a/examples/matcher_example.py
+++ b/examples/matcher_example.py
@@ -0,0 +1,133 @@
+from __future__ import unicode_literals, print_function
+
+import spacy.en
+import spacy.matcher
+from spacy.attrs import ORTH, TAG, LOWER, IS_ALPHA, FLAG63
+
+import plac
+
+
+def main():  # Demo: print entities before and after registering custom Matcher patterns.
+    nlp = spacy.en.English()
+    example = u"I prefer Siri to Google Now. I'll google now to find out how the google now service works."
+    before = nlp(example)
+    print("Before")
+    for ent in before.ents:
+        print(ent.text, ent.label_, [w.tag_ for w in ent])
+    nlp.matcher.add(
+        "GoogleNow", # Entity ID: Not really used at the moment.
+        "PRODUCT",   # Entity type: should be one of the types in the NER data
+        {"wiki_en": "Google_Now"}, # Arbitrary attributes. Currently unused.
+        [  # List of patterns that can be Surface Forms of the entity
+
+            # This Surface Form matches "Google Now", verbatim
+            [ # Each Surface Form is a list of Token Specifiers.
+                { # This Token Specifier matches tokens whose orth field is "Google"
+                    ORTH: "Google"
+                },
+                { # This Token Specifier matches tokens whose orth field is "Now"
+                    ORTH: "Now"
+                }
+            ],
+            [ # This Surface Form matches "google now", verbatim, and requires
+              # "google" to have the NNP tag. This helps prevent the pattern from
+              # matching cases like "I will google now to look up the time"
+                {
+                    ORTH: "google",
+                    TAG: "NNP"
+                },
+                {
+                    ORTH: "now"
+                }
+            ]
+        ]
+    )
+    after = nlp(example)
+    print("After")
+    for ent in after.ents:
+        print(ent.text, ent.label_, [w.tag_ for w in ent])
+    # You can customize attribute values in the lexicon, and then refer to the
+    # new attributes in your Token Specifiers.
+    # This is particularly good for word-set membership.
+    # The word set used in this example is the list of Australian capitals below.
+    australian_capitals = ['Brisbane', 'Sydney', 'Canberra', 'Melbourne', 'Hobart',
+                           'Darwin', 'Adelaide', 'Perth']
+    # Internally, the tokenizer immediately maps each token to a pointer to a
+    # LexemeC struct. These structs hold various features, e.g. the integer IDs
+    # of the normalized string forms.
+    # For our purposes, the key attribute is a 64-bit integer, used as a bit field.
+    # spaCy currently only uses 12 of the bits for its built-in features, so
+    # the others are available for use. It's best to use the higher bits, as
+    # future versions of spaCy may add more flags. For instance, we might add
+    # a built-in IS_MONTH flag, taking up FLAG13. So, we bind our user-field to
+    # FLAG63 here.
+    is_australian_capital = FLAG63
+    # Now we need to set the flag value. It's False on all tokens by default,
+    # so we just need to set it to True for the tokens we want.
+    # Here we iterate over the strings, and set it on only the literal matches.
+    for string in australian_capitals:
+        lexeme = nlp.vocab[string]
+        lexeme.set_flag(is_australian_capital, True)
+    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
+    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
+    # If we want case-insensitive matching, we have to be a little bit more
+    # round-about, as there's no case-insensitive index to the vocabulary. So
+    # we have to iterate over the vocabulary.
+    # This set is probed once per lexeme in the loop below, so we pre-build it.
+    target_ids = {nlp.vocab.strings[s.lower()] for s in australian_capitals}
+    for lexeme in nlp.vocab:
+        if lexeme.lower in target_ids:
+            lexeme.set_flag(is_australian_capital, True)
+    print('Sydney', nlp.vocab[u'Sydney'].check_flag(is_australian_capital))
+    print('sydney', nlp.vocab[u'sydney'].check_flag(is_australian_capital))
+    print('SYDNEY', nlp.vocab[u'SYDNEY'].check_flag(is_australian_capital))
+    # Now, let's use this in a pattern
+    nlp.matcher.add("AuCitySportsTeam", "ORG", {},
+        [
+            [
+                {LOWER: "the"},
+                {is_australian_capital: True},
+                {TAG: "NNS"}
+            ],
+            [
+                {LOWER: "the"},
+                {is_australian_capital: True},
+                {TAG: "NNPS"}
+            ],
+            [
+                {LOWER: "the"},
+                {IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney
+                {is_australian_capital: True},
+                {TAG: "NNS"}
+            ],
+            [
+                {LOWER: "the"},
+                {IS_ALPHA: True}, # Allow a word in between, e.g. The Western Sydney
+                {is_australian_capital: True},
+                {TAG: "NNPS"}
+            ]
+        ])
+    doc = nlp(u'The pattern should match the Brisbane Broncos and the South Darwin Spiders, but not the Colorado Boulders')
+    for ent in doc.ents:
+        print(ent.text, ent.label_)
+
+# Output
+# Before
+# Google ORG [u'NNP']
+# google ORG [u'VB']
+# google ORG [u'NNP']
+# After
+# Google Now PRODUCT [u'NNP', u'RB']
+# google ORG [u'VB']
+# google now PRODUCT [u'NNP', u'RB']
+# Sydney True
+# sydney False
+# Sydney True
+# sydney True
+# SYDNEY True
+# the Brisbane Broncos ORG
+# the South Darwin Spiders ORG
+
+if __name__ == '__main__':  # Run the demo only when executed as a script, not on import.
+    main()
+