Update emoji/hashtag matcher example (resolves #2156) [ci skip]

This commit is contained in:
ines 2018-03-28 18:41:28 +02:00
parent ac88c72c9a
commit 9615ed5ed7
1 changed file with 28 additions and 19 deletions

View File

@@ -527,7 +527,7 @@ p
matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern matcher.add('SAD', label_sentiment, *neg_patterns) # add negative pattern
# add pattern to merge valid hashtag, i.e. '#' plus any ASCII token # add pattern to merge valid hashtag, i.e. '#' plus any ASCII token
matcher.add('HASHTAG', merge_hashtag, [{'ORTH': '#'}, {'IS_ASCII': True}]) matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])
p p
| Because the #[code on_match] callback receives the ID of each match, you | Because the #[code on_match] callback receives the ID of each match, you
@@ -541,11 +541,15 @@ p
| #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia], | #[+a("https://github.com/bcongdon/python-emojipedia") Emojipedia],
| we can also retrieve a short description for each emoji – for example, | we can also retrieve a short description for each emoji – for example,
| #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With | #[span.o-icon.o-icon--inline 😍]'s official title is "Smiling Face With
| Heart-Eyes". Assigning it to the merged token's norm will make it | Heart-Eyes". Assigning it to a
| available as #[code token.norm_]. | #[+a("/usage/processing-pipelines#custom-components-attributes") custom attribute]
| on the emoji span will make it available as #[code span._.emoji_desc].
+code. +code.
from emojipedia import Emojipedia # installation: pip install emojipedia from emojipedia import Emojipedia # installation: pip install emojipedia
from spacy.tokens import Span # get the global Span object
Span.set_extension('emoji_desc', default=None) # register the custom attribute
def label_sentiment(matcher, doc, i, matches): def label_sentiment(matcher, doc, i, matches):
match_id, start, end = matches[i] match_id, start, end = matches[i]
@@ -555,22 +559,27 @@ p
doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment doc.sentiment -= 0.1 # subtract 0.1 for negative sentiment
span = doc[start : end] span = doc[start : end]
emoji = Emojipedia.search(span[0].text) # get data for emoji emoji = Emojipedia.search(span[0].text) # get data for emoji
span.merge(norm=emoji.title) # merge span and set NORM to emoji title span._.emoji_desc = emoji.title # assign emoji description
p p
| To label the hashtags, we first need to add a new custom flag. | To label the hashtags, we first need to add a new custom flag.
| #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it | #[code IS_HASHTAG] will be the flag's ID, which you can use to assign it
| to the hashtag's span, and check its value via a token's | to the hashtag's span, and check its value via a token's
| #[+api("token#check_flag") #[code check_flag()]] method. On each | #[+api("token#check_flag") #[code check_flag()]] method. On each
| match, we merge the hashtag and assign the flag. | match, we merge the hashtag and assign the flag. Alternatively, we
| could also use a
| #[+a("/usage/processing-pipelines#custom-components-attributes") custom attribute],
| e.g. #[code token._.is_hashtag].
+code. +code.
# Add a new custom flag to the vocab, which is always False by default # Add a new custom flag to the vocab, which is always False by default
IS_HASHTAG = nlp.vocab.add_flag(lambda text: False) IS_HASHTAG = nlp.vocab.add_flag(lambda text: False)
def merge_hashtag(matcher, doc, i, matches): matches = matcher(doc)
match_id, start, end = matches[i] spans = []
span = doc[start : end] for match_id, start, end in matches:
spans.append(doc[start : end])
for span in spans:
span.merge() # merge hashtag span.merge() # merge hashtag
span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True span.set_flag(IS_HASHTAG, True) # set IS_HASHTAG to True