From fb923b31ea3a4b815e3ef3d28dad126d375aa9b3 Mon Sep 17 00:00:00 2001
From: ines
Date: Sat, 26 May 2018 17:57:02 +0200
Subject: [PATCH] Fix bad HTML example (see #2376) and turn it into section on matcher + components

Avoid problems caused by merging while matching (e.g. index errors).
Creating a Matcher component also better reflects the recommended best
practices.
---
 .../_rule-based-matching.jade | 100 +++++++++++-------
 1 file changed, 64 insertions(+), 36 deletions(-)

diff --git a/website/usage/_linguistic-features/_rule-based-matching.jade b/website/usage/_linguistic-features/_rule-based-matching.jade
index c0d418d46..094f15b90 100644
--- a/website/usage/_linguistic-features/_rule-based-matching.jade
+++ b/website/usage/_linguistic-features/_rule-based-matching.jade
@@ -260,41 +260,6 @@ p
     doc = nlp(u"This is a text about Google I/O 2015.")
     matches = matcher(doc)
 
-p
-    | In addition to mentions of "Google I/O", your data also contains some
-    | annoying pre-processing artefacts, like leftover HTML line breaks
-    | (e.g. #[code <br>] or #[code <BR/>]). While you're at it,
-    | you want to merge those into one token and flag them, to make sure you
-    | can easily ignore them later. So you add a second pattern and pass in a
-    | function #[code merge_and_flag]:
-
-+code-exec.
-    import spacy
-    from spacy.matcher import Matcher
-    from spacy.tokens import Token
-
-    nlp = spacy.load('en_core_web_sm')
-    matcher = Matcher(nlp.vocab)
-    # register a new token extension to flag bad HTML
-    Token.set_extension('bad_html', default=False)
-
-    def merge_and_flag(matcher, doc, i, matches):
-        match_id, start, end = matches[i]
-        span = doc[start : end]
-        span.merge(is_stop=True)  # merge (and mark it as a stop word, just in case)
-        for token in span:
-            token._.bad_html = True  # mark token as bad HTML
-        print(span.text)
-
-    matcher.add('BAD_HTML', merge_and_flag,
-                [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
-                [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}])
-
-    doc = nlp(u"Hello<br>world!")
-    matches = matcher(doc)
-    for token in doc:
-        print(token.text, token._.bad_html)
-
 +aside("Tip: Visualizing matches")
     | When working with entities, you can use #[+api("top-level#displacy") displaCy]
     | to quickly generate a NER visualization from your updated #[code Doc],
@@ -315,7 +280,7 @@ p
     | that was matched, and invoke it.
 
 +code.
-    doc = nlp(LOTS_OF_TEXT)
+    doc = nlp(YOUR_TEXT_HERE)
     matcher(doc)
 
 p
@@ -348,6 +313,69 @@ p
     | A list of #[code (match_id, start, end)] tuples, describing the
     | matches. A match tuple describes a span #[code doc[start:end]].
 
++h(3, "matcher-pipeline") Using custom pipeline components
+
+p
+    | Let's say your data also contains some annoying pre-processing artefacts,
+    | like leftover HTML line breaks (e.g. #[code <br>] or
+    | #[code <BR/>]). To make your text easier to analyse, you want to
+    | merge those into one token and flag them so you can easily ignore them
+    | later. Ideally, this should all be done automatically as you process
+    | the text. You can achieve this by adding a
+    | #[+a("/usage/processing-pipelines#custom-components") custom pipeline component]
+    | that's called on each #[code Doc] object, merges the leftover HTML spans
+    | and sets a custom attribute #[code bad_html] on the merged tokens.
+
++code-exec.
+    import spacy
+    from spacy.matcher import Matcher
+    from spacy.tokens import Token
+
+    # we're using a class because the component needs to be initialised with
+    # the shared vocab via the nlp object
+    class BadHTMLMerger(object):
+        def __init__(self, nlp):
+            # register a new token extension to flag bad HTML
+            Token.set_extension('bad_html', default=False)
+            self.matcher = Matcher(nlp.vocab)
+            self.matcher.add('BAD_HTML', None,
+                             [{'ORTH': '<'}, {'LOWER': 'br'}, {'ORTH': '>'}],
+                             [{'ORTH': '<'}, {'LOWER': 'br/'}, {'ORTH': '>'}])
+
+        def __call__(self, doc):
+            # this method is invoked when the component is called on a Doc
+            matches = self.matcher(doc)
+            # collect the matched spans first and only merge them afterwards:
+            # merging changes the token indices, so merging while iterating
+            # over the matches would invalidate the remaining matches
+            spans = []
+            for match_id, start, end in matches:
+                spans.append(doc[start:end])
+            for span in spans:
+                span.merge(is_stop=True)  # merge (and mark it as a stop word)
+                for token in span:
+                    token._.bad_html = True  # mark token as bad HTML
+            return doc
+
+    nlp = spacy.load('en_core_web_sm')
+    html_merger = BadHTMLMerger(nlp)
+    nlp.add_pipe(html_merger, last=True)  # add component to the pipeline
+    doc = nlp(u"Hello<br>world! <br/> This is a test.")
+    for token in doc:
+        print(token.text, token._.bad_html)
+
+p
+    | Instead of hard-coding the patterns into the component, you could also
+    | make it take a path to a JSON file containing the patterns. This lets
+    | you reuse the component with different patterns, depending on your
+    | application:
+
++code.
+    html_merger = BadHTMLMerger(nlp, path='/path/to/patterns.json')
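+
+p
+    | How the component reads that file is up to you. Below is a minimal
+    | sketch of what such an #[code __init__] could look like. The
+    | #[code path] argument and the JSON format are assumptions made for
+    | this example, not part of spaCy's API:
+
++code.
+    import json
+    from spacy.matcher import Matcher
+    from spacy.tokens import Token
+
+    class BadHTMLMerger(object):
+        def __init__(self, nlp, path):
+            Token.set_extension('bad_html', default=False)
+            self.matcher = Matcher(nlp.vocab)
+            # assumption: the file contains a JSON list of pattern lists,
+            # e.g. [[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}]]
+            with open(path) as f:
+                patterns = json.load(f)
+            # each pattern list is passed as its own positional argument
+            self.matcher.add('BAD_HTML', None, *patterns)
+
+        # the __call__ method stays the same as in the example above
+
++infobox
+    | For more details and examples of how to write
+    | #[strong custom pipeline components] and use
+    | #[strong extension attributes], see the
+    | #[+a("/usage/processing-pipelines") usage guide].
+
 +h(3, "regex") Using regular expressions
 
 p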