diff --git a/examples/pipeline/custom_attr_methods.py b/examples/pipeline/custom_attr_methods.py index 9b1a8325d..741541b06 100644 --- a/examples/pipeline/custom_attr_methods.py +++ b/examples/pipeline/custom_attr_methods.py @@ -1,35 +1,60 @@ +#!/usr/bin/env python # coding: utf-8 """This example contains several snippets of methods that can be set via custom Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like they're "bound" to the object and are partially applied – i.e. the object -they're called on is passed in as the first argument.""" +they're called on is passed in as the first argument. + +* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals +import plac from spacy.lang.en import English from spacy.tokens import Doc, Span from spacy import displacy from pathlib import Path +@plac.annotations( + output_dir=("Output directory for saved HTML", "positional", None, Path)) +def main(output_dir=None): + nlp = English() # start off with blank English class + + Doc.set_extension('overlap', method=overlap_tokens) + doc1 = nlp(u"Peach emoji is where it has always been.") + doc2 = nlp(u"Peach is the superior emoji.") + print("Text 1:", doc1.text) + print("Text 2:", doc2.text) + print("Overlapping tokens:", doc1._.overlap(doc2)) + + Doc.set_extension('to_html', method=to_html) + doc = nlp(u"This is a sentence about Apple.") + # add entity manually for demo purposes, to make it work without a model + doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] + print("Text:", doc.text) + doc._.to_html(output=output_dir, style='ent') + + def to_html(doc, output='/tmp', style='dep'): """Doc method extension for saving the current state as a displaCy visualization. 
""" # generate filename from first six non-punct tokens file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html' - output_path = Path(output) / file_name html = displacy.render(doc, style=style, page=True) # render markup - output_path.open('w', encoding='utf-8').write(html) # save to file - print('Saved HTML to {}'.format(output_path)) - - -Doc.set_extension('to_html', method=to_html) - -nlp = English() -doc = nlp(u"This is a sentence about Apple.") -# add entity manually for demo purposes, to make it work without a model -doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])] -doc._.to_html(style='ent') + if output is not None: + output_path = Path(output) + if not output_path.exists(): + output_path.mkdir() + output_file = Path(output) / file_name + output_file.open('w', encoding='utf-8').write(html) # save to file + print('Saved HTML to {}'.format(output_file)) + else: + print(html) def overlap_tokens(doc, other_doc): @@ -43,10 +68,10 @@ def overlap_tokens(doc, other_doc): return overlap -Doc.set_extension('overlap', method=overlap_tokens) +if __name__ == '__main__': + plac.call(main) -nlp = English() -doc1 = nlp(u"Peach emoji is where it has always been.") -doc2 = nlp(u"Peach is the superior emoji.") -tokens = doc1._.overlap(doc2) -print(tokens) + # Expected output: + # Text 1: Peach emoji is where it has always been. + # Text 2: Peach is the superior emoji. + # Overlapping tokens: [Peach, emoji, is, .] 
diff --git a/examples/pipeline/custom_component_countries_api.py b/examples/pipeline/custom_component_countries_api.py index 2554af967..38eec7384 100644 --- a/examples/pipeline/custom_component_countries_api.py +++ b/examples/pipeline/custom_component_countries_api.py @@ -1,21 +1,45 @@ -# coding: utf-8 +#!/usr/bin/env python +# coding: utf8 +"""Example of a spaCy v2.0 pipeline component that requests all countries via +the REST Countries API, merges country names into one token, assigns entity +labels and sets attributes on country tokens, e.g. the capital and lat/lng +coordinates. Can be extended with more details from the API. + +* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0) +* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals import requests - +import plac from spacy.lang.en import English from spacy.matcher import PhraseMatcher from spacy.tokens import Doc, Span, Token -class RESTCountriesComponent(object): - """Example of a spaCy v2.0 pipeline component that requests all countries - via the REST Countries API, merges country names into one token, assigns - entity labels and sets attributes on country tokens, e.g. the capital and - lat/lng coordinates. Can be extended with more details from the API. +def main(): + # For simplicity, we start off with only the blank English Language class + # and no model or pre-defined pipeline loaded. 
+ nlp = English() + rest_countries = RESTCountriesComponent(nlp) # initialise component + nlp.add_pipe(rest_countries) # add it to the pipeline + doc = nlp(u"Some text about Colombia and the Czech Republic") + print('Pipeline', nlp.pipe_names) # pipeline contains component name + print('Doc has countries', doc._.has_country) # Doc contains countries + for token in doc: + if token._.is_country: + print(token.text, token._.country_capital, token._.country_latlng, + token._.country_flag) # country data + print('Entities', [(e.text, e.label_) for e in doc.ents]) # entities - REST Countries API: https://restcountries.eu - API License: Mozilla Public License MPL 2.0 + +class RESTCountriesComponent(object): + """spaCy v2.0 pipeline component that requests all countries via + the REST Countries API, merges country names into one token, assigns entity + labels and sets attributes on country tokens. """ name = 'rest_countries' # component name, will show up in the pipeline @@ -90,19 +114,12 @@ class RESTCountriesComponent(object): return any([t._.get('is_country') for t in tokens]) -# For simplicity, we start off with only the blank English Language class and -# no model or pre-defined pipeline loaded. 
+if __name__ == '__main__': + plac.call(main) -nlp = English() -rest_countries = RESTCountriesComponent(nlp) # initialise component -nlp.add_pipe(rest_countries) # add it to the pipeline - -doc = nlp(u"Some text about Colombia and the Czech Republic") - -print('Pipeline', nlp.pipe_names) # pipeline contains component name -print('Doc has countries', doc._.has_country) # Doc contains countries -for token in doc: - if token._.is_country: - print(token.text, token._.country_capital, token._.country_latlng, - token._.country_flag) # country data -print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities + # Expected output: + # Pipeline ['rest_countries'] + # Doc has countries True + # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg + # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg + # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')] diff --git a/examples/pipeline/custom_component_entities.py b/examples/pipeline/custom_component_entities.py index a0d9c61ec..050a89905 100644 --- a/examples/pipeline/custom_component_entities.py +++ b/examples/pipeline/custom_component_entities.py @@ -1,11 +1,45 @@ -# coding: utf-8 +#!/usr/bin/env python +# coding: utf8 +"""Example of a spaCy v2.0 pipeline component that sets entity annotations +based on list of single or multiple-word company names. Companies are +labelled as ORG and their spans are merged into one token. Additionally, +._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token +respectively. 
+ +* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components + +Developed for: spaCy 2.0.0a17 +Last updated for: spaCy 2.0.0a18 +""" from __future__ import unicode_literals +import plac from spacy.lang.en import English from spacy.matcher import PhraseMatcher from spacy.tokens import Doc, Span, Token +@plac.annotations( + text=("Text to process", "positional", None, str), + companies=("Names of technology companies", "positional", None, str)) +def main(text="Alphabet Inc. is the company behind Google.", *companies): + # For simplicity, we start off with only the blank English Language class + # and no model or pre-defined pipeline loaded. + nlp = English() + if not companies: # set default companies if none are set via args + companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc. + component = TechCompanyRecognizer(nlp, companies) # initialise component + nlp.add_pipe(component, last=True) # add last to the pipeline + + doc = nlp(text) + print('Pipeline', nlp.pipe_names) # pipeline contains component name + print('Tokens', [t.text for t in doc]) # company names from the list are merged + print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs + print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org + print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not + print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities + + class TechCompanyRecognizer(object): """Example of a spaCy v2.0 pipeline component that sets entity annotations based on list of single or multiple-word company names. Companies are @@ -67,19 +101,13 @@ class TechCompanyRecognizer(object): return any([t._.get('is_tech_org') for t in tokens]) -# For simplicity, we start off with only the blank English Language class and -# no model or pre-defined pipeline loaded. 
+if __name__ == '__main__': + plac.call(main) -nlp = English() -companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc. -component = TechCompanyRecognizer(nlp, companies) # initialise component -nlp.add_pipe(component, last=True) # add it to the pipeline as the last element - -doc = nlp(u"Alphabet Inc. is the company behind Google.") - -print('Pipeline', nlp.pipe_names) # pipeline contains component name -print('Tokens', [t.text for t in doc]) # company names from the list are merged -print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs -print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org -print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not -print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities + # Expected output: + # Pipeline ['tech_companies'] + # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.'] + # Doc has_tech_org True + # Token 0 is_tech_org True + # Token 1 is_tech_org False + # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]