Update pipeline component examples to use plac

2017-10-27 02:58:14 +02:00 · 2017-10-27 02:58:14 +02:00 · 44f83b35bc
parent af28ca1ba0
commit 44f83b35bc
3 changed files with 129 additions and 59 deletions
--- a/examples/pipeline/custom_attr_methods.py
+++ b/examples/pipeline/custom_attr_methods.py
@ -1,35 +1,60 @@
+#!/usr/bin/env python
 # coding: utf-8
 """This example contains several snippets of methods that can be set via custom
 Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
 they're "bound" to the object and are partially applied – i.e. the object
-they're called on is passed in as the first argument."""
+they're called on is passed in as the first argument.
+
+* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
+
+Developed for: spaCy 2.0.0a17
+Last updated for: spaCy 2.0.0a18
+"""
 from __future__ import unicode_literals

+import plac
 from spacy.lang.en import English
 from spacy.tokens import Doc, Span
 from spacy import displacy
 from pathlib import Path


+@plac.annotations(
+    output_dir=("Output directory for saved HTML", "positional", None, Path))
+def main(output_dir=None):
+    nlp = English()  # start off with blank English class
+
+    Doc.set_extension('overlap', method=overlap_tokens)
+    doc1 = nlp(u"Peach emoji is where it has always been.")
+    doc2 = nlp(u"Peach is the superior emoji.")
+    print("Text 1:", doc1.text)
+    print("Text 2:", doc2.text)
+    print("Overlapping tokens:", doc1._.overlap(doc2))
+
+    Doc.set_extension('to_html', method=to_html)
+    doc = nlp(u"This is a sentence about Apple.")
+    # add entity manually for demo purposes, to make it work without a model
+    doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
+    print("Text:", doc.text)
+    doc._.to_html(output=output_dir, style='ent')
+
+
 def to_html(doc, output='/tmp', style='dep'):
    """Doc method extension for saving the current state as a displaCy
    visualization.
    """
    # generate filename from first six non-punct tokens
    file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
-    output_path = Path(output) / file_name
    html = displacy.render(doc, style=style, page=True)  # render markup
-    output_path.open('w', encoding='utf-8').write(html)  # save to file
-    print('Saved HTML to {}'.format(output_path))
-
-
-Doc.set_extension('to_html', method=to_html)
-
-nlp = English()
-doc = nlp(u"This is a sentence about Apple.")
-# add entity manually for demo purposes, to make it work without a model
-doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
-doc._.to_html(style='ent')
+    if output is not None:
+        output_path = Path(output)
+        if not output_path.exists():
+            output_path.mkdir()
+        output_file = Path(output) / file_name
+        output_file.open('w', encoding='utf-8').write(html)  # save to file
+        print('Saved HTML to {}'.format(output_file))
+    else:
+        print(html)


 def overlap_tokens(doc, other_doc):
@ -43,10 +68,10 @@ def overlap_tokens(doc, other_doc):
    return overlap


-Doc.set_extension('overlap', method=overlap_tokens)
+if __name__ == '__main__':
+    plac.call(main)

-nlp = English()
-doc1 = nlp(u"Peach emoji is where it has always been.")
-doc2 = nlp(u"Peach is the superior emoji.")
-tokens = doc1._.overlap(doc2)
-print(tokens)
+    # Expected output:
+    # Text 1: Peach emoji is where it has always been.
+    # Text 2: Peach is the superior emoji.
+    # Overlapping tokens: [Peach, emoji, is, .]
--- a/examples/pipeline/custom_component_countries_api.py
+++ b/examples/pipeline/custom_component_countries_api.py
@ -1,21 +1,45 @@
-# coding: utf-8
+#!/usr/bin/env python
+# coding: utf8
+"""Example of a spaCy v2.0 pipeline component that requests all countries via
+the REST Countries API, merges country names into one token, assigns entity
+labels and sets attributes on country tokens, e.g. the capital and lat/lng
+coordinates. Can be extended with more details from the API.
+
+* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
+* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
+
+Developed for: spaCy 2.0.0a17
+Last updated for: spaCy 2.0.0a18
+"""
 from __future__ import unicode_literals

 import requests
-
+import plac
 from spacy.lang.en import English
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc, Span, Token


-class RESTCountriesComponent(object):
-    """Example of a spaCy v2.0 pipeline component that requests all countries
-    via the REST Countries API, merges country names into one token, assigns
-    entity labels and sets attributes on country tokens, e.g. the capital and
-    lat/lng coordinates. Can be extended with more details from the API.
+def main():
+    # For simplicity, we start off with only the blank English Language class
+    # and no model or pre-defined pipeline loaded.
+    nlp = English()
+    rest_countries = RESTCountriesComponent(nlp)  # initialise component
+    nlp.add_pipe(rest_countries) # add it to the pipeline
+    doc = nlp(u"Some text about Colombia and the Czech Republic")
+    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
+    print('Doc has countries', doc._.has_country)  # Doc contains countries
+    for token in doc:
+        if token._.is_country:
+            print(token.text, token._.country_capital, token._.country_latlng,
+                token._.country_flag)  # country data
+    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities

-    REST Countries API: https://restcountries.eu
-    API License: Mozilla Public License MPL 2.0
+
+class RESTCountriesComponent(object):
+    """spaCy v2.0 pipeline component that requests all countries via
+    the REST Countries API, merges country names into one token, assigns entity
+    labels and sets attributes on country tokens.
    """
    name = 'rest_countries' # component name, will show up in the pipeline

@ -90,19 +114,12 @@ class RESTCountriesComponent(object):
        return any([t._.get('is_country') for t in tokens])


-# For simplicity, we start off with only the blank English Language class and
-# no model or pre-defined pipeline loaded.
+if __name__ == '__main__':
+    plac.call(main)

-nlp = English()
-rest_countries = RESTCountriesComponent(nlp)  # initialise component
-nlp.add_pipe(rest_countries) # add it to the pipeline
-
-doc = nlp(u"Some text about Colombia and the Czech Republic")
-
-print('Pipeline', nlp.pipe_names)  # pipeline contains component name
-print('Doc has countries', doc._.has_country)  # Doc contains countries
-for token in doc:
-    if token._.is_country:
-        print(token.text, token._.country_capital, token._.country_latlng,
-              token._.country_flag)  # country data
-print('Entities', [(e.text, e.label_) for e in doc.ents])  # all countries are entities
+    # Expected output:
+    # Pipeline ['rest_countries']
+    # Doc has countries True
+    # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
+    # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
+    # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]
--- a/examples/pipeline/custom_component_entities.py
+++ b/examples/pipeline/custom_component_entities.py
@ -1,11 +1,45 @@
-# coding: utf-8
+#!/usr/bin/env python
+# coding: utf8
+"""Example of a spaCy v2.0 pipeline component that sets entity annotations
+based on a list of single or multiple-word company names. Companies are
+labelled as ORG and their spans are merged into one token. Additionally,
+._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
+respectively.
+
+* Custom pipeline components: https://alpha.spacy.io/usage/processing-pipelines#custom-components
+
+Developed for: spaCy 2.0.0a17
+Last updated for: spaCy 2.0.0a18
+"""
 from __future__ import unicode_literals

+import plac
 from spacy.lang.en import English
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc, Span, Token


+@plac.annotations(
+    text=("Text to process", "positional", None, str),
+    companies=("Names of technology companies", "positional", None, str))
+def main(text="Alphabet Inc. is the company behind Google.", *companies):
+    # For simplicity, we start off with only the blank English Language class
+    # and no model or pre-defined pipeline loaded.
+    nlp = English()
+    if not companies:  # set default companies if none are set via args
+        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
+    component = TechCompanyRecognizer(nlp, companies)  # initialise component
+    nlp.add_pipe(component, last=True)  # add last to the pipeline
+
+    doc = nlp(text)
+    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
+    print('Tokens', [t.text for t in doc])  # company names from the list are merged
+    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
+    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
+    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
+    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
+
+
 class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on list of single or multiple-word company names. Companies are
@ -67,19 +101,13 @@ class TechCompanyRecognizer(object):
        return any([t._.get('is_tech_org') for t in tokens])


-# For simplicity, we start off with only the blank English Language class and
-# no model or pre-defined pipeline loaded.
+if __name__ == '__main__':
+    plac.call(main)

-nlp = English()
-companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
-component = TechCompanyRecognizer(nlp, companies)  # initialise component
-nlp.add_pipe(component, last=True)  # add it to the pipeline as the last element
-
-doc = nlp(u"Alphabet Inc. is the company behind Google.")
-
-print('Pipeline', nlp.pipe_names)  # pipeline contains component name
-print('Tokens', [t.text for t in doc])  # company names from the list are merged
-print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
-print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
-print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
-print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
+    # Expected output:
+    # Pipeline ['tech_companies']
+    # Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
+    # Doc has_tech_org True
+    # Token 0 is_tech_org True
+    # Token 1 is_tech_org False
+    # Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]