Update pipeline component examples to use plac

This commit is contained in:
ines 2017-10-27 02:58:14 +02:00
parent af28ca1ba0
commit 44f83b35bc
3 changed files with 129 additions and 59 deletions

View File

@ -1,35 +1,60 @@
#!/usr/bin/env python
# coding: utf-8
"""This example contains several snippets of methods that can be set via custom
Doc, Token or Span attributes in spaCy v2.0. Attribute methods act like
they're "bound" to the object and are partially applied i.e. the object
they're called on is passed in as the first argument."""
they're called on is passed in as the first argument.
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17
Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals
import plac
from spacy.lang.en import English
from spacy.tokens import Doc, Span
from spacy import displacy
from pathlib import Path
@plac.annotations(
output_dir=("Output directory for saved HTML", "positional", None, Path))
def main(output_dir=None):
nlp = English() # start off with blank English class
Doc.set_extension('overlap', method=overlap_tokens)
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
print("Text 1:", doc1.text)
print("Text 2:", doc2.text)
print("Overlapping tokens:", doc1._.overlap(doc2))
Doc.set_extension('to_html', method=to_html)
doc = nlp(u"This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
print("Text:", doc.text)
doc._.to_html(output=output_dir, style='ent')
def to_html(doc, output='/tmp', style='dep'):
"""Doc method extension for saving the current state as a displaCy
visualization.
"""
# generate filename from first six non-punct tokens
file_name = '-'.join([w.text for w in doc[:6] if not w.is_punct]) + '.html'
output_path = Path(output) / file_name
html = displacy.render(doc, style=style, page=True) # render markup
output_path.open('w', encoding='utf-8').write(html) # save to file
print('Saved HTML to {}'.format(output_path))
Doc.set_extension('to_html', method=to_html)
nlp = English()
doc = nlp(u"This is a sentence about Apple.")
# add entity manually for demo purposes, to make it work without a model
doc.ents = [Span(doc, 5, 6, label=nlp.vocab.strings['ORG'])]
doc._.to_html(style='ent')
if output is not None:
output_path = Path(output)
if not output_path.exists():
output_path.mkdir()
output_file = Path(output) / file_name
output_file.open('w', encoding='utf-8').write(html) # save to file
print('Saved HTML to {}'.format(output_file))
else:
print(html)
def overlap_tokens(doc, other_doc):
@ -43,10 +68,10 @@ def overlap_tokens(doc, other_doc):
return overlap
Doc.set_extension('overlap', method=overlap_tokens)
if __name__ == '__main__':
plac.call(main)
nlp = English()
doc1 = nlp(u"Peach emoji is where it has always been.")
doc2 = nlp(u"Peach is the superior emoji.")
tokens = doc1._.overlap(doc2)
print(tokens)
# Expected output:
# Text 1: Peach emoji is where it has always been.
# Text 2: Peach is the superior emoji.
# Overlapping tokens: [Peach, emoji, is, .]

View File

@ -1,21 +1,45 @@
# coding: utf-8
#!/usr/bin/env python
# coding: utf8
"""Example of a spaCy v2.0 pipeline component that requests all countries via
the REST Countries API, merges country names into one token, assigns entity
labels and sets attributes on country tokens, e.g. the capital and lat/lng
coordinates. Can be extended with more details from the API.
* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17
Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals
import requests
import plac
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
class RESTCountriesComponent(object):
"""Example of a spaCy v2.0 pipeline component that requests all countries
via the REST Countries API, merges country names into one token, assigns
entity labels and sets attributes on country tokens, e.g. the capital and
lat/lng coordinates. Can be extended with more details from the API.
def main():
# For simplicity, we start off with only the blank English Language class
# and no model or pre-defined pipeline loaded.
nlp = English()
rest_countries = RESTCountriesComponent(nlp) # initialise component
nlp.add_pipe(rest_countries) # add it to the pipeline
doc = nlp(u"Some text about Colombia and the Czech Republic")
print('Pipeline', nlp.pipe_names) # pipeline contains component name
print('Doc has countries', doc._.has_country) # Doc contains countries
for token in doc:
if token._.is_country:
print(token.text, token._.country_capital, token._.country_latlng,
token._.country_flag) # country data
print('Entities', [(e.text, e.label_) for e in doc.ents]) # entities
REST Countries API: https://restcountries.eu
API License: Mozilla Public License MPL 2.0
class RESTCountriesComponent(object):
"""spaCy v2.0 pipeline component that requests all countries via
the REST Countries API, merges country names into one token, assigns entity
labels and sets attributes on country tokens.
"""
name = 'rest_countries' # component name, will show up in the pipeline
@ -90,19 +114,12 @@ class RESTCountriesComponent(object):
return any([t._.get('is_country') for t in tokens])
# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.
if __name__ == '__main__':
plac.call(main)
nlp = English()
rest_countries = RESTCountriesComponent(nlp) # initialise component
nlp.add_pipe(rest_countries) # add it to the pipeline
doc = nlp(u"Some text about Colombia and the Czech Republic")
print('Pipeline', nlp.pipe_names) # pipeline contains component name
print('Doc has countries', doc._.has_country) # Doc contains countries
for token in doc:
if token._.is_country:
print(token.text, token._.country_capital, token._.country_latlng,
token._.country_flag) # country data
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all countries are entities
# Expected output:
# Pipeline ['rest_countries']
# Doc has countries True
# Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
# Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
# Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]

View File

@ -1,11 +1,45 @@
# coding: utf-8
#!/usr/bin/env python
# coding: utf8
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
based on list of single or multiple-word company names. Companies are
labelled as ORG and their spans are merged into one token. Additionally,
._.has_tech_org and ._.is_tech_org is set on the Doc/Span and Token
respectively.
* Custom pipeline components: https://alpha.spacy.io//usage/processing-pipelines#custom-components
Developed for: spaCy 2.0.0a17
Last updated for: spaCy 2.0.0a18
"""
from __future__ import unicode_literals
import plac
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
@plac.annotations(
text=("Text to process", "positional", None, str),
companies=("Names of technology companies", "positional", None, str))
def main(text="Alphabet Inc. is the company behind Google.", *companies):
# For simplicity, we start off with only the blank English Language class
# and no model or pre-defined pipeline loaded.
nlp = English()
if not companies: # set default companies if none are set via args
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc.
component = TechCompanyRecognizer(nlp, companies) # initialise component
nlp.add_pipe(component, last=True) # add last to the pipeline
doc = nlp(text)
print('Pipeline', nlp.pipe_names) # pipeline contains component name
print('Tokens', [t.text for t in doc]) # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
class TechCompanyRecognizer(object):
"""Example of a spaCy v2.0 pipeline component that sets entity annotations
based on list of single or multiple-word company names. Companies are
@ -67,19 +101,13 @@ class TechCompanyRecognizer(object):
return any([t._.get('is_tech_org') for t in tokens])
# For simplicity, we start off with only the blank English Language class and
# no model or pre-defined pipeline loaded.
if __name__ == '__main__':
plac.call(main)
nlp = English()
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple'] # etc.
component = TechCompanyRecognizer(nlp, companies) # initialise component
nlp.add_pipe(component, last=True) # add it to the pipeline as the last element
doc = nlp(u"Alphabet Inc. is the company behind Google.")
print('Pipeline', nlp.pipe_names) # pipeline contains component name
print('Tokens', [t.text for t in doc]) # company names from the list are merged
print('Doc has_tech_org', doc._.has_tech_org) # Doc contains tech orgs
print('Token 0 is_tech_org', doc[0]._.is_tech_org) # "Alphabet Inc." is a tech org
print('Token 1 is_tech_org', doc[1]._.is_tech_org) # "is" is not
print('Entities', [(e.text, e.label_) for e in doc.ents]) # all orgs are entities
# Expected output:
# Pipeline ['tech_companies']
# Tokens ['Alphabet Inc.', 'is', 'the', 'company', 'behind', 'Google', '.']
# Doc has_tech_org True
# Token 0 is_tech_org True
# Token 1 is_tech_org False
# Entities [('Alphabet Inc.', 'ORG'), ('Google', 'ORG')]