spaCy/examples/pipeline/custom_component_countries_...

#!/usr/bin/env python
# coding: utf8
"""Example of a spaCy v2.0 pipeline component that requests all countries via
the REST Countries API, merges country names into one token, assigns entity
labels and sets attributes on country tokens, e.g. the capital and lat/lng
coordinates. Can be extended with more details from the API.

* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
* Custom pipeline components: https://spacy.io/usage/processing-pipelines#custom-components

Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
Prerequisites: pip install requests
"""
from __future__ import unicode_literals, print_function

import requests
import plac
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token


def main():
    """Run the demo: build a blank English pipeline containing only the REST
    Countries component, process a sample sentence and print what was found.
    """
    # A blank English Language class is enough here: no statistical model and
    # no pre-defined pipeline components are loaded.
    nlp = English()
    component = RESTCountriesComponent(nlp)  # fetches the country data
    nlp.add_pipe(component)  # register the component in the pipeline

    doc = nlp("Some text about Colombia and the Czech Republic")

    # The component's name shows up in the list of pipeline names
    print("Pipeline", nlp.pipe_names)
    # Doc-level flag computed from the token-level 'is_country' attribute
    print("Doc has countries", doc._.has_country)

    # Print the custom attributes of every token marked as a country
    country_tokens = (tok for tok in doc if tok._.is_country)
    for tok in country_tokens:
        ext = tok._
        print(tok.text, ext.country_capital, ext.country_latlng, ext.country_flag)

    # Text and label of every entity set on the Doc
    entity_pairs = [(ent.text, ent.label_) for ent in doc.ents]
    print("Entities", entity_pairs)


class RESTCountriesComponent(object):
    """spaCy v2.0 pipeline component that requests all countries via
    the REST Countries API, merges country names into one token, assigns entity
    labels and sets attributes on country tokens.

    The country data is requested once, when the component is created. When
    the component is applied to a Doc, country names are found with a
    PhraseMatcher; overlapping matches are reduced to the longest ones so that
    no token ends up in two entities.
    """

    name = "rest_countries"  # component name, will show up in the pipeline

    def __init__(self, nlp, label="GPE", timeout=10):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.

        nlp (Language): The shared nlp object.
        label (unicode): Entity label assigned to matched country names.
        timeout (float): Seconds to wait for the API before requests gives up
            and raises, instead of blocking forever on a dead connection.

        Raises whatever requests raises on connection problems, timeouts or a
        non-success HTTP status.
        """
        # Make request once on initialisation and store the data
        r = requests.get("https://restcountries.eu/rest/v2/all", timeout=timeout)
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c["name"]: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name.
        # Only tokenization is needed for the patterns, so use make_doc rather
        # than running every name through the full pipeline.
        patterns = [nlp.make_doc(country_name) for country_name in self.countries]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add("COUNTRIES", None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension("is_country", default=False)
        Token.set_extension("country_capital", default=False)
        Token.set_extension("country_latlng", default=False)
        Token.set_extension("country_flag", default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension("has_country", getter=self.has_country)
        Span.set_extension("has_country", getter=self.has_country)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.

        doc (Doc): The Doc to annotate.
        RETURNS (Doc): The same Doc, with country entities added to doc.ents,
            custom attributes set on the country tokens and each country name
            merged into a single token.
        """
        # Shorter names contained in longer ones (e.g. "Guinea" inside
        # "Papua New Guinea") would produce overlapping entities, so keep only
        # the longest non-overlapping matches.
        matches = self._longest_matches(self.matcher(doc))
        spans = []  # keep the spans for later so we can merge them afterwards
        for start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Look up the API record once per entity instead of once per
            # attribute and token
            country = self.countries[entity.text]
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, calling code etc.
            for token in entity:
                token._.set("is_country", True)
                token._.set("country_capital", country["capital"])
                token._.set("country_latlng", country["latlng"])
                token._.set("country_flag", country["flag"])
        if spans:
            # Extend doc.ents with the new entities - be careful not to
            # replace the ones that are already there!
            doc.ents = list(doc.ents) + spans
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities - otherwise, it would cause mismatched
            # indices!
            # NOTE(review): Span.merge appears to be deprecated in favour of
            # Doc.retokenize from spaCy v2.1 on; kept here so the example
            # stays compatible with v2.0 - confirm before upgrading.
            span.merge()
        return doc  # don't forget to return the Doc!

    @staticmethod
    def _longest_matches(matches):
        """Reduce matcher output to non-overlapping token ranges.

        matches (iterable): (match_id, start, end) triples.
        RETURNS (list): (start, end) pairs sorted by start. Where matches
            overlap, the longest one wins; ties go to the earliest start.
        """
        # Deduplicate, then order by descending length and ascending start
        candidates = sorted(
            set((start, end) for _, start, end in matches),
            key=lambda pair: (pair[0] - pair[1], pair[0]),
        )
        claimed = set()  # token indices already covered by a kept match
        kept = []
        for start, end in candidates:
            if any(i in claimed for i in range(start, end)):
                continue
            claimed.update(range(start, end))
            kept.append((start, end))
        return sorted(kept)

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step.

        tokens (Doc / Span): The object whose tokens are checked.
        RETURNS (bool): Whether at least one token has is_country set.
        """
        return any(t._.get("is_country") for t in tokens)


# Run the demo only when the file is executed as a script, not on import.
if __name__ == "__main__":
    # main() takes no arguments; it is invoked through plac like the other
    # example scripts.
    plac.call(main)

    # Expected output (the component fetches its data from the REST Countries
    # API on start-up, so network access is required and the printed values
    # reflect whatever the API returns):
    # Pipeline ['rest_countries']
    # Doc has countries True
    # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
    # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
    # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
+								#!/usr/bin/env python
 								# coding: utf8
 								"""Example of a spaCy v2.0 pipeline component that requests all countries via
 								the REST Countries API, merges country names into one token, assigns entity
 								labels and sets attributes on country tokens, e.g. the capital and lat/lng
 								coordinates. Can be extended with more details from the API.
 								* REST Countries API: https://restcountries.eu (Mozilla Public License MPL 2.0)
-												Get docs ready for v2.0.0

											
										
										
											2017-11-07 11:00:43 +00:00
+								* Custom pipeline components: https://spacy.io//usage/processing-pipelines#custom-components
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
-												Update examples

											
										
										
											2017-11-07 00:22:30 +00:00
+								Compatible with: spaCy v2.0.0+
-												Test and update examples [ci skip]

											
										
										
											2019-03-16 13:15:49 +00:00
+								Last tested with: v2.1.0
-												Update example with note to install requests

											
										
										
											2018-03-28 10:46:27 +00:00
+								Prerequisites: pip install requests
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
+								"""
-												Fix examples

											
										
										
											2017-10-27 01:55:04 +00:00
+								from __future__ import unicode_literals, print_function
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
 								import requests
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
+								import plac
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
+								from spacy.lang.en import English
 								from spacy.matcher import PhraseMatcher
-												Fix consistency of imports from spacy.tokens in examples

											
										
										
											2017-10-11 00:30:40 +00:00
+								from spacy.tokens import Doc, Span, Token
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
+								def main():
 								    # For simplicity, we start off with only the blank English Language class
 								    # and no model or pre-defined pipeline loaded.
 								    nlp = English()
 								    rest_countries = RESTCountriesComponent(nlp)  # initialise component
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								    nlp.add_pipe(rest_countries)  # add it to the pipeline
 								    doc = nlp("Some text about Colombia and the Czech Republic")
 								    print("Pipeline", nlp.pipe_names)  # pipeline contains component name
 								    print("Doc has countries", doc._.has_country)  # Doc contains countries
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
+								    for token in doc:
 								        if token._.is_country:
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								            print(
 								                token.text,
 								                token._.country_capital,
 								                token._.country_latlng,
 								                token._.country_flag,
 								            )  # country data
 								    print("Entities", [(e.text, e.label_) for e in doc.ents])  # entities
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
+								class RESTCountriesComponent(object):
 								    """spaCy v2.0 pipeline component that requests all countries via
 								    the REST Countries API, merges country names into one token, assigns entity
 								    labels and sets attributes on country tokens.
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
+								    """
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								    name = "rest_countries"  # component name, will show up in the pipeline
 								    def __init__(self, nlp, label="GPE"):
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
+								        """Initialise the pipeline component. The shared nlp instance is used
 								        to initialise the matcher with the shared vocab, get the label ID and
 								        generate Doc objects as phrase match patterns.
 								        """
 								        # Make request once on initialisation and store the data
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								        r = requests.get("https://restcountries.eu/rest/v2/all")
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
+								        r.raise_for_status()  # make sure requests raises an error if it fails
 								        countries = r.json()
 								        # Convert API response to dict keyed by country name for easy lookup
 								        # This could also be extended using the alternative and foreign language
 								        # names provided by the API
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								        self.countries = {c["name"]: c for c in countries}
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
+								        self.label = nlp.vocab.strings[label]  # get entity label ID
 								        # Set up the PhraseMatcher with Doc patterns for each country name
 								        patterns = [nlp(c) for c in self.countries.keys()]
 								        self.matcher = PhraseMatcher(nlp.vocab)
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								        self.matcher.add("COUNTRIES", None, *patterns)
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
 								        # Register attribute on the Token. We'll be overwriting this based on
 								        # the matches, so we're only setting a default value, not a getter.
 								        # If no default value is set, it defaults to None.
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								        Token.set_extension("is_country", default=False)
 								        Token.set_extension("country_capital", default=False)
 								        Token.set_extension("country_latlng", default=False)
 								        Token.set_extension("country_flag", default=False)
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
 								        # Register attributes on Doc and Span via a getter that checks if one of
 								        # the contained tokens is set to is_country == True.
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								        Doc.set_extension("has_country", getter=self.has_country)
 								        Span.set_extension("has_country", getter=self.has_country)
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
 								    def __call__(self, doc):
 								        """Apply the pipeline component on a Doc object and modify it if matches
 								        are found. Return the Doc, so it can be processed by the next component
 								        in the pipeline, if available.
 								        """
 								        matches = self.matcher(doc)
 								        spans = []  # keep the spans for later so we can merge them afterwards
 								        for _, start, end in matches:
 								            # Generate Span representing the entity & set label
 								            entity = Span(doc, start, end, label=self.label)
 								            spans.append(entity)
 								            # Set custom attribute on each token of the entity
 								            # Can be extended with other data returned by the API, like
 								            # currencies, country code, flag, calling code etc.
 								            for token in entity:
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								                token._.set("is_country", True)
 								                token._.set("country_capital", self.countries[entity.text]["capital"])
 								                token._.set("country_latlng", self.countries[entity.text]["latlng"])
 								                token._.set("country_flag", self.countries[entity.text]["flag"])
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
+								            # Overwrite doc.ents and add entity – be careful not to replace!
 								            doc.ents = list(doc.ents) + [entity]
 								        for span in spans:
 								            # Iterate over all spans and merge them into one token. This is done
 								            # after setting the entities – otherwise, it would cause mismatched
 								            # indices!
 								            span.merge()
 								        return doc  # don't forget to return the Doc!
 								    def has_country(self, tokens):
 								        """Getter for Doc and Span attributes. Returns True if one of the tokens
 								        is a country. Since the getter is only called when we access the
 								        attribute, we can refer to the Token's 'is_country' attribute here,
 								        which is already set in the processing step."""
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								        return any([t._.get("is_country") for t in tokens])
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
-												Auto-format examples

											
										
										
											2018-12-02 03:26:26 +00:00
+								if __name__ == "__main__":
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
+								    plac.call(main)
-												Add pipeline component examples

											
										
										
											2017-10-10 02:26:06 +00:00
-												Update pipeline component examples to use plac

											
										
										
											2017-10-27 00:58:14 +00:00
+								    # Expected output:
 								    # Pipeline ['rest_countries']
 								    # Doc has countries True
 								    # Colombia Bogotá [4.0, -72.0] https://restcountries.eu/data/col.svg
 								    # Czech Republic Prague [49.75, 15.5] https://restcountries.eu/data/cze.svg
 								    # Entities [('Colombia', 'GPE'), ('Czech Republic', 'GPE')]