spaCy/examples/training/train_intent_parser.py

#!/usr/bin/env python
# coding: utf-8
"""Using the parser to recognise your own semantics

spaCy's parser component can be trained to predict any type of tree
structure over your input text. You can also predict trees over whole documents
or chat logs, with connections between the sentence-roots used to annotate
discourse structure. In this example, we'll build a message parser for a common
"chat intent": finding local businesses. Our message semantics will have the
following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION
and PRODUCT.

"show me the best hotel in berlin"
('show', 'ROOT', 'show')
('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
('hotel', 'PLACE', 'show') --> show PLACE hotel
('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin

Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals, print_function

import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding


# Training examples. Every entry pairs a text with its annotations:
#   "heads" - for each token, the index of that token's head
#   "deps"  - for each token, the label of its relation to that head
# Tokens that take part in no relation get an arbitrary placeholder label;
# here we picked "-".
TRAIN_DATA = [
    (
        "find a cafe with great wifi",
        {
            "heads": [0, 2, 0, 5, 5, 2],
            "deps": ["ROOT", "-", "PLACE", "-", "QUALITY", "ATTRIBUTE"],
        },
    ),
    (
        "find a hotel near the beach",
        {
            "heads": [0, 2, 0, 5, 5, 2],
            "deps": ["ROOT", "-", "PLACE", "QUALITY", "-", "ATTRIBUTE"],
        },
    ),
    (
        # NOTE: nine annotations for eight whitespace-separated words --
        # presumably "that's" is split into two tokens by the tokenizer.
        "find me the closest gym that's open late",
        {
            "heads": [0, 0, 4, 4, 0, 6, 4, 6, 6],
            "deps": [
                "ROOT", "-", "-", "QUALITY", "PLACE",
                "-", "-", "ATTRIBUTE", "TIME",
            ],
        },
    ),
    (
        "show me the cheapest store that sells flowers",
        {
            # "flowers" is attached to "store" (index 4), not to "sells"
            "heads": [0, 0, 4, 4, 0, 4, 4, 4],
            "deps": [
                "ROOT", "-", "-", "QUALITY", "PLACE",
                "-", "-", "PRODUCT",
            ],
        },
    ),
    (
        "find a nice restaurant in london",
        {
            "heads": [0, 3, 3, 0, 3, 3],
            "deps": ["ROOT", "-", "QUALITY", "PLACE", "-", "LOCATION"],
        },
    ),
    (
        "show me the coolest hostel in berlin",
        {
            "heads": [0, 0, 4, 4, 0, 4, 4],
            "deps": ["ROOT", "-", "-", "QUALITY", "PLACE", "-", "LOCATION"],
        },
    ),
    (
        "find a good italian restaurant near work",
        {
            "heads": [0, 4, 4, 4, 0, 4, 5],
            "deps": [
                "ROOT", "-", "QUALITY", "ATTRIBUTE", "PLACE",
                "ATTRIBUTE", "LOCATION",
            ],
        },
    ),
]


@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser.

    model (unicode): Name of an existing model to load with spacy.load. If
        None, a blank 'en' model is created instead.
    output_dir (Path / unicode): Directory to save the trained model to. It
        is created, together with any missing parent directories, if it does
        not exist yet. If None, the model is not saved.
    n_iter (int): Number of passes over the training examples.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser, first=True)

    # register every relation label that occurs in the training data
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get('deps', []):
            parser.add_label(dep)

    # Shuffle a private copy during training so that the module-level
    # TRAIN_DATA keeps its original order for anyone else reading it.
    train_data = list(TRAIN_DATA)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print('Losses', losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True: a nested path such as "models/intent" must not
            # fail just because "models" does not exist yet
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)


def test_model(nlp):
    """Run a few sample queries through the pipeline and print the result.

    For every sample the text is printed first, followed by a list of
    (token text, relation label, head text) triples. Tokens labelled with
    the placeholder '-' are left out of that list.

    nlp: The pipeline to try out; its pipe() method is called with the texts.
    """
    queries = [
        "find a hotel with good wifi",
        "find me the cheapest gym near work",
        "show me the best hotel in berlin",
    ]
    for parsed in nlp.pipe(queries):
        print(parsed.text)
        relations = []
        for token in parsed:
            if token.dep_ == '-':
                continue  # placeholder label: token carries no relation
            relations.append((token.text, token.dep_, token.head.text))
        print(relations)


if __name__ == "__main__":
    # Script entry point: hand main over to plac, which calls it.
    plac.call(main)

    # Output to expect from the three sample queries:
    #
    # find a hotel with good wifi
    # [
    #   ('find', 'ROOT', 'find'),
    #   ('hotel', 'PLACE', 'find'),
    #   ('good', 'QUALITY', 'wifi'),
    #   ('wifi', 'ATTRIBUTE', 'hotel')
    # ]
    #
    # find me the cheapest gym near work
    # [
    #   ('find', 'ROOT', 'find'),
    #   ('cheapest', 'QUALITY', 'gym'),
    #   ('gym', 'PLACE', 'find'),
    #   ('near', 'ATTRIBUTE', 'gym'),
    #   ('work', 'LOCATION', 'near')
    # ]
    #
    # show me the best hotel in berlin
    # [
    #   ('show', 'ROOT', 'show'),
    #   ('best', 'QUALITY', 'hotel'),
    #   ('hotel', 'PLACE', 'show'),
    #   ('berlin', 'LOCATION', 'hotel')
    # ]
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								#!/usr/bin/env python
 								# coding: utf-8
-												Update intent parser docs and add to usage docs

											
										
										
											2017-10-27 02:49:05 +00:00
+								"""Using the parser to recognise your own semantics
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
-												Update intent parser docs and add to usage docs

											
										
										
											2017-10-27 02:49:05 +00:00
+								spaCy's parser component can be used to trained to predict any type of tree
 								structure over your input text. You can also predict trees over whole documents
 								or chat logs, with connections between the sentence-roots used to annotate
 								discourse structure. In this example, we'll build a message parser for a common
 								"chat intent": finding local businesses. Our message semantics will have the
 								following types of relations: ROOT, PLACE, QUALITY, ATTRIBUTE, TIME, LOCATION.
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
 								"show me the best hotel in berlin"
 								('show', 'ROOT', 'show')
 								('best', 'QUALITY', 'hotel') --> hotel with QUALITY best
 								('hotel', 'PLACE', 'show') --> show PLACE hotel
 								('berlin', 'LOCATION', 'hotel') --> hotel with LOCATION berlin
-												Update training examples to use "simple style"

											
										
										
											2017-11-06 22:14:04 +00:00
-												Update examples

											
										
										
											2017-11-07 00:22:30 +00:00
+								Compatible with: spaCy v2.0.0+
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								"""
 								from __future__ import unicode_literals, print_function
 								import plac
 								import random
 								from pathlib import Path
-												💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-10-09 23:40:29 +00:00
+								import spacy
 								from spacy.util import minibatch, compounding
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
-												Update training examples to use "simple style"

											
										
										
											2017-11-06 22:14:04 +00:00
+								# training data: texts, heads and dependency labels
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								# for no relation, we simply chose an arbitrary dependency label, e.g. '-'
 								TRAIN_DATA = [
-												Update training examples to use "simple style"

											
										
										
											2017-11-06 22:14:04 +00:00
+								    ("find a cafe with great wifi", {
 								        'heads': [0, 2, 0, 5, 5, 2],  # index of token head
 								        'deps': ['ROOT', '-', 'PLACE', '-', 'QUALITY', 'ATTRIBUTE']
 								    }),
 								    ("find a hotel near the beach", {
 								        'heads': [0, 2, 0, 5, 5, 2],
 								        'deps': ['ROOT', '-', 'PLACE', 'QUALITY', '-', 'ATTRIBUTE']
 								    }),
 								    ("find me the closest gym that's open late", {
 								        'heads': [0, 0, 4, 4, 0, 6, 4, 6, 6],
 								        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'ATTRIBUTE', 'TIME']
 								    }),
 								    ("show me the cheapest store that sells flowers", {
 								        'heads': [0, 0, 4, 4, 0, 4, 4, 4],  # attach "flowers" to store!
 								        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', '-', 'PRODUCT']
 								    }),
 								    ("find a nice restaurant in london", {
 								        'heads': [0, 3, 3, 0, 3, 3],
 								        'deps': ['ROOT', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
 								    }),
 								    ("show me the coolest hostel in berlin", {
 								        'heads': [0, 0, 4, 4, 0, 4, 4],
 								        'deps': ['ROOT', '-', '-', 'QUALITY', 'PLACE', '-', 'LOCATION']
 								    }),
 								    ("find a good italian restaurant near work", {
 								        'heads': [0, 4, 4, 4, 0, 4, 5],
 								        'deps': ['ROOT', '-', 'QUALITY', 'ATTRIBUTE', 'PLACE', 'ATTRIBUTE', 'LOCATION']
 								    })
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								]
 								@plac.annotations(
 								    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
 								    output_dir=("Optional output directory", "option", "o", Path),
 								    n_iter=("Number of training iterations", "option", "n", int))
-												💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-10-09 23:40:29 +00:00
+								def main(model=None, output_dir=None, n_iter=15):
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								    """Load the model, set up the pipeline and train the parser."""
 								    if model is not None:
 								        nlp = spacy.load(model)  # load existing spaCy model
 								        print("Loaded model '%s'" % model)
 								    else:
 								        nlp = spacy.blank('en')  # create blank Language class
 								        print("Created blank 'en' model")
-												Don't rename component in intent parser example (resolves #1551)

Otherwise, the default saved model won't know that it's supposed to create spaCy's 'parser'.

											
										
										
											2017-11-10 22:35:38 +00:00
+								    # We'll use the built-in dependency parser class, but we want to create a
 								    # fresh instance – just in case.
-												Update intent parser example

											
										
										
											2017-11-06 22:31:11 +00:00
+								    if 'parser' in nlp.pipe_names:
 								        nlp.remove_pipe('parser')
 								    parser = nlp.create_pipe('parser')
-												Don't rename component in intent parser example (resolves #1551)

Otherwise, the default saved model won't know that it's supposed to create spaCy's 'parser'.

											
										
										
											2017-11-10 22:35:38 +00:00
+								    nlp.add_pipe(parser, first=True)
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
-												Update training examples to use "simple style"

											
										
										
											2017-11-06 22:14:04 +00:00
+								    for text, annotations in TRAIN_DATA:
 								        for dep in annotations.get('deps', []):
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								            parser.add_label(dep)
-												Don't rename component in intent parser example (resolves #1551)

Otherwise, the default saved model won't know that it's supposed to create spaCy's 'parser'.

											
										
										
											2017-11-10 22:35:38 +00:00
+								    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								    with nlp.disable_pipes(*other_pipes):  # only train parser
-												Fix begin_training if get_gold_tuples is None

											
										
										
											2017-11-01 12:14:31 +00:00
+								        optimizer = nlp.begin_training()
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								        for itn in range(n_iter):
 								            random.shuffle(TRAIN_DATA)
 								            losses = {}
-												💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-10-09 23:40:29 +00:00
+								            # batch up the examples using spaCy's minibatch
 								            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
 								            for batch in batches:
 								                texts, annotations = zip(*batch)
 								                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
 								            print('Losses', losses)
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
 								    # test the trained model
 								    test_model(nlp)
 								    # save model to output directory
 								    if output_dir is not None:
 								        output_dir = Path(output_dir)
 								        if not output_dir.exists():
 								            output_dir.mkdir()
 								        nlp.to_disk(output_dir)
 								        print("Saved model to", output_dir)
 								        # test the saved model
 								        print("Loading from", output_dir)
 								        nlp2 = spacy.load(output_dir)
 								        test_model(nlp2)
 								def test_model(nlp):
 								    texts = ["find a hotel with good wifi",
 								             "find me the cheapest gym near work",
 								             "show me the best hotel in berlin"]
 								    docs = nlp.pipe(texts)
 								    for doc in docs:
 								        print(doc.text)
 								        print([(t.text, t.dep_, t.head.text) for t in doc if t.dep_ != '-'])
 								if __name__ == '__main__':
 								    plac.call(main)
 								    # Expected output:
 								    # find a hotel with good wifi
 								    # [
 								    #   ('find', 'ROOT', 'find'),
 								    #   ('hotel', 'PLACE', 'find'),
 								    #   ('good', 'QUALITY', 'wifi'),
 								    #   ('wifi', 'ATTRIBUTE', 'hotel')
 								    # ]
 								    # find me the cheapest gym near work
 								    # [
 								    #   ('find', 'ROOT', 'find'),
 								    #   ('cheapest', 'QUALITY', 'gym'),
-												💫 Update training examples and use minibatching (#2830)

<!--- Provide a general summary of your changes in the title. -->

## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.

### Types of change
enhancements

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-10-09 23:40:29 +00:00
+								    #   ('gym', 'PLACE', 'find'),
 								    #   ('near', 'ATTRIBUTE', 'gym'),
-												Update training examples to use "simple style"

											
										
										
											2017-11-06 22:14:04 +00:00
+								    #   ('work', 'LOCATION', 'near')
-												Add example for custom intent parser

											
										
										
											2017-10-27 01:55:11 +00:00
+								    # ]
 								    # show me the best hotel in berlin
 								    # [
 								    #   ('show', 'ROOT', 'show'),
 								    #   ('best', 'QUALITY', 'hotel'),
 								    #   ('hotel', 'PLACE', 'show'),
 								    #   ('berlin', 'LOCATION', 'hotel')
 								    # ]