Update train_new_entity_type example to use disable_pipes

2017-10-25 14:56:53 +02:00 · 2017-10-25 14:56:53 +02:00 · 615c315d70
parent 6a00de4f77
commit 615c315d70
1 changed files with 96 additions and 78 deletions
--- a/examples/training/train_new_entity_type.py
+++ b/examples/training/train_new_entity_type.py
@ -21,26 +21,115 @@ After training your model, you can save it to a directory. We recommend
 wrapping models as Python packages, for ease of deployment.

 For more details, see the documentation:
-* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner
-* Saving and loading models: https://spacy.io/docs/usage/saving-loading
+* Training: https://alpha.spacy.io/usage/training
+* NER: https://alpha.spacy.io/usage/linguistic-features#named-entities

-Developed for: spaCy 1.7.6
-Last updated for: spaCy 2.0.0a13
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function

 import random
 from pathlib import Path
-import random

 import spacy
 from spacy.gold import GoldParse, minibatch
 from spacy.pipeline import NeuralEntityRecognizer
-from spacy.pipeline import TokenVectorEncoder
+
+
+# new entity label
+LABEL = 'ANIMAL'
+
+# training data
+TRAIN_DATA = [
+    ("Horses are too tall and they pretend to care about your feelings",
+     [(0, 6, 'ANIMAL')]),
+
+    ("Do they bite?", []),
+
+    ("horses are too tall and they pretend to care about your feelings",
+     [(0, 6, 'ANIMAL')]),
+
+    ("horses pretend to care about your feelings", [(0, 6, 'ANIMAL')]),
+
+    ("they pretend to care about your feelings, those horses",
+     [(48, 54, 'ANIMAL')]),
+
+    ("horses?", [(0, 6, 'ANIMAL')])
+]
+
+
+def main(model=None, new_model_name='animal', output_dir=None):
+    """Set up the pipeline and entity recognizer, and train the new entity.
+
+    model (unicode): Model name to start off with. If None, a blank English
+        Language class is created.
+    new_model_name (unicode): Name of new model to create. Will be added to the
+        model meta and prefixed by the language code, e.g. 'en_animal'.
+    output_dir (unicode / Path): Optional output directory. If None, no model
+        will be saved.
+    """
+    if model is not None:
+        nlp = spacy.load(model)  # load existing spaCy model
+        print("Loaded model '%s'" % model)
+    else:
+        nlp = spacy.blank('en')  # create blank Language class
+        print("Created blank 'en' model")
+
+    # Add entity recognizer to model if it's not in the pipeline
+    if 'ner' not in nlp.pipe_names:
+        nlp.add_pipe(NeuralEntityRecognizer(nlp.vocab))
+
+    ner = nlp.get_pipe('ner')  # get entity recognizer
+    ner.add_label(LABEL)   # add new entity label to entity recognizer
+
+    # get names of other pipes to disable them during training
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
+    with nlp.disable_pipes(*other_pipes) as disabled:  # only train NER
+        random.seed(0)
+        optimizer = nlp.begin_training(lambda: [])
+        for itn in range(50):
+            losses = {}
+            gold_parses = get_gold_parses(nlp.make_doc, TRAIN_DATA)
+            for batch in minibatch(gold_parses, size=3):
+                docs, golds = zip(*batch)
+                nlp.update(docs, golds, losses=losses, sgd=optimizer,
+                           drop=0.35)
+            print(losses)
+        print(nlp.pipeline)
+        print(disabled.original_pipeline)
+
+    # test the trained model
+    test_text = 'Do you like horses?'
+    doc = nlp(test_text)
+    print("Entities in '%s'" % test_text)
+    for ent in doc.ents:
+        print(ent.label_, ent.text)
+
+    # save model to output directory
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.meta['name'] = new_model_name  # rename model
+        nlp.to_disk(output_dir)
+        print("Saved model to", output_dir)
+
+        # test the saved model
+        print("Loading from", output_dir)
+        nlp2 = spacy.load(output_dir)
+        doc2 = nlp2(test_text)
+        for ent in doc2.ents:
+            print(ent.label_, ent.text)


 def get_gold_parses(tokenizer, train_data):
-    '''Shuffle and create GoldParse objects'''
+    """Shuffle and create GoldParse objects.
+
+    tokenizer (Tokenizer): Tokenizer to processs the raw text.
+    train_data (list): The training data.
+    YIELDS (tuple): (doc, gold) tuples.
+    """
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = tokenizer(raw_text)
@ -48,77 +137,6 @@ def get_gold_parses(tokenizer, train_data):
        yield doc, gold


-def train_ner(nlp, train_data, output_dir):
-    random.seed(0)
-    optimizer = nlp.begin_training(lambda: [])
-    nlp.meta['name'] = 'en_ent_animal'
-    for itn in range(50):
-        losses = {}
-        for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
-            docs, golds = zip(*batch)
-            nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35)
-        print(losses)
-    if not output_dir:
-        return
-    elif not output_dir.exists():
-        output_dir.mkdir()
-    nlp.to_disk(output_dir)
-
-
-def main(model_name, output_directory=None):
-    print("Creating initial model", model_name)
-    nlp = spacy.blank(model_name)
-    if output_directory is not None:
-        output_directory = Path(output_directory)
-
-    train_data = [
-        (
-            "Horses are too tall and they pretend to care about your feelings",
-            [(0, 6, 'ANIMAL')],
-        ),
-        (
-            "Do they bite?", 
-            [],
-        ),
- 
-        (
-            "horses are too tall and they pretend to care about your feelings",
-            [(0, 6, 'ANIMAL')]
-        ),
-        (
-            "horses pretend to care about your feelings",
-            [(0, 6, 'ANIMAL')]
-        ),
-        (
-            "they pretend to care about your feelings, those horses",
-            [(48, 54, 'ANIMAL')]
-        ),
-        (
-            "horses?",
-            [(0, 6, 'ANIMAL')]
-        )
-
-    ]
-    nlp.add_pipe(TokenVectorEncoder(nlp.vocab))
-    ner = NeuralEntityRecognizer(nlp.vocab)
-    ner.add_label('ANIMAL')
-    nlp.add_pipe(ner)
-    train_ner(nlp, train_data, output_directory)
-
-    # Test that the entity is recognized
-    text = 'Do you like horses?'
-    print("Ents in 'Do you like horses?':")
-    doc = nlp(text)
-    for ent in doc.ents:
-        print(ent.label_, ent.text)
-    if output_directory:
-        print("Loading from", output_directory)
-        nlp2 = spacy.load(output_directory)
-        doc2 = nlp2('Do you like horses?')
-        for ent in doc2.ents:
-            print(ent.label_, ent.text)
-
-
 if __name__ == '__main__':
    import plac
    plac.call(main)