Add test for Issue #999

2017-04-23 17:06:30 +02:00 · 2017-04-23 17:06:30 +02:00 · 5d8af40445
parent 4d2a659c52
commit 5d8af40445
1 changed files with 75 additions and 0 deletions
--- a/spacy/tests/regression/test_issue999.py
+++ b/spacy/tests/regression/test_issue999.py
@ -0,0 +1,75 @@
+from __future__ import unicode_literals
+import json
+import os
+import random
+import contextlib
+import shutil
+import pytest
+import tempfile
+from pathlib import Path
+
+
+import pathlib
+from ...gold import GoldParse
+from ...pipeline import EntityRecognizer
+from ...en import English
+
+try:
+    unicode
+except NameError:
+    unicode = str
+
+
+@pytest.fixture
+def train_data():
+    return [
+            ["hey",[]],
+            ["howdy",[]],
+            ["hey there",[]],
+            ["hello",[]],
+            ["hi",[]],
+            ["i'm looking for a place to eat",[]],
+            ["i'm looking for a place in the north of town",[[31,36,"location"]]],
+            ["show me chinese restaurants",[[8,15,"cuisine"]]],
+            ["show me chines restaurants",[[8,14,"cuisine"]]],
+    ]
+
+
+@contextlib.contextmanager
+def temp_save_model(model):
+    model_dir = Path(tempfile.mkdtemp())
+    model.save_to_directory(model_dir)
+    yield model_dir
+    shutil.rmtree(model_dir.as_posix())
+
+
+def test_issue999(train_data):
+    '''Test that adding entities and resuming training works passably OK.
+    There are two issues here:
+
+    1) We have to readd labels. This isn't very nice.
+    2) There's no way to set the learning rate for the weight update, so we
+        end up out-of-scale, causing it to learn too fast.
+    '''
+    nlp = English(entity=False)
+    nlp.entity = EntityRecognizer(nlp.vocab, features=English.Defaults.entity_features)
+    for _, offsets in train_data:
+        for start, end, ent_type in offsets:
+            nlp.entity.add_label(ent_type)
+    for itn in range(10):
+        random.shuffle(train_data)
+        for raw_text, entity_offsets in train_data:
+            doc = nlp.make_doc(raw_text)
+            gold = GoldParse(doc, entities=entity_offsets)
+            loss = nlp.entity.update(doc, gold)
+
+    with temp_save_model(nlp) as model_dir:
+        nlp2 = English(path=model_dir)
+
+    for raw_text, entity_offsets in train_data:
+        doc = nlp2(raw_text)
+        ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents}
+        for start, end, label in entity_offsets:
+            if (start, end) not in ents:
+                print(ents)
+            assert ents[(start, end)] == label