From 62237755a40efa8f8f5009af4f03d9fe9e4162a2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 13:40:17 +0200
Subject: [PATCH 1/9] Import shutil

---
 spacy/cli/train.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 4cad2cae1..e5157bc68 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer
 import json
+import shutil
 
 from ._messages import Messages
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG

From 24dfbb8a2853205c5fa27ac0657119a8c2699e39 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 14:35:24 +0200
Subject: [PATCH 2/9] Fix model collation

---
 spacy/cli/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index e5157bc68..b4ea41114 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -203,7 +203,7 @@ def _collate_best_model(meta, output_path, components):
     best_dest = output_path / 'model-best'
     shutil.copytree(output_path / 'model-final', best_dest)
     for component, best_component_src in bests.items():
-        shutil.rmtree(best_dir / component)
+        shutil.rmtree(best_dest / component)
         shutil.copytree(best_component_src, best_dest / component)
         with (best_component_src / 'accuracy.json').open() as file_:
             accs = json.load(file_)

From c4698f5712fe4b20a0156ff8e288ab82a4ef475e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 16:36:42 +0200
Subject: [PATCH 3/9] Don't collate model unless training succeeds

---
 spacy/cli/train.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b4ea41114..681b8de61 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -187,14 +187,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
         with nlp.use_params(optimizer.averages):
             final_model_path = output_path / 'model-final'
             nlp.to_disk(final_model_path)
-        components = []
-        if not no_parser:
-            components.append('parser')
-        if not no_tagger:
-            components.append('tagger')
-        if not no_entities:
-            components.append('ner')
-        _collate_best_model(meta, output_path, components)
+    components = []
+    if not no_parser:
+        components.append('parser')
+    if not no_tagger:
+        components.append('tagger')
+    if not no_entities:
+        components.append('ner')
+    _collate_best_model(meta, output_path, components)
+
 
 def _collate_best_model(meta, output_path, components):
     bests = {}

From 664f89327ad03b7db0f2285601c9eae155d7df52 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 17:58:45 +0200
Subject: [PATCH 4/9] Fix init-model if no vectors provided

---
 spacy/cli/init_model.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 87c3033ad..c83b95782 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -90,11 +90,12 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
             lexeme.cluster = 0
         lex_added += 1
     nlp.vocab.cfg.update({'oov_prob': oov_prob})
-    for word in vector_keys:
-        if word not in nlp.vocab:
-            lexeme = nlp.vocab[word]
-            lexeme.is_oov = False
-            lex_added += 1
+    if vector_keys is not None:
+        for word in vector_keys:
+            if word not in nlp.vocab:
+                lexeme = nlp.vocab[word]
+                lexeme.is_oov = False
+                lex_added += 1
     if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:

From 69c900f0032b03dc00e8393acd57768011d4d379 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 18:26:02 +0200
Subject: [PATCH 5/9] Fix init-model if no vectors provided

---
 spacy/cli/init_model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index c83b95782..bad63209e 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -96,7 +96,7 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
                 lexeme = nlp.vocab[word]
                 lexeme.is_oov = False
                 lex_added += 1
-    if len(vectors_data):
+    if vectors_data:
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)

From 3aabf621a3d8bdc55789717b052c69d6ddab1225 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 22:00:51 +0200
Subject: [PATCH 6/9] Fix handling of unknown tags in tagger update

---
 spacy/pipeline.pyx | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 477c9d6e2..7bc25c4b2 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -505,8 +505,10 @@ class Tagger(Pipe):
             for tag in gold.tags:
                 if tag is None:
                     correct[idx] = guesses[idx]
-                else:
+                elif tag in tag_index:
                     correct[idx] = tag_index[tag]
+                else:
+                    correct[idx] = len(tag_index)+1
                 idx += 1
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])

From 5b56aad4c263d04ae64aac5e69a3240c3f040da5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 22:24:54 +0200
Subject: [PATCH 7/9] Fix handling of unseen labels in tagger

---
 spacy/pipeline.pyx | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 7bc25c4b2..d647cc8dc 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -501,6 +501,7 @@ class Tagger(Pipe):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
+        known_labels = numpy.ones((scores.shape[0],), dtype='f')
         for gold in golds:
             for tag in gold.tags:
                 if tag is None:
@@ -508,10 +509,12 @@ class Tagger(Pipe):
                 elif tag in tag_index:
                     correct[idx] = tag_index[tag]
                 else:
-                    correct[idx] = len(tag_index)+1
+                    correct[idx] = 0
+                    known_labels[idx] = 0.
                 idx += 1
         correct = self.model.ops.xp.array(correct, dtype='i')
         d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+        d_scores *= known_labels
         loss = (d_scores**2).sum()
         d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
         return float(loss), d_scores

From 5a65418c40979766832917400b99f6455a3df57e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 22:28:59 +0200
Subject: [PATCH 8/9] Fix handling of unseen labels in tagger

---
 spacy/pipeline.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index d647cc8dc..e913b2647 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -501,7 +501,7 @@ class Tagger(Pipe):
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
-        known_labels = numpy.ones((scores.shape[0],), dtype='f')
+        known_labels = numpy.ones((scores.shape[0], 1), dtype='f')
         for gold in golds:
             for tag in gold.tags:
                 if tag is None:

From c83fccfe2ae6c37f8953c96b70e30129ff39dcc2 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Mon, 25 Jun 2018 23:05:56 +0200
Subject: [PATCH 9/9] Fix output of best model

---
 spacy/cli/train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 681b8de61..a9c332fe3 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -205,7 +205,7 @@ def _collate_best_model(meta, output_path, components):
     shutil.copytree(output_path / 'model-final', best_dest)
     for component, best_component_src in bests.items():
         shutil.rmtree(best_dest / component)
-        shutil.copytree(best_component_src, best_dest / component)
+        shutil.copytree(best_component_src / component, best_dest / component)
         with (best_component_src / 'accuracy.json').open() as file_:
             accs = json.load(file_)
         for metric in _get_metrics(component):