From e237472cdcc32276d042ce56ed0ce3cef560b37e Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 21:25:33 +0100
Subject: [PATCH 1/4] Fix tag and filename conversion for conllu

---
 spacy/cli/converters/conllu2json.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 4d3fb58e4..4dc789010 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -28,7 +28,7 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
             sentences = []
 
     output_filename = input_path.parts[-1].replace(".conllu", ".json")
-    output_filename = input_path.parts[-1].replace(".conll", ".json")
+    output_filename = output_filename.parts[-1].replace(".conll", ".json")
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
@@ -55,6 +55,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
                     id_ = int(id_) - 1
                     head = (int(head) - 1) if head != '0' else id_
                     dep = 'ROOT' if dep == 'root' else dep
+                    tag = pos if tag == '_' else tag
                     tag = tag+'__'+morph  if use_morphology else tag
                     tokens.append((id_, word, tag, head, dep, 'O'))
                 except:

From eca41f0cf6c8773813fc7e73096881d3eef38850 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 21:26:49 +0100
Subject: [PATCH 2/4] Fix filename conversion for conllu

---
 spacy/cli/converters/conllu2json.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 4dc789010..854b4f204 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -27,8 +27,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
             docs.append(doc)
             sentences = []
 
+    output_filename = input_path.parts[-1].replace(".conll", ".json")
     output_filename = input_path.parts[-1].replace(".conllu", ".json")
-    output_filename = output_filename.parts[-1].replace(".conll", ".json")
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))

From e033162a1df3d843619d7ee93543a3cd6bb301e4 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 21:49:08 +0100
Subject: [PATCH 3/4] Update tagger training example

---
 examples/training/train_tagger.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index 95b9efcbf..161f7910c 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -18,7 +18,6 @@ import random
 from pathlib import Path
 
 import spacy
-from spacy.util import get_lang_class
 from spacy.tokens import Doc
 from spacy.gold import GoldParse
 
@@ -52,13 +51,13 @@ def main(lang='en', output_dir=None, n_iter=25):
     train the tagger with a custom tag map, we're creating a new Language
     instance with a custom vocab.
     """
-    lang_cls = get_lang_class(lang)  # get Language class
-    lang_cls.Defaults.tag_map.update(TAG_MAP)  # add tag map to defaults
-    nlp = lang_cls()  # initialise Language class
-
+    nlp = spacy.blank(lang)
     # add the tagger to the pipeline
     # nlp.create_pipe works for built-ins that are registered with spaCy
     tagger = nlp.create_pipe('tagger')
+    # Add the tags. This needs to be done before you start training.
+    for tag, values in TAG_MAP.items():
+        tagger.add_label(tag, values)
     nlp.add_pipe(tagger)
 
     optimizer = nlp.begin_training()

From b30dd361798ab7aa764fa2f75153f4367e4b17fb Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Wed, 1 Nov 2017 21:49:24 +0100
Subject: [PATCH 4/4] Allow Tagger.add_label() before training

---
 spacy/pipeline.pyx | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index be6804c93..40014ce03 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -431,18 +431,31 @@ class Tagger(Pipe):
     def Model(cls, n_tags, **cfg):
         return build_tagger_model(n_tags, **cfg)
 
-    def add_label(self, label):
+    def add_label(self, label, values=None):
         if label in self.labels:
             return 0
-        raise NotImplementedError
-        #if self.model not in (True, False, None):
-        #    smaller = self.model._layers[-1]
-        #    larger = Softmax(len(self.labels)+1, smaller.nI)
-        #    copy_array(larger.W[:smaller.nO], smaller.W)
-        #    copy_array(larger.b[:smaller.nO], smaller.b)
-        #    self.model._layers[-1] = larger
-        #self.labels.append(label)
-        #return 1
+        if self.model not in (True, False, None):
+            # Here's how the model resizing will work, once the
+            # neuron-to-tag mapping is no longer controlled by
+            # the Morphology class, which sorts the tag names.
+            # The sorting makes adding labels difficult.
+            # smaller = self.model._layers[-1]
+            # larger = Softmax(len(self.labels)+1, smaller.nI)
+            # copy_array(larger.W[:smaller.nO], smaller.W)
+            # copy_array(larger.b[:smaller.nO], smaller.b)
+            # self.model._layers[-1] = larger
+            raise ValueError(
+                "Resizing pre-trained Tagger models is not "
+                "currently supported.")
+        tag_map = dict(self.vocab.morphology.tag_map)
+        if values is None:
+            values = {POS: "X"}
+        tag_map[label] = values
+        self.vocab.morphology = Morphology(
+            self.vocab.strings, tag_map=tag_map,
+            lemmatizer=self.vocab.morphology.lemmatizer,
+            exc=self.vocab.morphology.exc)
+        return 1
 
     def use_params(self, params):
         with self.model.use_params(params):