From a6352403982da4211cb83a80ae8bdee2fc861a7b Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 9 Oct 2017 22:03:26 -0500
Subject: [PATCH 1/4] Add conll_ner2json converter

---
 spacy/cli/converters/conll_ner2json.py | 50 ++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 spacy/cli/converters/conll_ner2json.py

diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py
new file mode 100644
index 000000000..e3bd82e7e
--- /dev/null
+++ b/spacy/cli/converters/conll_ner2json.py
@@ -0,0 +1,50 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...compat import json_dumps, path2str
+from ...util import prints
+from ...gold import iob_to_biluo
+
+
+def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
+    """
+    Convert files in the CoNLL-2003 NER format into JSON format for use with train cli.
+    """
+    docs = read_conll_ner(input_path)
+
+    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
+    output_filename = input_path.parts[-1].replace(".conll", "") + ".json"
+    output_file = output_path / output_filename
+    with output_file.open('w', encoding='utf-8') as f:
+        f.write(json_dumps(docs))
+    prints("Created %d documents" % len(docs),
+           title="Generated output file %s" % path2str(output_file))
+
+
+def read_conll_ner(input_path):
+    text = input_path.open('r', encoding='utf-8').read()
+    i = 0
+    delimit_docs = '-DOCSTART- -X- O O'
+    output_docs = []
+    for doc in text.strip().split(delimit_docs):
+        doc = doc.strip()
+        if not doc:
+            continue
+        output_doc = []
+        for sent in doc.split('\n\n'):
+            sent = sent.strip()
+            if not sent:
+                continue
+            lines = [line.strip() for line in sent.split('\n') if line.strip()]
+            words, tags, chunks, iob_ents = zip(*[line.split() for line in lines])
+            biluo_ents = iob_to_biluo(iob_ents)
+            output_doc.append({'tokens': [
+                {'orth': w, 'tag': tag, 'ner': ent} for (w, tag, ent) in
+                zip(words, tags, biluo_ents)
+            ]})
+        output_docs.append({
+            'id': len(output_docs),
+            'paragraphs': [{'sentences': output_doc}]
+        })
+        output_doc = []
+    return output_docs
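
Note on the converter above: CoNLL-2003 NER files put one token per line with
four whitespace-separated columns (word, POS tag, chunk tag, IOB entity tag),
separate sentences with blank lines, and mark document boundaries with a
'-DOCSTART- -X- O O' line; read_conll_ner splits on exactly those delimiters.
The entity tags are then re-encoded from IOB to BILUO via iob_to_biluo, which
the patch imports from spacy.gold. As a rough, self-contained sketch of what
that re-encoding does (an illustrative reimplementation, not spaCy's own code,
and it assumes well-formed IOB input):

    # Illustrative sketch only -- the converter uses spacy.gold.iob_to_biluo.
    def iob_to_biluo_demo(tags):
        biluo = []
        for i, tag in enumerate(tags):
            if tag == 'O':
                biluo.append('O')
                continue
            label = tag.split('-', 1)[1]
            prev = tags[i - 1] if i > 0 else 'O'
            nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
            # An entity starts here unless the previous tag continues it,
            # and ends here unless the next tag is I- with the same label.
            starts = tag.startswith('B-') or prev not in ('B-' + label, 'I-' + label)
            ends = nxt != 'I-' + label
            if starts and ends:
                biluo.append('U-' + label)
            elif starts:
                biluo.append('B-' + label)
            elif ends:
                biluo.append('L-' + label)
            else:
                biluo.append('I-' + label)
        return biluo

    print(iob_to_biluo_demo(['B-ORG', 'I-ORG', 'O', 'B-PER']))
    # -> ['B-ORG', 'L-ORG', 'O', 'U-PER']

BILUO makes every entity boundary explicit per token (B = first token of a
multi-token entity, I = inside, L = last, U = single-token entity, O =
outside), which is the encoding spaCy's NER component trains on.
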
From 97c9b5db8b6219d53967a136fa9fdd63bd06fca5 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Mon, 9 Oct 2017 23:41:16 -0500
Subject: [PATCH 2/4] Patch spacy.train for new pipeline management

---
 spacy/cli/train.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index b605f4e61..35ce4c43b 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -88,9 +88,11 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
     n_train_words = corpus.count_train()
 
     lang_class = util.get_lang_class(lang)
-    nlp = lang_class(pipeline=pipeline)
+    nlp = lang_class()
     if vectors:
         util.load_model(vectors, vocab=nlp.vocab)
+    for name in pipeline:
+        nlp.add_pipe(nlp.create_pipe(name), name=name)
     optimizer = nlp.begin_training(lambda: corpus.train_tuples, device=use_gpu)
     nlp._optimizer = None
 
@@ -113,6 +115,8 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                 util.set_env_log(False)
                 epoch_model_path = output_path / ('model%d' % i)
                 nlp.to_disk(epoch_model_path)
                 nlp_loaded = lang_class(pipeline=pipeline)
+                for name in pipeline:
+                    nlp_loaded.add_pipe(nlp.create_pipe(name), name=name)
                 nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
                 dev_docs = list(corpus.dev_docs(
                     nlp_loaded,
@@ -128,6 +132,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                     gpu_wps = nwords/(end_time-start_time)
                     with Model.use_device('cpu'):
                         nlp_loaded = lang_class(pipeline=pipeline)
+                        for name in pipeline:
+                            nlp_loaded.add_pipe(nlp.create_pipe(name), name=name)
+
                         nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
                         dev_docs = list(corpus.dev_docs(
                             nlp_loaded, gold_preproc=gold_preproc))

From 8143618497399543cbceb8c895cc071961094d43 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 10 Oct 2017 19:32:54 +0200
Subject: [PATCH 3/4] Set prefix length back to 1

---
 spacy/lang/lex_attrs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py
index 63695d8a1..d4beebd26 100644
--- a/spacy/lang/lex_attrs.py
+++ b/spacy/lang/lex_attrs.py
@@ -126,7 +126,7 @@ def word_shape(text):
 LEX_ATTRS = {
     attrs.LOWER: lambda string: string.lower(),
     attrs.NORM: lambda string: string.lower(),
-    attrs.PREFIX: lambda string: string[:3],
+    attrs.PREFIX: lambda string: string[0],
     attrs.SUFFIX: lambda string: string[-3:],
     attrs.CLUSTER: lambda string: 0,
     attrs.IS_ALPHA: lambda string: string.isalpha(),

From 5156074df17ee361e1d1444d48118886012b9911 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 10 Oct 2017 12:51:20 -0500
Subject: [PATCH 4/4] Make loading code more consistent in train command

---
 spacy/cli/train.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 35ce4c43b..05d035769 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -114,10 +114,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                 util.set_env_log(False)
                 epoch_model_path = output_path / ('model%d' % i)
                 nlp.to_disk(epoch_model_path)
-                nlp_loaded = lang_class(pipeline=pipeline)
-                for name in pipeline:
-                    nlp_loaded.add_pipe(nlp.create_pipe(name), name=name)
-                nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                nlp_loaded = util.load_model_from_path(epoch_model_path)
                 dev_docs = list(corpus.dev_docs(
                     nlp_loaded,
                     gold_preproc=gold_preproc))
@@ -131,11 +128,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0,
                 else:
                     gpu_wps = nwords/(end_time-start_time)
                     with Model.use_device('cpu'):
-                        nlp_loaded = lang_class(pipeline=pipeline)
-                        for name in pipeline:
-                            nlp_loaded.add_pipe(nlp.create_pipe(name), name=name)
-
-                        nlp_loaded = nlp_loaded.from_disk(epoch_model_path)
+                        nlp_loaded = util.load_model_from_path(epoch_model_path)
                         dev_docs = list(corpus.dev_docs(
                             nlp_loaded, gold_preproc=gold_preproc))
                         start_time = timer()
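
Taken together, patches 2 and 4 move the train command off the old
lang_class(pipeline=...) constructor and onto explicit component management,
then consolidate reloading behind util.load_model_from_path, which rebuilds
the pipeline from the component names recorded in the saved model's meta.json.
A minimal sketch of the resulting pattern (spaCy 2.0-era API; the 'en'
language code and the component names here are just examples):

    from spacy import util

    # Build a blank Language and attach built-in components by factory name,
    # mirroring the loop these patches add to spacy/cli/train.py.
    lang_class = util.get_lang_class('en')
    nlp = lang_class()
    for name in ('tagger', 'parser', 'ner'):
        nlp.add_pipe(nlp.create_pipe(name), name=name)
    print(nlp.pipe_names)  # ['tagger', 'parser', 'ner']

    # After nlp.to_disk(some_path), a single helper restores the same
    # pipeline instead of repeating the loop at every call site:
    # nlp_loaded = util.load_model_from_path(some_path)

Centralizing the reload in one helper is what lets patch 4 delete the three
copies of the rebuild loop that patch 2 had introduced in train.py.
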