From e237472cdcc32276d042ce56ed0ce3cef560b37e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 1 Nov 2017 21:25:33 +0100 Subject: [PATCH 1/4] Fix tag and filename conversion for conllu --- spacy/cli/converters/conllu2json.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 4d3fb58e4..4dc789010 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -28,7 +28,7 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False): sentences = [] output_filename = input_path.parts[-1].replace(".conllu", ".json") - output_filename = input_path.parts[-1].replace(".conll", ".json") + output_filename = output_filename.parts[-1].replace(".conll", ".json") output_file = output_path / output_filename with output_file.open('w', encoding='utf-8') as f: f.write(json_dumps(docs)) @@ -55,6 +55,7 @@ def read_conllx(input_path, use_morphology=False, n=0): id_ = int(id_) - 1 head = (int(head) - 1) if head != '0' else id_ dep = 'ROOT' if dep == 'root' else dep + tag = pos if tag == '_' else tag tag = tag+'__'+morph if use_morphology else tag tokens.append((id_, word, tag, head, dep, 'O')) except: From eca41f0cf6c8773813fc7e73096881d3eef38850 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 1 Nov 2017 21:26:49 +0100 Subject: [PATCH 2/4] Fix filename conversion for conllu --- spacy/cli/converters/conllu2json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 4dc789010..854b4f204 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -27,8 +27,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False): docs.append(doc) sentences = [] + output_filename = input_path.parts[-1].replace(".conll", ".json") output_filename = input_path.parts[-1].replace(".conllu", ".json") - output_filename = output_filename.parts[-1].replace(".conll", ".json") output_file = output_path / output_filename with output_file.open('w', encoding='utf-8') as f: f.write(json_dumps(docs)) From e033162a1df3d843619d7ee93543a3cd6bb301e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 1 Nov 2017 21:49:08 +0100 Subject: [PATCH 3/4] Update tagger training example --- examples/training/train_tagger.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py index 95b9efcbf..161f7910c 100644 --- a/examples/training/train_tagger.py +++ b/examples/training/train_tagger.py @@ -18,7 +18,6 @@ import random from pathlib import Path import spacy -from spacy.util import get_lang_class from spacy.tokens import Doc from spacy.gold import GoldParse @@ -52,13 +51,13 @@ def main(lang='en', output_dir=None, n_iter=25): train the tagger with a custom tag map, we're creating a new Language instance with a custom vocab. """ - lang_cls = get_lang_class(lang) # get Language class - lang_cls.Defaults.tag_map.update(TAG_MAP) # add tag map to defaults - nlp = lang_cls() # initialise Language class - + nlp = spacy.blank(lang) # add the tagger to the pipeline # nlp.create_pipe works for built-ins that are registered with spaCy tagger = nlp.create_pipe('tagger') + # Add the tags. This needs to be done before you start training. + for tag, values in TAG_MAP.items(): + tagger.add_label(tag, values) nlp.add_pipe(tagger) optimizer = nlp.begin_training() From b30dd361798ab7aa764fa2f75153f4367e4b17fb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 1 Nov 2017 21:49:24 +0100 Subject: [PATCH 4/4] Allow Tagger.add_label() before training --- spacy/pipeline.pyx | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index be6804c93..40014ce03 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -431,18 +431,31 @@ class Tagger(Pipe): def Model(cls, n_tags, **cfg): return build_tagger_model(n_tags, **cfg) - def add_label(self, label): + def add_label(self, label, values=None): if label in self.labels: return 0 - raise NotImplementedError - #if self.model not in (True, False, None): - # smaller = self.model._layers[-1] - # larger = Softmax(len(self.labels)+1, smaller.nI) - # copy_array(larger.W[:smaller.nO], smaller.W) - # copy_array(larger.b[:smaller.nO], smaller.b) - # self.model._layers[-1] = larger - #self.labels.append(label) - #return 1 + if self.model not in (True, False, None): + # Here's how the model resizing will work, once the + # neuron-to-tag mapping is no longer controlled by + # the Morphology class, which sorts the tag names. + # The sorting makes adding labels difficult. + # smaller = self.model._layers[-1] + # larger = Softmax(len(self.labels)+1, smaller.nI) + # copy_array(larger.W[:smaller.nO], smaller.W) + # copy_array(larger.b[:smaller.nO], smaller.b) + # self.model._layers[-1] = larger + raise ValueError( + "Resizing pre-trained Tagger models is not " + "currently supported.") + tag_map = dict(self.vocab.morphology.tag_map) + if values is None: + values = {POS: "X"} + tag_map[label] = values + self.vocab.morphology = Morphology( + self.vocab.strings, tag_map=tag_map, + lemmatizer=self.vocab.morphology.lemmatizer, + exc=self.vocab.morphology.exc) + return 1 def use_params(self, params): with self.model.use_params(params):