diff --git a/examples/training/train_tagger.py b/examples/training/train_tagger.py
index 95b9efcbf..161f7910c 100644
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -18,7 +18,6 @@ import random
 from pathlib import Path
 
 import spacy
-from spacy.util import get_lang_class
 from spacy.tokens import Doc
 from spacy.gold import GoldParse
 
@@ -52,13 +51,13 @@ def main(lang='en', output_dir=None, n_iter=25):
     train the tagger with a custom tag map, we're creating a new Language
     instance with a custom vocab.
     """
-    lang_cls = get_lang_class(lang)  # get Language class
-    lang_cls.Defaults.tag_map.update(TAG_MAP)  # add tag map to defaults
-    nlp = lang_cls()  # initialise Language class
-
+    nlp = spacy.blank(lang)
     # add the tagger to the pipeline
     # nlp.create_pipe works for built-ins that are registered with spaCy
     tagger = nlp.create_pipe('tagger')
+    # Add the tags. This needs to be done before you start training.
+    for tag, values in TAG_MAP.items():
+        tagger.add_label(tag, values)
     nlp.add_pipe(tagger)
 
     optimizer = nlp.begin_training()
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 4d3fb58e4..854b4f204 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -27,8 +27,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
             docs.append(doc)
             sentences = []
 
-    output_filename = input_path.parts[-1].replace(".conllu", ".json")
     output_filename = input_path.parts[-1].replace(".conll", ".json")
+    output_filename = input_path.parts[-1].replace(".conllu", ".json")
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
@@ -55,6 +55,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
                     id_ = int(id_) - 1
                     head = (int(head) - 1) if head != '0' else id_
                     dep = 'ROOT' if dep == 'root' else dep
+                    tag = pos if tag == '_' else tag
                     tag = tag+'__'+morph if use_morphology else tag
                     tokens.append((id_, word, tag, head, dep, 'O'))
                 except:
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index be6804c93..40014ce03 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -431,18 +431,31 @@ class Tagger(Pipe):
     def Model(cls, n_tags, **cfg):
         return build_tagger_model(n_tags, **cfg)
 
-    def add_label(self, label):
+    def add_label(self, label, values=None):
         if label in self.labels:
             return 0
-        raise NotImplementedError
-        #if self.model not in (True, False, None):
-        #    smaller = self.model._layers[-1]
-        #    larger = Softmax(len(self.labels)+1, smaller.nI)
-        #    copy_array(larger.W[:smaller.nO], smaller.W)
-        #    copy_array(larger.b[:smaller.nO], smaller.b)
-        #    self.model._layers[-1] = larger
-        #self.labels.append(label)
-        #return 1
+        if self.model not in (True, False, None):
+            # Here's how the model resizing will work, once the
+            # neuron-to-tag mapping is no longer controlled by
+            # the Morphology class, which sorts the tag names.
+            # The sorting makes adding labels difficult.
+            # smaller = self.model._layers[-1]
+            # larger = Softmax(len(self.labels)+1, smaller.nI)
+            # copy_array(larger.W[:smaller.nO], smaller.W)
+            # copy_array(larger.b[:smaller.nO], smaller.b)
+            # self.model._layers[-1] = larger
+            raise ValueError(
+                "Resizing pre-trained Tagger models is not "
+                "currently supported.")
+        tag_map = dict(self.vocab.morphology.tag_map)
+        if values is None:
+            values = {POS: "X"}
+        tag_map[label] = values
+        self.vocab.morphology = Morphology(
+            self.vocab.strings, tag_map=tag_map,
+            lemmatizer=self.vocab.morphology.lemmatizer,
+            exc=self.vocab.morphology.exc)
+        return 1
 
     def use_params(self, params):
         with self.model.use_params(params):
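Below is a minimal, self-contained sketch of how the new Tagger.add_label(label, values) is meant to be used, along the lines of the updated example script. It assumes spaCy v2.0.x; the TAG_MAP, TRAIN_DATA, iteration count and the string 'pos' keys are illustrative toy values, not the exact contents of examples/training/train_tagger.py.

# Usage sketch for Tagger.add_label(label, values) with a custom tag map.
# TAG_MAP and TRAIN_DATA are toy values for illustration only.
import random

import spacy
from spacy.gold import GoldParse

TAG_MAP = {
    'N': {'pos': 'NOUN'},
    'V': {'pos': 'VERB'},
    'J': {'pos': 'ADJ'},
}

TRAIN_DATA = [
    ("I like green eggs", ['N', 'V', 'J', 'N']),
    ("Eat blue ham", ['V', 'J', 'N']),
]

nlp = spacy.blank('en')                # blank Language with the default vocab
tagger = nlp.create_pipe('tagger')
for tag, values in TAG_MAP.items():
    # Register each tag before training, so its POS values end up in the
    # vocab's tag map.
    tagger.add_label(tag, values)
nlp.add_pipe(tagger)

optimizer = nlp.begin_training()
for i in range(25):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, tags in TRAIN_DATA:
        doc = nlp.make_doc(text)
        gold = GoldParse(doc, tags=tags)
        nlp.update([doc], [gold], sgd=optimizer, losses=losses)
    print(losses)

# After training, the custom tags come back on the tokens:
doc = nlp('I like blue eggs')
print([(t.text, t.tag_, t.pos_) for t in doc])

Calling add_label() before begin_training() matters: each call rebuilds the vocab's Morphology with the extended tag map, and once the model has been allocated, add_label() refuses to resize it and raises the ValueError added above. All tags therefore have to be registered up front.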