Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2017-11-01 21:49:59 +01:00 · 2017-11-01 21:49:59 +01:00 · bc637b2433
parent 1976fb157f b30dd36179
commit bc637b2433
3 changed files with 29 additions and 16 deletions
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@@ -18,7 +18,6 @@ import random
 from pathlib import Path

 import spacy
-from spacy.util import get_lang_class
 from spacy.tokens import Doc
 from spacy.gold import GoldParse

@@ -52,13 +51,13 @@ def main(lang='en', output_dir=None, n_iter=25):
    train the tagger with a custom tag map, we're creating a new Language
    instance with a custom vocab.
    """
-    lang_cls = get_lang_class(lang)  # get Language class
-    lang_cls.Defaults.tag_map.update(TAG_MAP)  # add tag map to defaults
-    nlp = lang_cls()  # initialise Language class
-
+    nlp = spacy.blank(lang)
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe('tagger')
+    # Add the tags. This needs to be done before you start training.
+    for tag, values in TAG_MAP.items():
+        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -27,8 +27,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
            docs.append(doc)
            sentences = []

-    output_filename = input_path.parts[-1].replace(".conllu", ".json")
    output_filename = input_path.parts[-1].replace(".conll", ".json")
+    output_filename = input_path.parts[-1].replace(".conllu", ".json")
    output_file = output_path / output_filename
    with output_file.open('w', encoding='utf-8') as f:
        f.write(json_dumps(docs))
@@ -55,6 +55,7 @@ def read_conllx(input_path, use_morphology=False, n=0):
                    id_ = int(id_) - 1
                    head = (int(head) - 1) if head != '0' else id_
                    dep = 'ROOT' if dep == 'root' else dep
+                    tag = pos if tag == '_' else tag
                    tag = tag+'__'+morph  if use_morphology else tag
                    tokens.append((id_, word, tag, head, dep, 'O'))
                except:
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -431,18 +431,31 @@ class Tagger(Pipe):
    def Model(cls, n_tags, **cfg):
        return build_tagger_model(n_tags, **cfg)

-    def add_label(self, label):
+    def add_label(self, label, values=None):
        if label in self.labels:
            return 0
-        raise NotImplementedError
-        #if self.model not in (True, False, None):
+        if self.model not in (True, False, None):
+            # Here's how the model resizing will work, once the
+            # neuron-to-tag mapping is no longer controlled by
+            # the Morphology class, which sorts the tag names.
+            # The sorting makes adding labels difficult.
            # smaller = self.model._layers[-1]
            # larger = Softmax(len(self.labels)+1, smaller.nI)
            # copy_array(larger.W[:smaller.nO], smaller.W)
            # copy_array(larger.b[:smaller.nO], smaller.b)
            # self.model._layers[-1] = larger
-        #self.labels.append(label)
-        #return 1
+            raise ValueError(
+                "Resizing pre-trained Tagger models is not "
+                "currently supported.")
+        tag_map = dict(self.vocab.morphology.tag_map)
+        if values is None:
+            values = {POS: "X"}
+        tag_map[label] = values
+        self.vocab.morphology = Morphology(
+            self.vocab.strings, tag_map=tag_map,
+            lemmatizer=self.vocab.morphology.lemmatizer,
+            exc=self.vocab.morphology.exc)
+        return 1

    def use_params(self, params):
        with self.model.use_params(params):