Implement new Language methods and pipeline API

2017-10-07 00:25:54 +02:00 · 2017-10-07 00:25:54 +02:00 · 212c8f0711
parent 3468d535ad
commit 212c8f0711
2 changed files with 150 additions and 116 deletions
--- a/spacy/language.py
+++ b/spacy/language.py
@ -70,59 +70,7 @@ class BaseDefaults(object):
                         prefix_search=prefix_search, suffix_search=suffix_search,
                         infix_finditer=infix_finditer, token_match=token_match)

-    @classmethod
-    def create_tagger(cls, nlp=None, **cfg):
-        if nlp is None:
-            return NeuralTagger(cls.create_vocab(nlp), **cfg)
-        else:
-            return NeuralTagger(nlp.vocab, **cfg)
-
-    @classmethod
-    def create_parser(cls, nlp=None, **cfg):
-        if nlp is None:
-            return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
-        else:
-            return NeuralDependencyParser(nlp.vocab, **cfg)
-
-    @classmethod
-    def create_entity(cls, nlp=None, **cfg):
-        if nlp is None:
-            return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
-        else:
-            return NeuralEntityRecognizer(nlp.vocab, **cfg)
-
-    @classmethod
-    def create_pipeline(cls, nlp=None, disable=tuple()):
-        meta = nlp.meta if nlp is not None else {}
-        # Resolve strings, like "cnn", "lstm", etc
-        pipeline = []
-        for entry in meta.get('pipeline', []):
-            if entry in disable or getattr(entry, 'name', entry) in disable:
-                continue
-            factory = cls.Defaults.factories[entry]
-            pipeline.append(factory(nlp, **meta.get(entry, {})))
-        return pipeline
-
-    factories = {
-        'make_doc': create_tokenizer,
-        'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
-        'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
-        'parser': lambda nlp, **cfg: [
-            NeuralDependencyParser(nlp.vocab, **cfg),
-            nonproj.deprojectivize],
-        'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
-        'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
-        'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
-        # Temporary compatibility -- delete after pivot
-        'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
-        'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
-        'dependencies': lambda nlp, **cfg: [
-            NeuralDependencyParser(nlp.vocab, **cfg),
-            nonproj.deprojectivize,
-        ],
-        'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
-    }
-
+    pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
    token_match = TOKEN_MATCH
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
@ -152,8 +100,17 @@ class Language(object):
    Defaults = BaseDefaults
    lang = None

-    def __init__(self, vocab=True, make_doc=True, pipeline=None,
-                 meta={}, disable=tuple(), **kwargs):
+    factories = {
+        'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
+        'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
+        'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
+        'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),  # nonproj.deprojectivize,
+        'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
+        'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
+        'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
+    }
+
+    def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
        """Initialise a Language object.

        vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
@ -179,28 +136,7 @@ class Language(object):
            factory = self.Defaults.create_tokenizer
            make_doc = factory(self, **meta.get('tokenizer', {}))
        self.tokenizer = make_doc
-        if pipeline is True:
-            self.pipeline = self.Defaults.create_pipeline(self, disable)
-        elif pipeline:
-            # Careful not to do getattr(p, 'name', None) here
-            # If we had disable=[None], we'd disable everything!
-            self.pipeline = [p for p in pipeline
-                             if p not in disable
-                             and getattr(p, 'name', p) not in disable]
-            # Resolve strings, like "cnn", "lstm", etc
-            for i, entry in enumerate(self.pipeline):
-                if entry in self.Defaults.factories:
-                    factory = self.Defaults.factories[entry]
-                    self.pipeline[i] = factory(self, **meta.get(entry, {}))
-        else:
-            self.pipeline = []
-        flat_list = []
-        for pipe in self.pipeline:
-            if isinstance(pipe, list):
-                flat_list.extend(pipe)
-            else:
-                flat_list.append(pipe)
-        self.pipeline = flat_list
+        self.pipeline = []
        self._optimizer = None

    @property
@ -214,11 +150,7 @@ class Language(object):
        self._meta.setdefault('email', '')
        self._meta.setdefault('url', '')
        self._meta.setdefault('license', '')
-        pipeline = []
-        for component in self.pipeline:
-            if hasattr(component, 'name'):
-                pipeline.append(component.name)
-        self._meta['pipeline'] = pipeline
+        self._meta['pipeline'] = self.pipe_names
        return self._meta

    @meta.setter
@ -228,31 +160,133 @@ class Language(object):
    # Conveniences to access pipeline components
    @property
    def tensorizer(self):
-        return self.get_component('tensorizer')
+        return self.get_pipe('tensorizer')

    @property
    def tagger(self):
-        return self.get_component('tagger')
+        return self.get_pipe('tagger')

    @property
    def parser(self):
-        return self.get_component('parser')
+        return self.get_pipe('parser')

    @property
    def entity(self):
-        return self.get_component('ner')
+        return self.get_pipe('ner')

    @property
    def matcher(self):
-        return self.get_component('matcher')
+        return self.get_pipe('matcher')

-    def get_component(self, name):
-        if self.pipeline in (True, None):
-            return None
-        for proc in self.pipeline:
-            if hasattr(proc, 'name') and proc.name.endswith(name):
-                return proc
-        return None
+    @property
+    def pipe_names(self):
+        """Get names of available pipeline components.
+
+        RETURNS (list): List of component name strings, in order.
+        """
+        return [pipe_name for pipe_name, _ in self.pipeline]
+
+    def get_pipe(self, name):
+        """Get a pipeline component for a given component name.
+
+        name (unicode): Name of pipeline component to get.
+        RETURNS (callable): The pipeline component.
+        """
+        for pipe_name, component in self.pipeline:
+            if pipe_name == name:
+                return component
+        msg = "No component '{}' found in pipeline. Available names: {}"
+        raise KeyError(msg.format(name, self.pipe_names))
+
+    def create_pipe(self, name, config=dict()):
+        """Create a pipeline component from a factory.
+
+        name (unicode): Factory name to look up in `Language.factories`.
+        RETURNS (callable): Pipeline component.
+        """
+        if name not in self.factories:
+            raise KeyError("Can't find factory for '{}'.".format(name))
+        factory = self.factories[name]
+        return factory(self, **config)
+
+    def add_pipe(self, component, name=None, before=None, after=None,
+                 first=None, last=None):
+        """Add a component to the processing pipeline. Valid components are
+        callables that take a `Doc` object, modify it and return it. Only one of
+        before, after, first or last can be set. Default behaviour is "last".
+
+        component (callable): The pipeline component.
+        name (unicode): Name of pipeline component. Overwrites existing
+            component.name attribute if available. If no name is set and
+            the component exposes no name attribute, component.__name__ is
+            used. An error is raised if the name already exists in the pipeline.
+        before (unicode): Component name to insert component directly before.
+        after (unicode): Component name to insert component directly after.
+        first (bool): Insert component first / not first in the pipeline.
+        last (bool): Insert component last / not last in the pipeline.
+
+        EXAMPLE:
+            >>> nlp.add_pipe(component, before='ner')
+            >>> nlp.add_pipe(component, name='custom_name', last=True)
+        """
+        if name is None:
+            name = getattr(component, 'name', component.__name__)
+        if name in self.pipe_names:
+            raise ValueError("'{}' already exists in pipeline.".format(name))
+        if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
+            msg = ("Invalid constraints. You can only set one of the "
+                   "following: before, after, first, last.")
+            raise ValueError(msg)
+        pipe = (name, component)
+        if last or not any([first, before, after]):
+            self.pipeline.append(pipe)
+        elif first:
+            self.pipeline.insert(0, pipe)
+        elif before and before in self.pipe_names:
+            self.pipeline.insert(self.pipe_names.index(before), pipe)
+        elif after and after in self.pipe_names:
+            self.pipeline.insert(self.pipe_names.index(after), pipe)
+        else:
+            msg = "Can't find '{}' in pipeline. Available names: {}"
+            unfound = before or after
+            raise ValueError(msg.format(unfound, self.pipe_names))
+
+    def replace_pipe(self, name, component):
+        """Replace a component in the pipeline.
+
+        name (unicode): Name of the component to replace.
+        component (callable): Pipeline component.
+        """
+        if name not in self.pipe_names:
+            msg = "Can't find '{}' in pipeline. Available names: {}"
+            raise ValueError(msg.format(name, self.pipe_names))
+        self.pipeline[self.pipe_names.index(name)] = (name, component)
+
+    def rename_pipe(self, old_name, new_name):
+        """Rename a pipeline component.
+
+        old_name (unicode): Name of the component to rename.
+        new_name (unicode): New name of the component.
+        """
+        if old_name not in self.pipe_names:
+            msg = "Can't find '{}' in pipeline. Available names: {}"
+            raise ValueError(msg.format(old_name, self.pipe_names))
+        if new_name in self.pipe_names:
+            msg = "'{}' already exists in pipeline. Existing names: {}"
+            raise ValueError(msg.format(new_name, self.pipe_names))
+        i = self.pipe_names.index(old_name)
+        self.pipeline[i] = (new_name, self.pipeline[i][1])
+
+    def remove_pipe(self, name):
+        """Remove a component from the pipeline.
+
+        name (unicode): Name of the component to remove.
+        RETURNS (tuple): A (name, component) tuple of the removed component.
+        """
+        if name not in self.pipe_names:
+            msg = "Can't find '{}' in pipeline. Available names: {}"
+            raise ValueError(msg.format(name, self.pipe_names))
+        return self.pipeline.pop(self.pipe_names.index(name))

    def __call__(self, text, disable=[]):
        """'Apply the pipeline to some text. The text can span multiple sentences,
@ -269,8 +303,7 @@ class Language(object):
            ('An', 'NN')
        """
        doc = self.make_doc(text)
-        for proc in self.pipeline:
-            name = getattr(proc, 'name', None)
+        for name, proc in self.pipeline:
            if name in disable:
                continue
            doc = proc(doc)
@ -308,7 +341,7 @@ class Language(object):
            grads[key] = (W, dW)
        pipes = list(self.pipeline)
        random.shuffle(pipes)
-        for proc in pipes:
+        for name, proc in pipes:
            if not hasattr(proc, 'update'):
                continue
            proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
@ -322,7 +355,7 @@ class Language(object):
        docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
        YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
        """
-        for proc in self.pipeline:
+        for name, proc in self.pipeline:
            if hasattr(proc, 'preprocess_gold'):
                docs_golds = proc.preprocess_gold(docs_golds)
        for doc, gold in docs_golds:
@ -371,7 +404,7 @@ class Language(object):
        else:
            device = None
        link_vectors_to_models(self.vocab)
-        for proc in self.pipeline:
+        for name, proc in self.pipeline:
            if hasattr(proc, 'begin_training'):
                context = proc.begin_training(get_gold_tuples(),
                                              pipeline=self.pipeline)
@ -393,7 +426,7 @@ class Language(object):
        docs, golds = zip(*docs_golds)
        docs = list(docs)
        golds = list(golds)
-        for pipe in self.pipeline:
+        for name, pipe in self.pipeline:
            if not hasattr(pipe, 'pipe'):
                for doc in docs:
                    pipe(doc)
@ -419,7 +452,7 @@ class Language(object):
            >>> with nlp.use_params(optimizer.averages):
            >>>     nlp.to_disk('/tmp/checkpoint')
        """
-        contexts = [pipe.use_params(params) for pipe
+        contexts = [pipe.use_params(params) for name, pipe
                    in self.pipeline if hasattr(pipe, 'use_params')]
        # TODO: Having trouble with contextlib
        # Workaround: these aren't actually context managers atm.
@ -466,8 +499,7 @@ class Language(object):
                yield (doc, context)
            return
        docs = (self.make_doc(text) for text in texts)
-        for proc in self.pipeline:
-            name = getattr(proc, 'name', None)
+        for name, proc in self.pipeline:
            if name in disable:
                continue
            if hasattr(proc, 'pipe'):
@ -495,14 +527,14 @@ class Language(object):
            ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
            ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
        ))
-        for proc in self.pipeline:
+        for name, proc in self.pipeline:
            if not hasattr(proc, 'name'):
                continue
-            if proc.name in disable:
+            if name in disable:
                continue
            if not hasattr(proc, 'to_disk'):
                continue
-            serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
+            serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
        serializers['vocab'] = lambda p: self.vocab.to_disk(p)
        util.to_disk(path, serializers, {p: False for p in disable})

@ -526,14 +558,12 @@ class Language(object):
            ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
            ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
        ))
-        for proc in self.pipeline:
-            if not hasattr(proc, 'name'):
-                continue
-            if proc.name in disable:
+        for name, proc in self.pipeline:
+            if name in disable:
                continue
            if not hasattr(proc, 'to_disk'):
                continue
-            deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
+            deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
        exclude = {p: False for p in disable}
        if not (path / 'vocab').exists():
            exclude['vocab'] = True
@ -552,8 +582,8 @@ class Language(object):
            ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
            ('meta', lambda: ujson.dumps(self.meta))
        ))
-        for i, proc in enumerate(self.pipeline):
-            if getattr(proc, 'name', None) in disable:
+        for i, (name, proc) in enumerate(self.pipeline):
+            if name in disable:
                continue
            if not hasattr(proc, 'to_bytes'):
                continue
@ -572,8 +602,8 @@ class Language(object):
            ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
            ('meta', lambda b: self.meta.update(ujson.loads(b)))
        ))
-        for i, proc in enumerate(self.pipeline):
-            if getattr(proc, 'name', None) in disable:
+        for i, (name, proc) in enumerate(self.pipeline):
+            if name in disable:
                continue
            if not hasattr(proc, 'from_bytes'):
                continue
--- a/spacy/util.py
+++ b/spacy/util.py
@ -135,7 +135,11 @@ def load_model_from_path(model_path, meta=False, **overrides):
    if not meta:
        meta = get_model_meta(model_path)
    cls = get_lang_class(meta['lang'])
-    nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides)
+    nlp = cls(meta=meta, **overrides)
+    for name in meta.get('pipeline', []):
+        config = meta.get('pipeline_args', {}).get(name, {})
+        component = nlp.create_pipe(name, config=config)
+        nlp.add_pipe(component, name=name)
    return nlp.from_disk(model_path)