💫 Break up large pipeline.pyx (#3246)

* Break up large pipeline.pyx * Merge some components back together * Fix typo
2019-02-10 12:14:51 +01:00 · 2019-02-10 12:14:51 +01:00 · a9f8d17632
parent e7593b791e
commit a9f8d17632
7 changed files with 400 additions and 360 deletions
--- a/setup.py
+++ b/setup.py
@ -41,7 +41,7 @@ MOD_NAMES = [
    "spacy.vocab",
    "spacy.attrs",
    "spacy.morphology",
-    "spacy.pipeline",
+    "spacy.pipeline.pipes",
    "spacy.syntax.stateclass",
    "spacy.syntax._state",
    "spacy.tokenizer",
--- a/spacy/pipeline.pxd
+++ b/spacy/pipeline.pxd
--- a/spacy/pipeline/init.py
+++ b/spacy/pipeline/init.py
@ -0,0 +1,8 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .pipes import Tagger, DependencyParser, EntityRecognizer  # noqa
+from .pipes import TextCategorizer, Tensorizer, Pipe  # noqa
+from .entityruler import EntityRuler  # noqa
+from .hooks import SentenceSegmenter, SimilarityHook  # noqa
+from .functions import merge_entities, merge_noun_chunks, merge_subtokens  # noqa
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -0,0 +1,170 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import defaultdict
+import srsly
+
+from ..errors import Errors
+from ..compat import basestring_
+from ..util import ensure_path
+from ..tokens import Span
+from ..matcher import Matcher, PhraseMatcher
+
+
+class EntityRuler(object):
+    name = "entity_ruler"
+
+    def __init__(self, nlp, **cfg):
+        """Initialise the entitiy ruler. If patterns are supplied here, they
+        need to be a list of dictionaries with a `"label"` and `"pattern"`
+        key. A pattern can either be a token pattern (list) or a phrase pattern
+        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
+
+        nlp (Language): The shared nlp object to pass the vocab to the matchers
+            and process phrase patterns.
+        patterns (iterable): Optional patterns to load in.
+        overwrite_ents (bool): If existing entities are present, e.g. entities
+            added by the model, overwrite them by matches if necessary.
+        **cfg: Other config parameters. If pipeline component is loaded as part
+            of a model pipeline, this will include all keyword arguments passed
+            to `spacy.load`.
+        RETURNS (EntityRuler): The newly constructed object.
+        """
+        self.nlp = nlp
+        self.overwrite = cfg.get("overwrite_ents", False)
+        self.token_patterns = defaultdict(list)
+        self.phrase_patterns = defaultdict(list)
+        self.matcher = Matcher(nlp.vocab)
+        self.phrase_matcher = PhraseMatcher(nlp.vocab)
+        patterns = cfg.get("patterns")
+        if patterns is not None:
+            self.add_patterns(patterns)
+
+    def __len__(self):
+        """The number of all patterns added to the entity ruler."""
+        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
+        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
+        return n_token_patterns + n_phrase_patterns
+
+    def __contains__(self, label):
+        """Whether a label is present in the patterns."""
+        return label in self.token_patterns or label in self.phrase_patterns
+
+    def __call__(self, doc):
+        """Find matches in document and add them as entities.
+
+        doc (Doc): The Doc object in the pipeline.
+        RETURNS (Doc): The Doc with added entities, if available.
+        """
+        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+        matches = set(
+            [(m_id, start, end) for m_id, start, end in matches if start != end]
+        )
+        get_sort_key = lambda m: (m[2] - m[1], m[1])
+        matches = sorted(matches, key=get_sort_key, reverse=True)
+        entities = list(doc.ents)
+        new_entities = []
+        seen_tokens = set()
+        for match_id, start, end in matches:
+            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
+                continue
+            # check for end - 1 here because boundaries are inclusive
+            if start not in seen_tokens and end - 1 not in seen_tokens:
+                new_entities.append(Span(doc, start, end, label=match_id))
+                entities = [
+                    e for e in entities if not (e.start < end and e.end > start)
+                ]
+                seen_tokens.update(range(start, end))
+        doc.ents = entities + new_entities
+        return doc
+
+    @property
+    def labels(self):
+        """All labels present in the match patterns.
+
+        RETURNS (set): The string labels.
+        """
+        all_labels = set(self.token_patterns.keys())
+        all_labels.update(self.phrase_patterns.keys())
+        return all_labels
+
+    @property
+    def patterns(self):
+        """Get all patterns that were added to the entity ruler.
+
+        RETURNS (list): The original patterns, one dictionary per pattern.
+        """
+        all_patterns = []
+        for label, patterns in self.token_patterns.items():
+            for pattern in patterns:
+                all_patterns.append({"label": label, "pattern": pattern})
+        for label, patterns in self.phrase_patterns.items():
+            for pattern in patterns:
+                all_patterns.append({"label": label, "pattern": pattern.text})
+        return all_patterns
+
+    def add_patterns(self, patterns):
+        """Add patterns to the entitiy ruler. A pattern can either be a token
+        pattern (list of dicts) or a phrase pattern (string). For example:
+        {'label': 'ORG', 'pattern': 'Apple'}
+        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
+
+        patterns (list): The patterns to add.
+        """
+        for entry in patterns:
+            label = entry["label"]
+            pattern = entry["pattern"]
+            if isinstance(pattern, basestring_):
+                self.phrase_patterns[label].append(self.nlp(pattern))
+            elif isinstance(pattern, list):
+                self.token_patterns[label].append(pattern)
+            else:
+                raise ValueError(Errors.E097.format(pattern=pattern))
+        for label, patterns in self.token_patterns.items():
+            self.matcher.add(label, None, *patterns)
+        for label, patterns in self.phrase_patterns.items():
+            self.phrase_matcher.add(label, None, *patterns)
+
+    def from_bytes(self, patterns_bytes, **kwargs):
+        """Load the entity ruler from a bytestring.
+
+        patterns_bytes (bytes): The bytestring to load.
+        **kwargs: Other config paramters, mostly for consistency.
+        RETURNS (EntityRuler): The loaded entity ruler.
+        """
+        patterns = srsly.msgpack_loads(patterns_bytes)
+        self.add_patterns(patterns)
+        return self
+
+    def to_bytes(self, **kwargs):
+        """Serialize the entity ruler patterns to a bytestring.
+
+        RETURNS (bytes): The serialized patterns.
+        """
+        return srsly.msgpack_dumps(self.patterns)
+
+    def from_disk(self, path, **kwargs):
+        """Load the entity ruler from a file. Expects a file containing
+        newline-delimited JSON (JSONL) with one entry per line.
+
+        path (unicode / Path): The JSONL file to load.
+        **kwargs: Other config paramters, mostly for consistency.
+        RETURNS (EntityRuler): The loaded entity ruler.
+        """
+        path = ensure_path(path)
+        path = path.with_suffix(".jsonl")
+        patterns = srsly.read_jsonl(path)
+        self.add_patterns(patterns)
+        return self
+
+    def to_disk(self, path, **kwargs):
+        """Save the entity ruler patterns to a directory. The patterns will be
+        saved as newline-delimited JSON (JSONL).
+
+        path (unicode / Path): The JSONL file to load.
+        **kwargs: Other config paramters, mostly for consistency.
+        RETURNS (EntityRuler): The loaded entity ruler.
+        """
+        path = ensure_path(path)
+        path = path.with_suffix(".jsonl")
+        srsly.write_jsonl(path, self.patterns)
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@ -0,0 +1,48 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..matcher import Matcher
+
+
+# TODO: replace doc.merge with doc.retokenize
+
+
+def merge_noun_chunks(doc):
+    """Merge noun chunks into a single token.
+
+    doc (Doc): The Doc object.
+    RETURNS (Doc): The Doc object with merged noun chunks.
+    """
+    if not doc.is_parsed:
+        return doc
+    spans = [
+        (np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks
+    ]
+    for start, end, tag, dep in spans:
+        doc.merge(start, end, tag=tag, dep=dep)
+    return doc
+
+
+def merge_entities(doc):
+    """Merge entities into a single token.
+
+    doc (Doc): The Doc object.
+    RETURNS (Doc): The Doc object with merged noun entities.
+    """
+    spans = [
+        (e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents
+    ]
+    for start, end, tag, dep, ent_type in spans:
+        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
+    return doc
+
+
+def merge_subtokens(doc, label="subtok"):
+    merger = Matcher(doc.vocab)
+    merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
+    matches = merger(doc)
+    spans = [doc[start : end + 1] for _, start, end in matches]
+    offsets = [(span.start_char, span.end_char) for span in spans]
+    for start_char, end_char in offsets:
+        doc.merge(start_char, end_char)
+    return doc
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@ -0,0 +1,100 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from thinc.t2v import Pooling, max_pool, mean_pool
+from thinc.neural._classes.difference import Siamese, CauchySimilarity
+
+from .pipes import Pipe
+from .._ml import link_vectors_to_models
+
+
+class SentenceSegmenter(object):
+    """A simple spaCy hook, to allow custom sentence boundary detection logic
+    (that doesn't require the dependency parse). To change the sentence
+    boundary detection strategy, pass a generator function `strategy` on
+    initialization, or assign a new strategy to the .strategy attribute.
+    Sentence detection strategies should be generators that take `Doc` objects
+    and yield `Span` objects for each sentence.
+    """
+
+    name = "sentencizer"
+
+    def __init__(self, vocab, strategy=None):
+        self.vocab = vocab
+        if strategy is None or strategy == "on_punct":
+            strategy = self.split_on_punct
+        self.strategy = strategy
+
+    def __call__(self, doc):
+        doc.user_hooks["sents"] = self.strategy
+        return doc
+
+    @staticmethod
+    def split_on_punct(doc):
+        start = 0
+        seen_period = False
+        for i, word in enumerate(doc):
+            if seen_period and not word.is_punct:
+                yield doc[start : word.i]
+                start = word.i
+                seen_period = False
+            elif word.text in [".", "!", "?"]:
+                seen_period = True
+        if start < len(doc):
+            yield doc[start : len(doc)]
+
+
+class SimilarityHook(Pipe):
+    """
+    Experimental: A pipeline component to install a hook for supervised
+    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
+    documents. The similarity model can be any object obeying the Thinc `Model`
+    interface. By default, the model concatenates the elementwise mean and
+    elementwise max of the two tensors, and compares them using the
+    Cauchy-like similarity function from Chen (2013):
+
+        >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
+
+    Where W is a vector of dimension weights, initialized to 1.
+    """
+
+    name = "similarity"
+
+    def __init__(self, vocab, model=True, **cfg):
+        self.vocab = vocab
+        self.model = model
+        self.cfg = dict(cfg)
+
+    @classmethod
+    def Model(cls, length):
+        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
+
+    def __call__(self, doc):
+        """Install similarity hook"""
+        doc.user_hooks["similarity"] = self.predict
+        return doc
+
+    def pipe(self, docs, **kwargs):
+        for doc in docs:
+            yield self(doc)
+
+    def predict(self, doc1, doc2):
+        self.require_model()
+        return self.model.predict([(doc1, doc2)])
+
+    def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
+        self.require_model()
+        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
+
+    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
+        """Allocate model, using width from tensorizer in pipeline.
+
+        gold_tuples (iterable): Gold-standard training data.
+        pipeline (list): The pipeline the model is part of.
+        """
+        if self.model is True:
+            self.model = self.Model(pipeline[0].model.nO)
+            link_vectors_to_models(self.vocab)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -3,271 +3,39 @@
 # coding: utf8
 from __future__ import unicode_literals

-import numpy
 cimport numpy as np
-from collections import OrderedDict, defaultdict
+
+import numpy
+from collections import OrderedDict
 import srsly

 from thinc.api import chain
 from thinc.v2v import Affine, Maxout, Softmax
 from thinc.misc import LayerNorm
-from thinc.t2v import Pooling, max_pool, mean_pool
 from thinc.neural.util import to_categorical, copy_array
-from thinc.neural._classes.difference import Siamese, CauchySimilarity

-from .tokens.doc cimport Doc
-from .syntax.nn_parser cimport Parser
-from .syntax import nonproj
-from .syntax.ner cimport BiluoPushDown
-from .syntax.arc_eager cimport ArcEager
-from .morphology cimport Morphology
-from .vocab cimport Vocab
-from .syntax import nonproj
-from .matcher import Matcher
+from ..tokens.doc cimport Doc
+from ..syntax.nn_parser cimport Parser
+from ..syntax.ner cimport BiluoPushDown
+from ..syntax.arc_eager cimport ArcEager
+from ..morphology cimport Morphology
+from ..vocab cimport Vocab

-from .matcher import Matcher, PhraseMatcher
-from .tokens.span import Span
-from .attrs import POS, ID
-from .parts_of_speech import X
-from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
-from ._ml import build_simple_cnn_text_classifier
-from ._ml import link_vectors_to_models, zero_init, flatten
-from ._ml import create_default_optimizer
-from ._ml import masked_language_model
-from .errors import Errors, TempErrors
-from .compat import basestring_
-from . import util
+from ..syntax import nonproj
+from ..attrs import POS, ID
+from ..parts_of_speech import X
+from .._ml import Tok2Vec, build_tagger_model, build_simple_cnn_text_classifier
+from .._ml import link_vectors_to_models, zero_init, flatten
+from .._ml import masked_language_model, create_default_optimizer
+from ..errors import Errors, TempErrors
+from .. import util


-class SentenceSegmenter(object):
-    """A simple spaCy hook, to allow custom sentence boundary detection logic
-    (that doesn't require the dependency parse). To change the sentence
-    boundary detection strategy, pass a generator function `strategy` on
-    initialization, or assign a new strategy to the .strategy attribute.
-    Sentence detection strategies should be generators that take `Doc` objects
-    and yield `Span` objects for each sentence.
-    """
-    name = 'sentencizer'
-
-    def __init__(self, vocab, strategy=None):
-        self.vocab = vocab
-        if strategy is None or strategy == 'on_punct':
-            strategy = self.split_on_punct
-        self.strategy = strategy
-
-    def __call__(self, doc):
-        doc.user_hooks['sents'] = self.strategy
-        return doc
-
-    @staticmethod
-    def split_on_punct(doc):
-        start = 0
-        seen_period = False
-        for i, word in enumerate(doc):
-            if seen_period and not word.is_punct:
-                yield doc[start:word.i]
-                start = word.i
-                seen_period = False
-            elif word.text in ['.', '!', '?']:
-                seen_period = True
-        if start < len(doc):
-            yield doc[start:len(doc)]
-
-
-def merge_noun_chunks(doc):
-    """Merge noun chunks into a single token.
-
-    doc (Doc): The Doc object.
-    RETURNS (Doc): The Doc object with merged noun chunks.
-    """
-    if not doc.is_parsed:
-        return doc
-    spans = [(np.start_char, np.end_char, np.root.tag, np.root.dep)
-             for np in doc.noun_chunks]
-    for start, end, tag, dep in spans:
-        doc.merge(start, end, tag=tag, dep=dep)
-    return doc
-
-
-def merge_entities(doc):
-    """Merge entities into a single token.
-
-    doc (Doc): The Doc object.
-    RETURNS (Doc): The Doc object with merged noun entities.
-    """
-    spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label)
-             for e in doc.ents]
-    for start, end, tag, dep, ent_type in spans:
-        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
-    return doc
-
-
-def merge_subtokens(doc, label='subtok'):
-    merger = Matcher(doc.vocab)
-    merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
-    matches = merger(doc)
-    spans = [doc[start:end+1] for _, start, end in matches]
-    offsets = [(span.start_char, span.end_char) for span in spans]
-    for start_char, end_char in offsets:
-        doc.merge(start_char, end_char)
-    return doc
-
-
-class EntityRuler(object):
-    name = 'entity_ruler'
-
-    def __init__(self, nlp, **cfg):
-        """Initialise the entitiy ruler. If patterns are supplied here, they
-        need to be a list of dictionaries with a `"label"` and `"pattern"`
-        key. A pattern can either be a token pattern (list) or a phrase pattern
-        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
-
-        nlp (Language): The shared nlp object to pass the vocab to the matchers
-            and process phrase patterns.
-        patterns (iterable): Optional patterns to load in.
-        overwrite_ents (bool): If existing entities are present, e.g. entities
-            added by the model, overwrite them by matches if necessary.
-        **cfg: Other config parameters. If pipeline component is loaded as part
-            of a model pipeline, this will include all keyword arguments passed
-            to `spacy.load`.
-        RETURNS (EntityRuler): The newly constructed object.
-        """
-        self.nlp = nlp
-        self.overwrite = cfg.get('overwrite_ents', False)
-        self.token_patterns = defaultdict(list)
-        self.phrase_patterns = defaultdict(list)
-        self.matcher = Matcher(nlp.vocab)
-        self.phrase_matcher = PhraseMatcher(nlp.vocab)
-        patterns = cfg.get('patterns')
-        if patterns is not None:
-            self.add_patterns(patterns)
-
-    def __len__(self):
-        """The number of all patterns added to the entity ruler."""
-        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
-        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
-        return n_token_patterns + n_phrase_patterns
-
-    def __contains__(self, label):
-        """Whether a label is present in the patterns."""
-        return label in self.token_patterns or label in self.phrase_patterns
-
-    def __call__(self, doc):
-        """Find matches in document and add them as entities.
-
-        doc (Doc): The Doc object in the pipeline.
-        RETURNS (Doc): The Doc with added entities, if available.
-        """
-        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
-        matches = set([(m_id, start, end) for m_id, start, end in matches
-                       if start != end])
-        get_sort_key = lambda m: (m[2] - m[1], m[1])
-        matches = sorted(matches, key=get_sort_key, reverse=True)
-        entities = list(doc.ents)
-        new_entities = []
-        seen_tokens = set()
-        for match_id, start, end in matches:
-            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
-                continue
-            # check for end - 1 here because boundaries are inclusive
-            if start not in seen_tokens and end - 1 not in seen_tokens:
-                new_entities.append(Span(doc, start, end, label=match_id))
-                entities = [e for e in entities
-                            if not (e.start < end and e.end > start)]
-                seen_tokens.update(range(start, end))
-        doc.ents = entities + new_entities
-        return doc
-
-    @property
-    def labels(self):
-        """All labels present in the match patterns.
-
-        RETURNS (set): The string labels.
-        """
-        all_labels = set(self.token_patterns.keys())
-        all_labels.update(self.phrase_patterns.keys())
-        return all_labels
-
-    @property
-    def patterns(self):
-        """Get all patterns that were added to the entity ruler.
-
-        RETURNS (list): The original patterns, one dictionary per pattern.
-        """
-        all_patterns = []
-        for label, patterns in self.token_patterns.items():
-            for pattern in patterns:
-                all_patterns.append({'label': label, 'pattern': pattern})
-        for label, patterns in self.phrase_patterns.items():
-            for pattern in patterns:
-                all_patterns.append({'label': label, 'pattern': pattern.text})
-        return all_patterns
-
-    def add_patterns(self, patterns):
-        """Add patterns to the entitiy ruler. A pattern can either be a token
-        pattern (list of dicts) or a phrase pattern (string). For example:
-        {'label': 'ORG', 'pattern': 'Apple'}
-        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
-
-        patterns (list): The patterns to add.
-        """
-        for entry in patterns:
-            label = entry['label']
-            pattern = entry['pattern']
-            if isinstance(pattern, basestring_):
-                self.phrase_patterns[label].append(self.nlp(pattern))
-            elif isinstance(pattern, list):
-                self.token_patterns[label].append(pattern)
+def _load_cfg(path):
+    if path.exists():
+        return srsly.read_json(path)
    else:
-                raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, None, *patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, None, *patterns)
-
-    def from_bytes(self, patterns_bytes, **kwargs):
-        """Load the entity ruler from a bytestring.
-
-        patterns_bytes (bytes): The bytestring to load.
-        **kwargs: Other config paramters, mostly for consistency.
-        RETURNS (EntityRuler): The loaded entity ruler.
-        """
-        patterns = srsly.msgpack_loads(patterns_bytes)
-        self.add_patterns(patterns)
-        return self
-
-    def to_bytes(self, **kwargs):
-        """Serialize the entity ruler patterns to a bytestring.
-
-        RETURNS (bytes): The serialized patterns.
-        """
-        return srsly.msgpack_dumps(self.patterns)
-
-    def from_disk(self, path, **kwargs):
-        """Load the entity ruler from a file. Expects a file containing
-        newline-delimited JSON (JSONL) with one entry per line.
-
-        path (unicode / Path): The JSONL file to load.
-        **kwargs: Other config paramters, mostly for consistency.
-        RETURNS (EntityRuler): The loaded entity ruler.
-        """
-        path = util.ensure_path(path)
-        path = path.with_suffix('.jsonl')
-        patterns = srsly.read_jsonl(path)
-        self.add_patterns(patterns)
-        return self
-
-    def to_disk(self, path, **kwargs):
-        """Save the entity ruler patterns to a directory. The patterns will be
-        saved as newline-delimited JSON (JSONL).
-
-        path (unicode / Path): The JSONL file to load.
-        **kwargs: Other config paramters, mostly for consistency.
-        RETURNS (EntityRuler): The loaded entity ruler.
-        """
-        path = util.ensure_path(path)
-        path = path.with_suffix('.jsonl')
-        srsly.write_jsonl(path, self.patterns)
+        return {}


 class Pipe(object):
@ -275,6 +43,7 @@ class Pipe(object):
    it defines the interface that components should follow to function as
    components in a spaCy analysis pipeline.
    """
+
    name = None

    @classmethod
@ -300,7 +69,7 @@ class Pipe(object):

    def require_model(self):
        """Raise an error if the component's model is not initialized."""
-        if getattr(self, 'model', None) in (None, True, False):
+        if getattr(self, "model", None) in (None, True, False):
            raise ValueError(Errors.E109.format(name=self.name))

    def pipe(self, stream, batch_size=128, n_threads=-1):
@ -326,7 +95,7 @@ class Pipe(object):
        """Modify a batch of documents, using pre-computed scores."""
        raise NotImplementedError

-    def update(self, docs, golds, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model.

@ -353,11 +122,11 @@ class Pipe(object):
        raise NotImplementedError

    def create_optimizer(self):
-        return create_default_optimizer(self.model.ops,
-                                        **self.cfg.get('optimizer', {}))
+        return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))

-    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
-                       **kwargs):
+    def begin_training(
+        self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
+    ):
        """Initialize the pipe for training, using data exampes if available.
        If no model has been initialized yet, the model is added."""
        if self.model is True:
@ -375,70 +144,70 @@ class Pipe(object):
    def to_bytes(self, **exclude):
        """Serialize the pipe to a bytestring."""
        serialize = OrderedDict()
-        serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
+        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        if self.model in (True, False, None):
-            serialize['model'] = lambda: self.model
+            serialize["model"] = lambda: self.model
        else:
-            serialize['model'] = self.model.to_bytes
-        serialize['vocab'] = self.vocab.to_bytes
+            serialize["model"] = self.model.to_bytes
+        serialize["vocab"] = self.vocab.to_bytes
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, **exclude):
        """Load the pipe from a bytestring."""
+
        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
-            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
-                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
+                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            self.model.from_bytes(b)

-        deserialize = OrderedDict((
-            ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
-            ('vocab', lambda b: self.vocab.from_bytes(b)),
-            ('model', load_model),
-        ))
+        deserialize = OrderedDict(
+            (
+                ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
+                ("vocab", lambda b: self.vocab.from_bytes(b)),
+                ("model", load_model),
+            )
+        )
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, **exclude):
        """Serialize the pipe to disk."""
        serialize = OrderedDict()
-        serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg)
-        serialize['vocab'] = lambda p: self.vocab.to_disk(p)
+        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
+        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        if self.model not in (None, True, False):
-            serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
+            serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
        """Load the pipe from disk."""
+
        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
-            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
-                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
+            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
+                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
-            self.model.from_bytes(p.open('rb').read())
+            self.model.from_bytes(p.open("rb").read())

-        deserialize = OrderedDict((
-            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
-            ('vocab', lambda p: self.vocab.from_disk(p)),
-            ('model', load_model),
-        ))
+        deserialize = OrderedDict(
+            (
+                ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
+                ("vocab", lambda p: self.vocab.from_disk(p)),
+                ("model", load_model),
+            )
+        )
        util.from_disk(path, deserialize, exclude)
        return self


-def _load_cfg(path):
-    if path.exists():
-        return srsly.read_json(path)
-    else:
-        return {}
-
-
 class Tensorizer(Pipe):
    """Pre-train position-sensitive vectors for tokens."""
-    name = 'tensorizer'
+
+    name = "tensorizer"

    @classmethod
    def Model(cls, output_size=300, **cfg):
@ -449,7 +218,7 @@ class Tensorizer(Pipe):
        **cfg: Config parameters.
        RETURNS (Model): A `thinc.neural.Model` or similar instance.
        """
-        input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96))
+        input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96))
        return zero_init(Affine(output_size, input_size, drop_factor=0.0))

    def __init__(self, vocab, model=True, **cfg):
@ -470,7 +239,7 @@ class Tensorizer(Pipe):
        self.model = model
        self.input_models = []
        self.cfg = dict(cfg)
-        self.cfg.setdefault('cnn_maxout_pieces', 3)
+        self.cfg.setdefault("cnn_maxout_pieces", 3)

    def __call__(self, doc):
        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
@ -516,10 +285,12 @@ class Tensorizer(Pipe):
        """
        for doc, tensor in zip(docs, tensors):
            if tensor.shape[0] != len(doc):
-                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
+                raise ValueError(
+                    Errors.E076.format(rows=tensor.shape[0], words=len(doc))
+                )
            doc.tensor = tensor

-    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
+    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
        """Update the model.

        docs (iterable): A batch of `Doc` objects.
@ -545,7 +316,7 @@ class Tensorizer(Pipe):
        for d_input, bp_input in zip(d_inputs, bp_inputs):
            bp_input(d_input, sgd=sgd)
        if losses is not None:
-            losses.setdefault(self.name, 0.)
+            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
        return loss

@ -553,11 +324,10 @@ class Tensorizer(Pipe):
        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
        target = self.vocab.vectors.data[ids]
        d_scores = (prediction - target) / prediction.shape[0]
-        loss = (d_scores**2).sum()
+        loss = (d_scores ** 2).sum()
        return loss, d_scores

-    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None,
-                        **kwargs):
+    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

@ -566,7 +336,7 @@ class Tensorizer(Pipe):
        """
        if pipeline is not None:
            for name, model in pipeline:
-                if getattr(model, 'tok2vec', None):
+                if getattr(model, "tok2vec", None):
                    self.input_models.append(model.tok2vec)
        if self.model is True:
            self.model = self.Model(**self.cfg)
@ -1086,61 +856,6 @@ class ClozeMultitask(Pipe):
            losses[self.name] += loss


-class SimilarityHook(Pipe):
-    """
-    Experimental: A pipeline component to install a hook for supervised
-    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
-    documents. The similarity model can be any object obeying the Thinc `Model`
-    interface. By default, the model concatenates the elementwise mean and
-    elementwise max of the two tensors, and compares them using the
-    Cauchy-like similarity function from Chen (2013):
-
-        >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
-
-    Where W is a vector of dimension weights, initialized to 1.
-    """
-    name = 'similarity'
-
-    def __init__(self, vocab, model=True, **cfg):
-        self.vocab = vocab
-        self.model = model
-        self.cfg = dict(cfg)
-
-    @classmethod
-    def Model(cls, length):
-        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
-
-    def __call__(self, doc):
-        """Install similarity hook"""
-        doc.user_hooks['similarity'] = self.predict
-        return doc
-
-    def pipe(self, docs, **kwargs):
-        for doc in docs:
-            yield self(doc)
-
-    def predict(self, doc1, doc2):
-        self.require_model()
-        return self.model.predict([(doc1, doc2)])
-
-    def update(self, doc1_doc2, golds, sgd=None, drop=0.):
-        self.require_model()
-        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
-
-    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
-        """Allocate model, using width from tensorizer in pipeline.
-
-        gold_tuples (iterable): Gold-standard training data.
-        pipeline (list): The pipeline the model is part of.
-        """
-        if self.model is True:
-            self.model = self.Model(pipeline[0].model.nO)
-            link_vectors_to_models(self.vocab)
-        if sgd is None:
-            sgd = self.create_optimizer()
-        return sgd
-
-
 class TextCategorizer(Pipe):
    name = 'textcat'

@ -1299,13 +1014,12 @@ cdef class DependencyParser(Parser):


 cdef class EntityRecognizer(Parser):
-    name = 'ner'
+    name = "ner"
    TransitionSystem = BiluoPushDown
-
    nr_feature = 6

    def add_multitask_objective(self, target):
-        if target == 'cloze':
+        if target == "cloze":
            cloze = ClozeMultitask(self.vocab)
            self._multitasks.append(cloze)
        else:
@ -1326,8 +1040,8 @@ cdef class EntityRecognizer(Parser):
    def labels(self):
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
-        return [move.split('-')[1] for move in self.move_names
-                if move[0] in ('B', 'I', 'L', 'U')]
+        return [move.split("-")[1] for move in self.move_names
+                if move[0] in ("B", "I", "L", "U")]


 __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']