From a9f8d1763298c406e79ac8534a4d1a780b171312 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Feb 2019 12:14:51 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Break=20up=20large=20pipeline.py?= =?UTF-8?q?x=20(#3246)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Break up large pipeline.pyx * Merge some components back together * Fix typo --- setup.py | 2 +- spacy/pipeline.pxd | 0 spacy/pipeline/__init__.py | 8 + spacy/pipeline/entityruler.py | 170 ++++++++ spacy/pipeline/functions.py | 48 +++ spacy/pipeline/hooks.py | 100 +++++ spacy/{pipeline.pyx => pipeline/pipes.pyx} | 432 ++++----------------- 7 files changed, 400 insertions(+), 360 deletions(-) delete mode 100644 spacy/pipeline.pxd create mode 100644 spacy/pipeline/__init__.py create mode 100644 spacy/pipeline/entityruler.py create mode 100644 spacy/pipeline/functions.py create mode 100644 spacy/pipeline/hooks.py rename spacy/{pipeline.pyx => pipeline/pipes.pyx} (72%) diff --git a/setup.py b/setup.py index e3705255c..214f32c14 100755 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ MOD_NAMES = [ "spacy.vocab", "spacy.attrs", "spacy.morphology", - "spacy.pipeline", + "spacy.pipeline.pipes", "spacy.syntax.stateclass", "spacy.syntax._state", "spacy.tokenizer", diff --git a/spacy/pipeline.pxd b/spacy/pipeline.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py new file mode 100644 index 000000000..d683cc989 --- /dev/null +++ b/spacy/pipeline/__init__.py @@ -0,0 +1,8 @@ +# coding: utf8 +from __future__ import unicode_literals + +from .pipes import Tagger, DependencyParser, EntityRecognizer # noqa +from .pipes import TextCategorizer, Tensorizer, Pipe # noqa +from .entityruler import EntityRuler # noqa +from .hooks import SentenceSegmenter, SimilarityHook # noqa +from .functions import merge_entities, merge_noun_chunks, merge_subtokens # noqa diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py new file mode 100644 index 000000000..bef929411 --- /dev/null +++ b/spacy/pipeline/entityruler.py @@ -0,0 +1,170 @@ +# coding: utf8 +from __future__ import unicode_literals + +from collections import defaultdict +import srsly + +from ..errors import Errors +from ..compat import basestring_ +from ..util import ensure_path +from ..tokens import Span +from ..matcher import Matcher, PhraseMatcher + + +class EntityRuler(object): + name = "entity_ruler" + + def __init__(self, nlp, **cfg): + """Initialise the entitiy ruler. If patterns are supplied here, they + need to be a list of dictionaries with a `"label"` and `"pattern"` + key. A pattern can either be a token pattern (list) or a phrase pattern + (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`. + + nlp (Language): The shared nlp object to pass the vocab to the matchers + and process phrase patterns. + patterns (iterable): Optional patterns to load in. + overwrite_ents (bool): If existing entities are present, e.g. entities + added by the model, overwrite them by matches if necessary. + **cfg: Other config parameters. If pipeline component is loaded as part + of a model pipeline, this will include all keyword arguments passed + to `spacy.load`. + RETURNS (EntityRuler): The newly constructed object. + """ + self.nlp = nlp + self.overwrite = cfg.get("overwrite_ents", False) + self.token_patterns = defaultdict(list) + self.phrase_patterns = defaultdict(list) + self.matcher = Matcher(nlp.vocab) + self.phrase_matcher = PhraseMatcher(nlp.vocab) + patterns = cfg.get("patterns") + if patterns is not None: + self.add_patterns(patterns) + + def __len__(self): + """The number of all patterns added to the entity ruler.""" + n_token_patterns = sum(len(p) for p in self.token_patterns.values()) + n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values()) + return n_token_patterns + n_phrase_patterns + + def __contains__(self, label): + """Whether a label is present in the patterns.""" + return label in self.token_patterns or label in self.phrase_patterns + + def __call__(self, doc): + """Find matches in document and add them as entities. + + doc (Doc): The Doc object in the pipeline. + RETURNS (Doc): The Doc with added entities, if available. + """ + matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) + matches = set( + [(m_id, start, end) for m_id, start, end in matches if start != end] + ) + get_sort_key = lambda m: (m[2] - m[1], m[1]) + matches = sorted(matches, key=get_sort_key, reverse=True) + entities = list(doc.ents) + new_entities = [] + seen_tokens = set() + for match_id, start, end in matches: + if any(t.ent_type for t in doc[start:end]) and not self.overwrite: + continue + # check for end - 1 here because boundaries are inclusive + if start not in seen_tokens and end - 1 not in seen_tokens: + new_entities.append(Span(doc, start, end, label=match_id)) + entities = [ + e for e in entities if not (e.start < end and e.end > start) + ] + seen_tokens.update(range(start, end)) + doc.ents = entities + new_entities + return doc + + @property + def labels(self): + """All labels present in the match patterns. + + RETURNS (set): The string labels. + """ + all_labels = set(self.token_patterns.keys()) + all_labels.update(self.phrase_patterns.keys()) + return all_labels + + @property + def patterns(self): + """Get all patterns that were added to the entity ruler. + + RETURNS (list): The original patterns, one dictionary per pattern. + """ + all_patterns = [] + for label, patterns in self.token_patterns.items(): + for pattern in patterns: + all_patterns.append({"label": label, "pattern": pattern}) + for label, patterns in self.phrase_patterns.items(): + for pattern in patterns: + all_patterns.append({"label": label, "pattern": pattern.text}) + return all_patterns + + def add_patterns(self, patterns): + """Add patterns to the entitiy ruler. A pattern can either be a token + pattern (list of dicts) or a phrase pattern (string). For example: + {'label': 'ORG', 'pattern': 'Apple'} + {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]} + + patterns (list): The patterns to add. + """ + for entry in patterns: + label = entry["label"] + pattern = entry["pattern"] + if isinstance(pattern, basestring_): + self.phrase_patterns[label].append(self.nlp(pattern)) + elif isinstance(pattern, list): + self.token_patterns[label].append(pattern) + else: + raise ValueError(Errors.E097.format(pattern=pattern)) + for label, patterns in self.token_patterns.items(): + self.matcher.add(label, None, *patterns) + for label, patterns in self.phrase_patterns.items(): + self.phrase_matcher.add(label, None, *patterns) + + def from_bytes(self, patterns_bytes, **kwargs): + """Load the entity ruler from a bytestring. + + patterns_bytes (bytes): The bytestring to load. + **kwargs: Other config paramters, mostly for consistency. + RETURNS (EntityRuler): The loaded entity ruler. + """ + patterns = srsly.msgpack_loads(patterns_bytes) + self.add_patterns(patterns) + return self + + def to_bytes(self, **kwargs): + """Serialize the entity ruler patterns to a bytestring. + + RETURNS (bytes): The serialized patterns. + """ + return srsly.msgpack_dumps(self.patterns) + + def from_disk(self, path, **kwargs): + """Load the entity ruler from a file. Expects a file containing + newline-delimited JSON (JSONL) with one entry per line. + + path (unicode / Path): The JSONL file to load. + **kwargs: Other config paramters, mostly for consistency. + RETURNS (EntityRuler): The loaded entity ruler. + """ + path = ensure_path(path) + path = path.with_suffix(".jsonl") + patterns = srsly.read_jsonl(path) + self.add_patterns(patterns) + return self + + def to_disk(self, path, **kwargs): + """Save the entity ruler patterns to a directory. The patterns will be + saved as newline-delimited JSON (JSONL). + + path (unicode / Path): The JSONL file to load. + **kwargs: Other config paramters, mostly for consistency. + RETURNS (EntityRuler): The loaded entity ruler. + """ + path = ensure_path(path) + path = path.with_suffix(".jsonl") + srsly.write_jsonl(path, self.patterns) diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py new file mode 100644 index 000000000..23000d948 --- /dev/null +++ b/spacy/pipeline/functions.py @@ -0,0 +1,48 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..matcher import Matcher + + +# TODO: replace doc.merge with doc.retokenize + + +def merge_noun_chunks(doc): + """Merge noun chunks into a single token. + + doc (Doc): The Doc object. + RETURNS (Doc): The Doc object with merged noun chunks. + """ + if not doc.is_parsed: + return doc + spans = [ + (np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks + ] + for start, end, tag, dep in spans: + doc.merge(start, end, tag=tag, dep=dep) + return doc + + +def merge_entities(doc): + """Merge entities into a single token. + + doc (Doc): The Doc object. + RETURNS (Doc): The Doc object with merged noun entities. + """ + spans = [ + (e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents + ] + for start, end, tag, dep, ent_type in spans: + doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type) + return doc + + +def merge_subtokens(doc, label="subtok"): + merger = Matcher(doc.vocab) + merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}]) + matches = merger(doc) + spans = [doc[start : end + 1] for _, start, end in matches] + offsets = [(span.start_char, span.end_char) for span in spans] + for start_char, end_char in offsets: + doc.merge(start_char, end_char) + return doc diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py new file mode 100644 index 000000000..7e9d09c54 --- /dev/null +++ b/spacy/pipeline/hooks.py @@ -0,0 +1,100 @@ +# coding: utf8 +from __future__ import unicode_literals + +from thinc.t2v import Pooling, max_pool, mean_pool +from thinc.neural._classes.difference import Siamese, CauchySimilarity + +from .pipes import Pipe +from .._ml import link_vectors_to_models + + +class SentenceSegmenter(object): + """A simple spaCy hook, to allow custom sentence boundary detection logic + (that doesn't require the dependency parse). To change the sentence + boundary detection strategy, pass a generator function `strategy` on + initialization, or assign a new strategy to the .strategy attribute. + Sentence detection strategies should be generators that take `Doc` objects + and yield `Span` objects for each sentence. + """ + + name = "sentencizer" + + def __init__(self, vocab, strategy=None): + self.vocab = vocab + if strategy is None or strategy == "on_punct": + strategy = self.split_on_punct + self.strategy = strategy + + def __call__(self, doc): + doc.user_hooks["sents"] = self.strategy + return doc + + @staticmethod + def split_on_punct(doc): + start = 0 + seen_period = False + for i, word in enumerate(doc): + if seen_period and not word.is_punct: + yield doc[start : word.i] + start = word.i + seen_period = False + elif word.text in [".", "!", "?"]: + seen_period = True + if start < len(doc): + yield doc[start : len(doc)] + + +class SimilarityHook(Pipe): + """ + Experimental: A pipeline component to install a hook for supervised + similarity into `Doc` objects. Requires a `Tensorizer` to pre-process + documents. The similarity model can be any object obeying the Thinc `Model` + interface. By default, the model concatenates the elementwise mean and + elementwise max of the two tensors, and compares them using the + Cauchy-like similarity function from Chen (2013): + + >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum()) + + Where W is a vector of dimension weights, initialized to 1. + """ + + name = "similarity" + + def __init__(self, vocab, model=True, **cfg): + self.vocab = vocab + self.model = model + self.cfg = dict(cfg) + + @classmethod + def Model(cls, length): + return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) + + def __call__(self, doc): + """Install similarity hook""" + doc.user_hooks["similarity"] = self.predict + return doc + + def pipe(self, docs, **kwargs): + for doc in docs: + yield self(doc) + + def predict(self, doc1, doc2): + self.require_model() + return self.model.predict([(doc1, doc2)]) + + def update(self, doc1_doc2, golds, sgd=None, drop=0.0): + self.require_model() + sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) + + def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): + """Allocate model, using width from tensorizer in pipeline. + + gold_tuples (iterable): Gold-standard training data. + pipeline (list): The pipeline the model is part of. + """ + if self.model is True: + self.model = self.Model(pipeline[0].model.nO) + link_vectors_to_models(self.vocab) + if sgd is None: + sgd = self.create_optimizer() + return sgd diff --git a/spacy/pipeline.pyx b/spacy/pipeline/pipes.pyx similarity index 72% rename from spacy/pipeline.pyx rename to spacy/pipeline/pipes.pyx index 45cba61d2..92fecb8b3 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,271 +3,39 @@ # coding: utf8 from __future__ import unicode_literals -import numpy cimport numpy as np -from collections import OrderedDict, defaultdict + +import numpy +from collections import OrderedDict import srsly from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm -from thinc.t2v import Pooling, max_pool, mean_pool from thinc.neural.util import to_categorical, copy_array -from thinc.neural._classes.difference import Siamese, CauchySimilarity -from .tokens.doc cimport Doc -from .syntax.nn_parser cimport Parser -from .syntax import nonproj -from .syntax.ner cimport BiluoPushDown -from .syntax.arc_eager cimport ArcEager -from .morphology cimport Morphology -from .vocab cimport Vocab -from .syntax import nonproj -from .matcher import Matcher +from ..tokens.doc cimport Doc +from ..syntax.nn_parser cimport Parser +from ..syntax.ner cimport BiluoPushDown +from ..syntax.arc_eager cimport ArcEager +from ..morphology cimport Morphology +from ..vocab cimport Vocab -from .matcher import Matcher, PhraseMatcher -from .tokens.span import Span -from .attrs import POS, ID -from .parts_of_speech import X -from ._ml import Tok2Vec, build_text_classifier, build_tagger_model -from ._ml import build_simple_cnn_text_classifier -from ._ml import link_vectors_to_models, zero_init, flatten -from ._ml import create_default_optimizer -from ._ml import masked_language_model -from .errors import Errors, TempErrors -from .compat import basestring_ -from . import util +from ..syntax import nonproj +from ..attrs import POS, ID +from ..parts_of_speech import X +from .._ml import Tok2Vec, build_tagger_model, build_simple_cnn_text_classifier +from .._ml import link_vectors_to_models, zero_init, flatten +from .._ml import masked_language_model, create_default_optimizer +from ..errors import Errors, TempErrors +from .. import util -class SentenceSegmenter(object): - """A simple spaCy hook, to allow custom sentence boundary detection logic - (that doesn't require the dependency parse). To change the sentence - boundary detection strategy, pass a generator function `strategy` on - initialization, or assign a new strategy to the .strategy attribute. - Sentence detection strategies should be generators that take `Doc` objects - and yield `Span` objects for each sentence. - """ - name = 'sentencizer' - - def __init__(self, vocab, strategy=None): - self.vocab = vocab - if strategy is None or strategy == 'on_punct': - strategy = self.split_on_punct - self.strategy = strategy - - def __call__(self, doc): - doc.user_hooks['sents'] = self.strategy - return doc - - @staticmethod - def split_on_punct(doc): - start = 0 - seen_period = False - for i, word in enumerate(doc): - if seen_period and not word.is_punct: - yield doc[start:word.i] - start = word.i - seen_period = False - elif word.text in ['.', '!', '?']: - seen_period = True - if start < len(doc): - yield doc[start:len(doc)] - - -def merge_noun_chunks(doc): - """Merge noun chunks into a single token. - - doc (Doc): The Doc object. - RETURNS (Doc): The Doc object with merged noun chunks. - """ - if not doc.is_parsed: - return doc - spans = [(np.start_char, np.end_char, np.root.tag, np.root.dep) - for np in doc.noun_chunks] - for start, end, tag, dep in spans: - doc.merge(start, end, tag=tag, dep=dep) - return doc - - -def merge_entities(doc): - """Merge entities into a single token. - - doc (Doc): The Doc object. - RETURNS (Doc): The Doc object with merged noun entities. - """ - spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) - for e in doc.ents] - for start, end, tag, dep, ent_type in spans: - doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type) - return doc - - -def merge_subtokens(doc, label='subtok'): - merger = Matcher(doc.vocab) - merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}]) - matches = merger(doc) - spans = [doc[start:end+1] for _, start, end in matches] - offsets = [(span.start_char, span.end_char) for span in spans] - for start_char, end_char in offsets: - doc.merge(start_char, end_char) - return doc - - -class EntityRuler(object): - name = 'entity_ruler' - - def __init__(self, nlp, **cfg): - """Initialise the entitiy ruler. If patterns are supplied here, they - need to be a list of dictionaries with a `"label"` and `"pattern"` - key. A pattern can either be a token pattern (list) or a phrase pattern - (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`. - - nlp (Language): The shared nlp object to pass the vocab to the matchers - and process phrase patterns. - patterns (iterable): Optional patterns to load in. - overwrite_ents (bool): If existing entities are present, e.g. entities - added by the model, overwrite them by matches if necessary. - **cfg: Other config parameters. If pipeline component is loaded as part - of a model pipeline, this will include all keyword arguments passed - to `spacy.load`. - RETURNS (EntityRuler): The newly constructed object. - """ - self.nlp = nlp - self.overwrite = cfg.get('overwrite_ents', False) - self.token_patterns = defaultdict(list) - self.phrase_patterns = defaultdict(list) - self.matcher = Matcher(nlp.vocab) - self.phrase_matcher = PhraseMatcher(nlp.vocab) - patterns = cfg.get('patterns') - if patterns is not None: - self.add_patterns(patterns) - - def __len__(self): - """The number of all patterns added to the entity ruler.""" - n_token_patterns = sum(len(p) for p in self.token_patterns.values()) - n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values()) - return n_token_patterns + n_phrase_patterns - - def __contains__(self, label): - """Whether a label is present in the patterns.""" - return label in self.token_patterns or label in self.phrase_patterns - - def __call__(self, doc): - """Find matches in document and add them as entities. - - doc (Doc): The Doc object in the pipeline. - RETURNS (Doc): The Doc with added entities, if available. - """ - matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) - matches = set([(m_id, start, end) for m_id, start, end in matches - if start != end]) - get_sort_key = lambda m: (m[2] - m[1], m[1]) - matches = sorted(matches, key=get_sort_key, reverse=True) - entities = list(doc.ents) - new_entities = [] - seen_tokens = set() - for match_id, start, end in matches: - if any(t.ent_type for t in doc[start:end]) and not self.overwrite: - continue - # check for end - 1 here because boundaries are inclusive - if start not in seen_tokens and end - 1 not in seen_tokens: - new_entities.append(Span(doc, start, end, label=match_id)) - entities = [e for e in entities - if not (e.start < end and e.end > start)] - seen_tokens.update(range(start, end)) - doc.ents = entities + new_entities - return doc - - @property - def labels(self): - """All labels present in the match patterns. - - RETURNS (set): The string labels. - """ - all_labels = set(self.token_patterns.keys()) - all_labels.update(self.phrase_patterns.keys()) - return all_labels - - @property - def patterns(self): - """Get all patterns that were added to the entity ruler. - - RETURNS (list): The original patterns, one dictionary per pattern. - """ - all_patterns = [] - for label, patterns in self.token_patterns.items(): - for pattern in patterns: - all_patterns.append({'label': label, 'pattern': pattern}) - for label, patterns in self.phrase_patterns.items(): - for pattern in patterns: - all_patterns.append({'label': label, 'pattern': pattern.text}) - return all_patterns - - def add_patterns(self, patterns): - """Add patterns to the entitiy ruler. A pattern can either be a token - pattern (list of dicts) or a phrase pattern (string). For example: - {'label': 'ORG', 'pattern': 'Apple'} - {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]} - - patterns (list): The patterns to add. - """ - for entry in patterns: - label = entry['label'] - pattern = entry['pattern'] - if isinstance(pattern, basestring_): - self.phrase_patterns[label].append(self.nlp(pattern)) - elif isinstance(pattern, list): - self.token_patterns[label].append(pattern) - else: - raise ValueError(Errors.E097.format(pattern=pattern)) - for label, patterns in self.token_patterns.items(): - self.matcher.add(label, None, *patterns) - for label, patterns in self.phrase_patterns.items(): - self.phrase_matcher.add(label, None, *patterns) - - def from_bytes(self, patterns_bytes, **kwargs): - """Load the entity ruler from a bytestring. - - patterns_bytes (bytes): The bytestring to load. - **kwargs: Other config paramters, mostly for consistency. - RETURNS (EntityRuler): The loaded entity ruler. - """ - patterns = srsly.msgpack_loads(patterns_bytes) - self.add_patterns(patterns) - return self - - def to_bytes(self, **kwargs): - """Serialize the entity ruler patterns to a bytestring. - - RETURNS (bytes): The serialized patterns. - """ - return srsly.msgpack_dumps(self.patterns) - - def from_disk(self, path, **kwargs): - """Load the entity ruler from a file. Expects a file containing - newline-delimited JSON (JSONL) with one entry per line. - - path (unicode / Path): The JSONL file to load. - **kwargs: Other config paramters, mostly for consistency. - RETURNS (EntityRuler): The loaded entity ruler. - """ - path = util.ensure_path(path) - path = path.with_suffix('.jsonl') - patterns = srsly.read_jsonl(path) - self.add_patterns(patterns) - return self - - def to_disk(self, path, **kwargs): - """Save the entity ruler patterns to a directory. The patterns will be - saved as newline-delimited JSON (JSONL). - - path (unicode / Path): The JSONL file to load. - **kwargs: Other config paramters, mostly for consistency. - RETURNS (EntityRuler): The loaded entity ruler. - """ - path = util.ensure_path(path) - path = path.with_suffix('.jsonl') - srsly.write_jsonl(path, self.patterns) +def _load_cfg(path): + if path.exists(): + return srsly.read_json(path) + else: + return {} class Pipe(object): @@ -275,6 +43,7 @@ class Pipe(object): it defines the interface that components should follow to function as components in a spaCy analysis pipeline. """ + name = None @classmethod @@ -300,7 +69,7 @@ class Pipe(object): def require_model(self): """Raise an error if the component's model is not initialized.""" - if getattr(self, 'model', None) in (None, True, False): + if getattr(self, "model", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) def pipe(self, stream, batch_size=128, n_threads=-1): @@ -326,7 +95,7 @@ class Pipe(object): """Modify a batch of documents, using pre-computed scores.""" raise NotImplementedError - def update(self, docs, golds, drop=0., sgd=None, losses=None): + def update(self, docs, golds, drop=0.0, sgd=None, losses=None): """Learn from a batch of documents and gold-standard information, updating the pipe's model. @@ -353,11 +122,11 @@ class Pipe(object): raise NotImplementedError def create_optimizer(self): - return create_default_optimizer(self.model.ops, - **self.cfg.get('optimizer', {})) + return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {})) - def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, - **kwargs): + def begin_training( + self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs + ): """Initialize the pipe for training, using data exampes if available. If no model has been initialized yet, the model is added.""" if self.model is True: @@ -375,70 +144,70 @@ class Pipe(object): def to_bytes(self, **exclude): """Serialize the pipe to a bytestring.""" serialize = OrderedDict() - serialize['cfg'] = lambda: srsly.json_dumps(self.cfg) + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) if self.model in (True, False, None): - serialize['model'] = lambda: self.model + serialize["model"] = lambda: self.model else: - serialize['model'] = self.model.to_bytes - serialize['vocab'] = self.vocab.to_bytes + serialize["model"] = self.model.to_bytes + serialize["vocab"] = self.vocab.to_bytes return util.to_bytes(serialize, exclude) def from_bytes(self, bytes_data, **exclude): """Load the pipe from a bytestring.""" + def load_model(b): # TODO: Remove this once we don't have to handle previous models - if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: - self.cfg['pretrained_vectors'] = self.vocab.vectors.name + if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: + self.cfg["pretrained_vectors"] = self.vocab.vectors.name if self.model is True: self.model = self.Model(**self.cfg) self.model.from_bytes(b) - deserialize = OrderedDict(( - ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), - ('vocab', lambda b: self.vocab.from_bytes(b)), - ('model', load_model), - )) + deserialize = OrderedDict( + ( + ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), + ("vocab", lambda b: self.vocab.from_bytes(b)), + ("model", load_model), + ) + ) util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, **exclude): """Serialize the pipe to disk.""" serialize = OrderedDict() - serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg) - serialize['vocab'] = lambda p: self.vocab.to_disk(p) + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): - serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes()) + serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) util.to_disk(path, serialize, exclude) def from_disk(self, path, **exclude): """Load the pipe from disk.""" + def load_model(p): # TODO: Remove this once we don't have to handle previous models - if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg: - self.cfg['pretrained_vectors'] = self.vocab.vectors.name + if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg: + self.cfg["pretrained_vectors"] = self.vocab.vectors.name if self.model is True: self.model = self.Model(**self.cfg) - self.model.from_bytes(p.open('rb').read()) + self.model.from_bytes(p.open("rb").read()) - deserialize = OrderedDict(( - ('cfg', lambda p: self.cfg.update(_load_cfg(p))), - ('vocab', lambda p: self.vocab.from_disk(p)), - ('model', load_model), - )) + deserialize = OrderedDict( + ( + ("cfg", lambda p: self.cfg.update(_load_cfg(p))), + ("vocab", lambda p: self.vocab.from_disk(p)), + ("model", load_model), + ) + ) util.from_disk(path, deserialize, exclude) return self -def _load_cfg(path): - if path.exists(): - return srsly.read_json(path) - else: - return {} - - class Tensorizer(Pipe): """Pre-train position-sensitive vectors for tokens.""" - name = 'tensorizer' + + name = "tensorizer" @classmethod def Model(cls, output_size=300, **cfg): @@ -449,7 +218,7 @@ class Tensorizer(Pipe): **cfg: Config parameters. RETURNS (Model): A `thinc.neural.Model` or similar instance. """ - input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96)) + input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96)) return zero_init(Affine(output_size, input_size, drop_factor=0.0)) def __init__(self, vocab, model=True, **cfg): @@ -470,7 +239,7 @@ class Tensorizer(Pipe): self.model = model self.input_models = [] self.cfg = dict(cfg) - self.cfg.setdefault('cnn_maxout_pieces', 3) + self.cfg.setdefault("cnn_maxout_pieces", 3) def __call__(self, doc): """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM @@ -516,10 +285,12 @@ class Tensorizer(Pipe): """ for doc, tensor in zip(docs, tensors): if tensor.shape[0] != len(doc): - raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) + raise ValueError( + Errors.E076.format(rows=tensor.shape[0], words=len(doc)) + ) doc.tensor = tensor - def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): + def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): """Update the model. docs (iterable): A batch of `Doc` objects. @@ -545,7 +316,7 @@ class Tensorizer(Pipe): for d_input, bp_input in zip(d_inputs, bp_inputs): bp_input(d_input, sgd=sgd) if losses is not None: - losses.setdefault(self.name, 0.) + losses.setdefault(self.name, 0.0) losses[self.name] += loss return loss @@ -553,11 +324,10 @@ class Tensorizer(Pipe): ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = self.vocab.vectors.data[ids] d_scores = (prediction - target) / prediction.shape[0] - loss = (d_scores**2).sum() + loss = (d_scores ** 2).sum() return loss, d_scores - def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, - **kwargs): + def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): """Allocate models, pre-process training data and acquire an optimizer. @@ -566,7 +336,7 @@ class Tensorizer(Pipe): """ if pipeline is not None: for name, model in pipeline: - if getattr(model, 'tok2vec', None): + if getattr(model, "tok2vec", None): self.input_models.append(model.tok2vec) if self.model is True: self.model = self.Model(**self.cfg) @@ -1086,61 +856,6 @@ class ClozeMultitask(Pipe): losses[self.name] += loss -class SimilarityHook(Pipe): - """ - Experimental: A pipeline component to install a hook for supervised - similarity into `Doc` objects. Requires a `Tensorizer` to pre-process - documents. The similarity model can be any object obeying the Thinc `Model` - interface. By default, the model concatenates the elementwise mean and - elementwise max of the two tensors, and compares them using the - Cauchy-like similarity function from Chen (2013): - - >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum()) - - Where W is a vector of dimension weights, initialized to 1. - """ - name = 'similarity' - - def __init__(self, vocab, model=True, **cfg): - self.vocab = vocab - self.model = model - self.cfg = dict(cfg) - - @classmethod - def Model(cls, length): - return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length)) - - def __call__(self, doc): - """Install similarity hook""" - doc.user_hooks['similarity'] = self.predict - return doc - - def pipe(self, docs, **kwargs): - for doc in docs: - yield self(doc) - - def predict(self, doc1, doc2): - self.require_model() - return self.model.predict([(doc1, doc2)]) - - def update(self, doc1_doc2, golds, sgd=None, drop=0.): - self.require_model() - sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop) - - def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs): - """Allocate model, using width from tensorizer in pipeline. - - gold_tuples (iterable): Gold-standard training data. - pipeline (list): The pipeline the model is part of. - """ - if self.model is True: - self.model = self.Model(pipeline[0].model.nO) - link_vectors_to_models(self.vocab) - if sgd is None: - sgd = self.create_optimizer() - return sgd - - class TextCategorizer(Pipe): name = 'textcat' @@ -1299,13 +1014,12 @@ cdef class DependencyParser(Parser): cdef class EntityRecognizer(Parser): - name = 'ner' + name = "ner" TransitionSystem = BiluoPushDown - nr_feature = 6 def add_multitask_objective(self, target): - if target == 'cloze': + if target == "cloze": cloze = ClozeMultitask(self.vocab) self._multitasks.append(cloze) else: @@ -1326,8 +1040,8 @@ cdef class EntityRecognizer(Parser): def labels(self): # Get the labels from the model by looking at the available moves, e.g. # B-PERSON, I-PERSON, L-PERSON, U-PERSON - return [move.split('-')[1] for move in self.move_names - if move[0] in ('B', 'I', 'L', 'U')] + return [move.split("-")[1] for move in self.move_names + if move[0] in ("B", "I", "L", "U")] __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']