diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index a3193a5cf..da8a714a7 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -61,6 +61,9 @@ def evaluate(
         "NER R": "%.2f" % scorer.ents_r,
         "NER F": "%.2f" % scorer.ents_f,
         "Textcat": "%.2f" % scorer.textcat_score,
+        "Sent P": "%.2f" % scorer.sent_p,
+        "Sent R": "%.2f" % scorer.sent_r,
+        "Sent F": "%.2f" % scorer.sent_f,
     }
     msg.table(results, title="Results")
 
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 645d1e4d4..8d37254a5 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -11,6 +11,7 @@ import srsly
 from wasabi import msg
 import contextlib
 import random
+from collections import OrderedDict
 
 from .._ml import create_default_optimizer
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
@@ -585,11 +586,13 @@ def _find_best(experiment_dir, component):
 
 def _get_metrics(component):
     if component == "parser":
-        return ("las", "uas", "token_acc")
+        return ("las", "uas", "token_acc", "sent_f")
     elif component == "tagger":
         return ("tags_acc",)
     elif component == "ner":
         return ("ents_f", "ents_p", "ents_r")
+    elif component == "sentrec":
+        return ("sent_p", "sent_r", "sent_f",)
     return ("token_acc",)
 
 
@@ -601,14 +604,17 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
             row_head.extend(["Tag Loss ", " Tag % "])
             output_stats.extend(["tag_loss", "tags_acc"])
         elif pipe == "parser":
-            row_head.extend(["Dep Loss ", " UAS ", " LAS "])
-            output_stats.extend(["dep_loss", "uas", "las"])
+            row_head.extend(["Dep Loss ", " UAS ", " LAS ", "Sent P", "Sent R", "Sent F"])
+            output_stats.extend(["dep_loss", "uas", "las", "sent_p", "sent_r", "sent_f"])
         elif pipe == "ner":
             row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
             output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
         elif pipe == "textcat":
             row_head.extend(["Textcat Loss", "Textcat"])
             output_stats.extend(["textcat_loss", "textcat_score"])
+        elif pipe == "sentrec":
+            row_head.extend(["Sentrec Loss", "Sent P", "Sent R", "Sent F"])
+            output_stats.extend(["sentrec_loss", "sent_p", "sent_r", "sent_f"])
     row_head.extend(["Token %", "CPU WPS"])
     output_stats.extend(["token_acc", "cpu_wps"])
 
@@ -618,7 +624,12 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
     if has_beam_widths:
         row_head.insert(1, "Beam W.")
-    return row_head, output_stats
+    # remove duplicates
+    row_head_dict = OrderedDict()
+    row_head_dict.update({k: 1 for k in row_head})
+    output_stats_dict = OrderedDict()
+    output_stats_dict.update({k: 1 for k in output_stats})
+    return row_head_dict.keys(), output_stats_dict.keys()
 
 
 def _get_progress(
@@ -631,6 +642,7 @@
     scores["ner_loss"] = losses.get("ner", 0.0)
     scores["tag_loss"] = losses.get("tagger", 0.0)
     scores["textcat_loss"] = losses.get("textcat", 0.0)
+    scores["sentrec_loss"] = losses.get("sentrec", 0.0)
     scores["cpu_wps"] = cpu_wps
     scores["gpu_wps"] = gpu_wps or 0.0
     scores.update(dev_scores)
diff --git a/spacy/gold.pxd b/spacy/gold.pxd
index 247ff8aa1..525aa2473 100644
--- a/spacy/gold.pxd
+++ b/spacy/gold.pxd
@@ -26,6 +26,7 @@ cdef class GoldParse:
     cdef public list words
     cdef public list tags
     cdef public list morphs
+    cdef public list sent_starts
     cdef public list heads
     cdef public list labels
     cdef public dict orths
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index f2f127438..a7c0f1d8d 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -497,9 +497,9 @@ def json_to_examples(doc):
                 ner.append(token.get("ner", "-"))
                 morphs.append(token.get("morph", {}))
                 if i == 0:
-                    sent_starts.append(True)
+                    sent_starts.append(1)
                 else:
-                    sent_starts.append(False)
+                    sent_starts.append(0)
             if "brackets" in sent:
                 brackets.extend((b["first"] + sent_start_i,
                                  b["last"] + sent_start_i, b["label"])
@@ -759,7 +759,7 @@ cdef class Example:
         t = self.token_annotation
         split_examples = []
         for i in range(len(t.words)):
-            if i > 0 and t.sent_starts[i] == True:
+            if i > 0 and t.sent_starts[i] == 1:
                 s_example.set_token_annotation(ids=s_ids,
                         words=s_words, tags=s_tags,
                         heads=s_heads, deps=s_deps, entities=s_ents, morphs=s_morphs,
@@ -892,6 +892,7 @@ cdef class GoldParse:
                    deps=token_annotation.deps,
                    entities=token_annotation.entities,
                    morphs=token_annotation.morphs,
+                   sent_starts=token_annotation.sent_starts,
                    cats=doc_annotation.cats,
                    links=doc_annotation.links,
                    make_projective=make_projective)
@@ -902,12 +903,13 @@ cdef class GoldParse:
             ids = list(range(len(self.words)))
         return TokenAnnotation(ids=ids, words=self.words, tags=self.tags,
-                               heads=self.heads, deps=self.labels, entities=self.ner,
-                               morphs=self.morphs)
+                               heads=self.heads, deps=self.labels,
+                               entities=self.ner, morphs=self.morphs,
+                               sent_starts=self.sent_starts)
 
     def __init__(self, doc, words=None, tags=None, morphs=None,
-                 heads=None, deps=None, entities=None, make_projective=False,
-                 cats=None, links=None):
+                 heads=None, deps=None, entities=None, sent_starts=None,
+                 make_projective=False, cats=None, links=None):
         """Create a GoldParse. The fields will not be initialized if len(doc)
         is zero.
 
         doc (Doc): The document the annotations refer to.
@@ -920,6 +922,8 @@ cdef class GoldParse:
         entities (iterable): A sequence of named entity annotations, either as
             BILUO tag strings, or as `(start_char, end_char, label)` tuples,
             representing the entity positions.
+        sent_starts (iterable): A sequence of sentence position tags, 1 for
+            the first word in a sentence, 0 for all others.
         cats (dict): Labels for text classification. Each key in the
             dictionary may be a string or an int, or a `(start_char, end_char,
             label)` tuple, indicating that the label is applied to only part of the
@@ -956,6 +960,8 @@ cdef class GoldParse:
                 deps = [None for _ in words]
             if not morphs:
                 morphs = [None for _ in words]
+            if not sent_starts:
+                sent_starts = [None for _ in words]
             if entities is None:
                 entities = ["-" for _ in words]
             elif len(entities) == 0:
@@ -982,6 +988,7 @@ cdef class GoldParse:
             self.labels = [None] * len(doc)
             self.ner = [None] * len(doc)
             self.morphs = [None] * len(doc)
+            self.sent_starts = [None] * len(doc)
 
             # This needs to be done before we align the words
             if make_projective and heads is not None and deps is not None:
@@ -1000,7 +1007,7 @@ cdef class GoldParse:
             self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
 
             self.orig = TokenAnnotation(ids=list(range(len(words))), words=words, tags=tags,
-                    heads=heads, deps=deps, entities=entities, morphs=morphs,
+                    heads=heads, deps=deps, entities=entities, morphs=morphs, sent_starts=sent_starts,
                     brackets=[])
 
             for i, gold_i in enumerate(self.cand_to_gold):
@@ -1011,11 +1018,13 @@ cdef class GoldParse:
                     self.labels[i] = None
                     self.ner[i] = None
                     self.morphs[i] = set()
+                    self.sent_starts[i] = 0
                 if gold_i is None:
                     if i in i2j_multi:
                         self.words[i] = words[i2j_multi[i]]
                         self.tags[i] = tags[i2j_multi[i]]
                         self.morphs[i] = morphs[i2j_multi[i]]
+                        self.sent_starts[i] = sent_starts[i2j_multi[i]]
                         is_last = i2j_multi[i] != i2j_multi.get(i+1)
                         is_first = i2j_multi[i] != i2j_multi.get(i-1)
                         # Set next word in multi-token span as head, until last
@@ -1055,6 +1064,7 @@ cdef class GoldParse:
                     self.words[i] = words[gold_i]
                     self.tags[i] = tags[gold_i]
                     self.morphs[i] = morphs[gold_i]
+                    self.sent_starts[i] = sent_starts[gold_i]
                     if heads[gold_i] is None:
                         self.heads[i] = None
                     else:
@@ -1091,21 +1101,6 @@ cdef class GoldParse:
         """
         return not nonproj.is_nonproj_tree(self.heads)
 
-    property sent_starts:
-        def __get__(self):
-            return [self.c.sent_start[i] for i in range(self.length)]
-
-        def __set__(self, sent_starts):
-            for gold_i, is_sent_start in enumerate(sent_starts):
-                i = self.gold_to_cand[gold_i]
-                if i is not None:
-                    if is_sent_start in (1, True):
-                        self.c.sent_start[i] = 1
-                    elif is_sent_start in (-1, False):
-                        self.c.sent_start[i] = -1
-                    else:
-                        self.c.sent_start[i] = 0
-
 
 def docs_to_json(docs, id=0):
     """Convert a list of Doc objects into the JSON-serializable format used by
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2f30fbbee..de8403152 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
 from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
+from .pipes import SentenceRecognizer
 from .morphologizer import Morphologizer
 from .entityruler import EntityRuler
 from .hooks import SentenceSegmenter, SimilarityHook
@@ -20,6 +21,7 @@ __all__ = [
     "EntityRuler",
     "Sentencizer",
     "SentenceSegmenter",
+    "SentenceRecognizer",
     "SimilarityHook",
     "merge_entities",
     "merge_noun_chunks",
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 56a00e33b..110839acd 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -705,6 +705,169 @@ class Tagger(Pipe):
         return self
 
 
+@component("sentrec", assigns=["token.is_sent_start"])
+class SentenceRecognizer(Tagger):
+    """Pipeline component for sentence segmentation.
+
+    DOCS: https://spacy.io/api/sentencerecognizer
+    """
+
+    def __init__(self, vocab, model=True, **cfg):
+        self.vocab = vocab
+        self.model = model
+        self._rehearsal_model = None
+        self.cfg = OrderedDict(sorted(cfg.items()))
+        self.cfg.setdefault("cnn_maxout_pieces", 2)
+        self.cfg.setdefault("subword_features", True)
+        self.cfg.setdefault("token_vector_width", 12)
+        self.cfg.setdefault("conv_depth", 1)
+        self.cfg.setdefault("pretrained_vectors", None)
+
+    @property
+    def labels(self):
+        # labels are numbered by index internally, so this matches GoldParse
+        # and Example where the sentence-initial tag is 1 and other positions
+        # are 0
+        return tuple(["I", "S"])
+
+    def set_annotations(self, docs, batch_tag_ids, **_):
+        if isinstance(docs, Doc):
+            docs = [docs]
+        cdef Doc doc
+        for i, doc in enumerate(docs):
+            doc_tag_ids = batch_tag_ids[i]
+            if hasattr(doc_tag_ids, "get"):
+                doc_tag_ids = doc_tag_ids.get()
+            for j, tag_id in enumerate(doc_tag_ids):
+                # Don't clobber existing sentence boundaries
+                if doc.c[j].sent_start == 0:
+                    if tag_id == 1:
+                        doc.c[j].sent_start = 1
+                    else:
+                        doc.c[j].sent_start = -1
+
+    def update(self, examples, drop=0., sgd=None, losses=None):
+        self.require_model()
+        examples = Example.to_example_objects(examples)
+        if losses is not None and self.name not in losses:
+            losses[self.name] = 0.
+
+        if not any(len(ex.doc) if ex.doc else 0 for ex in examples):
+            # Handle cases where there are no tokens in any docs.
+            return
+
+        tag_scores, bp_tag_scores = self.model.begin_update([ex.doc for ex in examples], drop=drop)
+        loss, d_tag_scores = self.get_loss(examples, tag_scores)
+        bp_tag_scores(d_tag_scores, sgd=sgd)
+
+        if losses is not None:
+            losses[self.name] += loss
+
+    def get_loss(self, examples, scores):
+        scores = self.model.ops.flatten(scores)
+        tag_index = range(len(self.labels))
+        cdef int idx = 0
+        correct = numpy.zeros((scores.shape[0],), dtype="i")
+        guesses = scores.argmax(axis=1)
+        known_labels = numpy.ones((scores.shape[0], 1), dtype="f")
+        for ex in examples:
+            gold = ex.gold
+            for sent_start in gold.sent_starts:
+                if sent_start is None:
+                    correct[idx] = guesses[idx]
+                elif sent_start in tag_index:
+                    correct[idx] = sent_start
+                else:
+                    correct[idx] = 0
+                    known_labels[idx] = 0.
+                idx += 1
+        correct = self.model.ops.xp.array(correct, dtype="i")
+        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+        d_scores *= self.model.ops.asarray(known_labels)
+        loss = (d_scores**2).sum()
+        docs = [ex.doc for ex in examples]
+        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+        return float(loss), d_scores
+
+    def begin_training(self, get_examples=lambda: [], pipeline=None, sgd=None,
+                       **kwargs):
+        cdef Vocab vocab = self.vocab
+        if self.model is True:
+            for hp in ["token_vector_width", "conv_depth"]:
+                if hp in kwargs:
+                    self.cfg[hp] = kwargs[hp]
+            self.model = self.Model(len(self.labels), **self.cfg)
+        if sgd is None:
+            sgd = self.create_optimizer()
+        return sgd
+
+    @classmethod
+    def Model(cls, n_tags, **cfg):
+        return build_tagger_model(n_tags, **cfg)
+
+    def add_label(self, label, values=None):
+        raise NotImplementedError
+
+    def use_params(self, params):
+        with self.model.use_params(params):
+            yield
+
+    def to_bytes(self, exclude=tuple(), **kwargs):
+        serialize = OrderedDict()
+        if self.model not in (None, True, False):
+            serialize["model"] = self.model.to_bytes
+        serialize["vocab"] = self.vocab.to_bytes
+        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
+        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+        return util.to_bytes(serialize, exclude)
+
+    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
+        def load_model(b):
+            if self.model is True:
+                self.model = self.Model(len(self.labels), **self.cfg)
+            try:
+                self.model.from_bytes(b)
+            except AttributeError:
+                raise ValueError(Errors.E149)
+
+        deserialize = OrderedDict((
+            ("vocab", lambda b: self.vocab.from_bytes(b)),
+            ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
+            ("model", lambda b: load_model(b)),
+        ))
+        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+        util.from_bytes(bytes_data, deserialize, exclude)
+        return self
+
+    def to_disk(self, path, exclude=tuple(), **kwargs):
+        serialize = OrderedDict((
+            ("vocab", lambda p: self.vocab.to_disk(p)),
+            ("model", lambda p: p.open("wb").write(self.model.to_bytes())),
+            ("cfg", lambda p: srsly.write_json(p, self.cfg))
+        ))
+        exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
+        util.to_disk(path, serialize, exclude)
+
+    def from_disk(self, path, exclude=tuple(), **kwargs):
+        def load_model(p):
+            if self.model is True:
+                self.model = self.Model(len(self.labels), **self.cfg)
+            with p.open("rb") as file_:
+                try:
+                    self.model.from_bytes(file_.read())
+                except AttributeError:
+                    raise ValueError(Errors.E149)
+
+        deserialize = OrderedDict((
+            ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
+            ("vocab", lambda p: self.vocab.from_disk(p)),
+            ("model", load_model),
+        ))
+        exclude = util.get_serialization_exclude(deserialize, exclude, kwargs)
+        util.from_disk(path, deserialize, exclude)
+        return self
+
+
 @component("nn_labeller")
 class MultitaskObjective(Tagger):
     """Experimental: Assist training of a parser or tagger, by training a
@@ -1589,4 +1752,4 @@
 Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
 Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
 
-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer", "SentenceRecognizer"]
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 723259acd..d2878da1a 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -84,6 +84,7 @@ class Scorer(object):
         self.labelled = PRFScore()
         self.labelled_per_dep = dict()
         self.tags = PRFScore()
+        self.sent_starts = PRFScore()
        self.ner = PRFScore()
         self.ner_per_ents = dict()
         self.eval_punct = eval_punct
@@ -113,6 +114,27 @@ class Scorer(object):
         """
         return self.tags.fscore * 100
 
+    @property
+    def sent_p(self):
+        """RETURNS (float): Precision for identification of sentence starts
+        (i.e. `Token.is_sent_start`).
+        """
+        return self.sent_starts.precision * 100
+
+    @property
+    def sent_r(self):
+        """RETURNS (float): Recall for identification of sentence starts
+        (i.e. `Token.is_sent_start`).
+        """
+        return self.sent_starts.recall * 100
+
+    @property
+    def sent_f(self):
+        """RETURNS (float): F-score for identification of sentence starts
+        (i.e. `Token.is_sent_start`).
+        """
+        return self.sent_starts.fscore * 100
+
     @property
     def token_acc(self):
         """RETURNS (float): Tokenization accuracy."""
@@ -212,6 +234,9 @@ class Scorer(object):
             "ents_f": self.ents_f,
             "ents_per_type": self.ents_per_type,
             "tags_acc": self.tags_acc,
+            "sent_p": self.sent_p,
+            "sent_r": self.sent_r,
+            "sent_f": self.sent_f,
             "token_acc": self.token_acc,
             "textcat_score": self.textcat_score,
             "textcats_per_cat": self.textcats_per_cat,
@@ -242,9 +267,12 @@ class Scorer(object):
         gold_deps = set()
         gold_deps_per_dep = {}
         gold_tags = set()
+        gold_sent_starts = set()
         gold_ents = set(tags_to_entities(orig.entities))
-        for id_, tag, head, dep in zip(orig.ids, orig.tags, orig.heads, orig.deps):
+        for id_, tag, head, dep, sent_start in zip(orig.ids, orig.tags, orig.heads, orig.deps, orig.sent_starts):
             gold_tags.add((id_, tag))
+            if sent_start:
+                gold_sent_starts.add(id_)
             if dep not in (None, "") and dep.lower() not in punct_labels:
                 gold_deps.add((id_, head, dep.lower()))
                 if dep.lower() not in self.labelled_per_dep:
@@ -255,6 +283,7 @@ class Scorer(object):
         cand_deps = set()
         cand_deps_per_dep = {}
         cand_tags = set()
+        cand_sent_starts = set()
         for token in doc:
             if token.orth_.isspace():
                 continue
@@ -264,6 +293,8 @@ class Scorer(object):
             else:
                 self.tokens.tp += 1
                 cand_tags.add((gold_i, token.tag_))
+                if token.is_sent_start:
+                    cand_sent_starts.add(gold_i)
             if token.dep_.lower() not in punct_labels and token.orth_.strip():
                 gold_head = gold.cand_to_gold[token.head.i]
                 # None is indistinct, so we can't just add it to the set
@@ -308,6 +339,7 @@ class Scorer(object):
         # Score for all ents
         self.ner.score_set(cand_ents, gold_ents)
         self.tags.score_set(cand_tags, gold_tags)
+        self.sent_starts.score_set(cand_sent_starts, gold_sent_starts)
         self.labelled.score_set(cand_deps, gold_deps)
         for dep in self.labelled_per_dep:
             self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py
index efa7ef625..797fa95f8 100644
--- a/spacy/tests/serialize/test_serialize_pipeline.py
+++ b/spacy/tests/serialize/test_serialize_pipeline.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 import pytest
 from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer
-from spacy.pipeline import Tensorizer, TextCategorizer
+from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer
 
 from ..util import make_tempdir
 
@@ -144,3 +144,10 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
         parser.to_bytes(cfg=False, exclude=["vocab"])
     with pytest.raises(ValueError):
         get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False)
+
+
+def test_serialize_sentencerecognizer(en_vocab):
+    sr = SentenceRecognizer(en_vocab)
+    sr_b = sr.to_bytes()
+    sr_d = SentenceRecognizer(en_vocab).from_bytes(sr_b)
+    assert sr.to_bytes() == sr_d.to_bytes()
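
The new component writes its predictions into `Token.is_sent_start` without clobbering boundaries that an earlier component (for example the parser or the Sentencizer) has already set. A minimal pure-Python sketch of that rule, mirroring the logic of `SentenceRecognizer.set_annotations` on plain lists; the function name and example values are illustrative only, not part of the diff:

def apply_sentrec_predictions(existing_sent_starts, predicted_tag_ids):
    """Mirror of SentenceRecognizer.set_annotations on plain lists.

    existing_sent_starts: current Token.sent_start values (0 = unset,
        1 = sentence start, -1 = not a start)
    predicted_tag_ids: model output per token (1 = "S"/start, 0 = "I"/other)
    """
    updated = []
    for current, tag_id in zip(existing_sent_starts, predicted_tag_ids):
        if current != 0:
            # Don't clobber boundaries set by an earlier component
            updated.append(current)
        elif tag_id == 1:
            updated.append(1)
        else:
            updated.append(-1)
    return updated

# e.g. a Sentencizer already marked token 3; the model predicts starts at 0 and 5:
# apply_sentrec_predictions([0, 0, 0, 1, 0, 0], [1, 0, 0, 0, 0, 1])
# -> [1, -1, -1, 1, -1, 1]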
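
`Scorer.sent_p`, `sent_r` and `sent_f` are set-based precision, recall and F-score over the token offsets flagged as sentence starts, computed the same way as the existing tag and dependency scores via `PRFScore`. A standalone sketch of the arithmetic in plain Python, independent of spaCy internals; the helper name is made up for illustration:

def sent_start_prf(cand_starts, gold_starts):
    """Set-based precision/recall/F-score over sentence-start token offsets.

    cand_starts: token offsets the pipeline marked as sentence-initial
    gold_starts: token offsets annotated as sentence-initial in the gold data
    """
    cand, gold = set(cand_starts), set(gold_starts)
    tp = len(cand & gold)   # predicted starts that match the gold data
    fp = len(cand - gold)   # predicted starts with no gold counterpart
    fn = len(gold - cand)   # gold starts the pipeline missed
    p = tp / (tp + fp) if tp + fp else 0.0
    r = tp / (tp + fn) if tp + fn else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    # Scorer exposes these as percentages (sent_p, sent_r, sent_f)
    return p * 100, r * 100, f * 100

# Gold sentences start at tokens 0 and 7, the model predicts 0 and 5:
# sent_start_prf([0, 5], [0, 7]) -> (50.0, 50.0, 50.0)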
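
A rough end-to-end sketch of using the new component together with the `sent_starts` annotation added to `GoldParse`. This assumes the `@component("sentrec")` decorator registers a `"sentrec"` factory and that passing `(doc, gold)` tuples to `nlp.update` is still accepted on this branch via `Example.to_example_objects`; the texts, labels and iteration count are arbitrary:

import random
import spacy
from spacy.gold import GoldParse

nlp = spacy.blank("en")
sentrec = nlp.create_pipe("sentrec")   # assumes the factory name from @component("sentrec")
nlp.add_pipe(sentrec)

# 1 = first token of a sentence, 0 = every other token (see the GoldParse docstring)
train_data = [
    ("I like cats. I like dogs.", [1, 0, 0, 0, 1, 0, 0, 0]),
    ("Berlin is nice. Paris is too.", [1, 0, 0, 0, 1, 0, 0, 0]),
]

optimizer = nlp.begin_training()
for itn in range(10):
    random.shuffle(train_data)
    losses = {}
    for text, starts in train_data:
        doc = nlp.make_doc(text)
        gold = GoldParse(doc, sent_starts=starts)
        nlp.update([(doc, gold)], sgd=optimizer, losses=losses)
    print(itn, losses.get("sentrec"))

doc = nlp("I like cats. I like dogs.")
print([token.is_sent_start for token in doc])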