# Provenance: this module was extracted from a git patch.
#   Commit:  f762d52b2419e1388229d64753f40fc5351ced39
#   From:    Matthew Honnibal
#   Date:    Sun, 5 Aug 2018 13:33:52 +0200
#   Subject: [PATCH] Add example for Issue #2627
#   Creates: examples/pipeline/custom_sentence_segmentation.py (mode 100644)
"""Example of adding a pipeline component to prohibit sentence boundaries
before certain tokens.

What we do is write to the token.is_sent_start attribute, which
takes values in {True, False, None}. The default value None allows the parser
to predict sentence segments. The value False prohibits the parser from
inserting a sentence boundary before that token. Note that fixing the sentence
segmentation should also improve the parse quality.

The specific example here is drawn from
https://github.com/explosion/spaCy/issues/2627
Other versions of the model may not make the original mistake, so the specific
example might not be apt for future versions.
"""
import plac
import spacy


def prevent_sentence_boundaries(doc):
    """Pipeline component that forbids sentence starts on implausible tokens.

    Every token rejected by ``can_be_sentence_start`` gets
    ``is_sent_start = False``. All other tokens are left untouched, so the
    parser remains free to decide whether a sentence starts there.

    doc: the document to annotate; it is modified in place.
    RETURNS: the same ``doc`` object, so the pipeline can continue.
    """
    for token in doc:
        if not can_be_sentence_start(token):
            token.is_sent_start = False
    return doc


def can_be_sentence_start(token):
    """Decide whether a sentence is allowed to begin at ``token``.

    A token qualifies if it is the first token of the document, if it is
    title-cased, or if the token immediately before it is punctuation or
    whitespace.

    token: the token to inspect.
    RETURNS (bool): True if a sentence boundary before ``token`` is allowed.
    """
    # The index check must come first: token 0 has no preceding token, so
    # the neighbour lookup below must never be reached for it.
    if token.i == 0 or token.is_title:
        return True
    previous = token.nbor(-1)
    return bool(previous.is_punct or previous.is_space)


def _sentence_texts(doc):
    """Return the whitespace-stripped text of each sentence in ``doc``."""
    # ``Span.text`` is used instead of the legacy ``Span.string`` accessor;
    # after ``strip()`` both yield the same string.
    return [sent.text.strip() for sent in doc.sents]


def main(model='en_core_web_lg', raw_text="Been here and I'm loving it."):
    """Print the sentence segmentation before and after adding the component.

    model: name of the spaCy model to load. Defaults to the model used in the
        original issue report.
    raw_text: the text to segment. Defaults to the sentence from the issue.
    """
    nlp = spacy.load(model)

    # Baseline: segmentation as predicted by the unmodified pipeline.
    print(_sentence_texts(nlp(raw_text)))

    # The component has to run before the parser so that the parser sees the
    # is_sent_start = False constraints when it predicts boundaries.
    # NOTE(review): passing the function object itself is the spaCy v2
    # calling convention for add_pipe; confirm against the installed version.
    nlp.add_pipe(prevent_sentence_boundaries, before='parser')
    print(_sentence_texts(nlp(raw_text)))


if __name__ == '__main__':
    plac.call(main)