2018-12-02 03:26:26 +00:00
|
|
|
"""Example of adding a pipeline component to prohibit sentence boundaries
|
2018-08-05 11:33:52 +00:00
|
|
|
before certain tokens.
|
|
|
|
|
|
|
|
What we do is write to the token.is_sent_start attribute, which
|
|
|
|
takes values in {True, False, None}. The default value None allows the parser
|
|
|
|
to predict sentence segments. The value False prohibits the parser from inserting
|
|
|
|
a sentence boundary before that token. Note that fixing the sentence segmentation
|
|
|
|
should also improve the parse quality.
|
|
|
|
|
|
|
|
The specific example here is drawn from https://github.com/explosion/spaCy/issues/2627
|
|
|
|
Other versions of the model may not make the original mistake, so the specific
|
|
|
|
example might not be apt for future versions.
|
2019-03-16 13:15:49 +00:00
|
|
|
|
|
|
|
Compatible with: spaCy v2.0.0+
|
|
|
|
Last tested with: v2.1.0
|
2018-12-02 03:26:26 +00:00
|
|
|
"""
|
2018-08-05 11:33:52 +00:00
|
|
|
import plac
|
|
|
|
import spacy
|
|
|
|
|
2018-12-02 03:26:26 +00:00
|
|
|
|
2018-08-05 11:33:52 +00:00
|
|
|
def prevent_sentence_boundaries(doc):
    """Pipeline component that forbids sentence breaks before some tokens.

    Each token that ``can_be_sentence_start`` rejects gets its
    ``is_sent_start`` attribute set to False. All other tokens are left
    untouched.

    doc: The document to annotate; it is modified in place.
    RETURNS: The same ``doc`` object, so the function can be used as a
        pipeline component.
    """
    # Lazy generator: each token is checked right before it is updated,
    # in document order.
    blocked = (tok for tok in doc if not can_be_sentence_start(tok))
    for tok in blocked:
        tok.is_sent_start = False
    return doc
|
|
|
|
|
2018-12-02 03:26:26 +00:00
|
|
|
|
2018-08-05 11:33:52 +00:00
|
|
|
def can_be_sentence_start(token):
    """Decide whether a sentence boundary may be placed before ``token``.

    The first token of a document can always start a sentence. Any other
    token qualifies only if the token immediately before it is punctuation
    or whitespace.

    token: The token to inspect. It must expose ``i`` (its index in the
        document) and ``nbor(-1)`` (the preceding token, which in turn
        exposes ``is_punct`` and ``is_space``).
    RETURNS (bool): True if the token may begin a sentence, False otherwise.
    """
    if token.i == 0:
        # There is no previous token to look at, so never call nbor(-1) here.
        return True
    # Titlecasing (token.is_title) is deliberately NOT treated as a cue, so
    # that arbitrary titlecased tokens within a sentence are ignored.
    previous = token.nbor(-1)
    return bool(previous.is_punct or previous.is_space)
|
|
|
|
|
2018-12-02 03:26:26 +00:00
|
|
|
|
2019-03-16 13:15:49 +00:00
|
|
|
@plac.annotations(
    text=("The raw text to process", "positional", None, str),
    spacy_model=("spaCy model to use (with a parser)", "option", "m", str),
)
def main(text="Been here And I'm loving it.", spacy_model="en_core_web_lg"):
    """Show sentence segmentation before and after adding the component.

    Loads the requested model, prints the sentences it finds in ``text``,
    then registers ``prevent_sentence_boundaries`` ahead of the parser and
    prints the sentences found on a second pass over the same text.

    text: The raw text to process.
    spacy_model: Name of the spaCy model to load (it needs a parser).
    """
    print("Using spaCy model '{}'".format(spacy_model))
    print("Processing text '{}'".format(text))
    nlp = spacy.load(spacy_model)

    def split_sentences():
        # Re-run the current pipeline on the text and collect the stripped
        # sentence strings.
        return [span.text.strip() for span in nlp(text).sents]

    print("Before:", split_sentences())

    # The component has to run before the parser so that the parser sees
    # the is_sent_start values it sets.
    nlp.add_pipe(prevent_sentence_boundaries, before="parser")

    print("After:", split_sentences())
|
2018-12-02 03:26:26 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run main() through plac so that its annotated parameters are filled
    # from the command line when the file is executed as a script.
    plac.call(main)
|