mirror of https://github.com/explosion/spaCy.git
Re-refactor Sentencizer with Pipe API (#7176)
Reapply the refactoring (#4721) so that `Sentencizer` uses the faster `predict` and `set_annotations` for both `__call__` and `pipe`.
This commit is contained in:
parent
592678fb7d
commit
10c930cc96
|
@@ -66,26 +66,12 @@ class Sentencizer(Pipe):
         """
         error_handler = self.get_error_handler()
         try:
-            self._call(doc)
+            tags = self.predict([doc])
+            self.set_annotations([doc], tags)
             return doc
         except Exception as e:
             error_handler(self.name, self, [doc], e)
 
-    def _call(self, doc):
-        start = 0
-        seen_period = False
-        for i, token in enumerate(doc):
-            is_in_punct_chars = token.text in self.punct_chars
-            token.is_sent_start = i == 0
-            if seen_period and not token.is_punct and not is_in_punct_chars:
-                doc[start].is_sent_start = True
-                start = token.i
-                seen_period = False
-            elif is_in_punct_chars:
-                seen_period = True
-        if start < len(doc):
-            doc[start].is_sent_start = True
-
     def predict(self, docs):
         """Apply the pipe to a batch of docs, without modifying them.
 
Loading…
Reference in New Issue