Merge pull request #5485 from adrianeboyd/bugfix/retokenizer-merge-0-length-5450

Disallow merging 0-length spans
This commit is contained in:
Matthew Honnibal 2020-05-22 13:28:35 +02:00 committed by GitHub
commit 8cb16c7120
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 10 additions and 0 deletions

View File

@@ -567,6 +567,7 @@ class Errors(object):
     E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
     E198 = ("Unable to return {n} most similar vectors for the current vectors "
             "table, which contains {n_rows} vectors.")
+    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
 
 
 @add_codes

View File

@@ -425,3 +425,10 @@ def test_retokenize_skip_duplicates(en_vocab):
         retokenizer.merge(doc[0:2])
     assert len(doc) == 2
     assert doc[0].text == "hello world"
+
+
+def test_retokenize_disallow_zero_length(en_vocab):
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.merge(doc[1:1])

View File

@@ -55,6 +55,8 @@ cdef class Retokenizer:
         """
         if (span.start, span.end) in self._spans_to_merge:
             return
+        if span.end - span.start <= 0:
+            raise ValueError(Errors.E199.format(start=span.start, end=span.end))
         for token in span:
             if token.i in self.tokens_to_merge:
                 raise ValueError(Errors.E102.format(token=repr(token)))