Merge pull request #5148 from svlandeg/fix/empty-docbin

Fix serialization of empty doc
This commit is contained in:
Ines Montani 2020-03-16 15:03:54 +01:00 committed by GitHub
commit 3944c1a65d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 2 deletions

View File

@@ -0,0 +1,11 @@
from spacy.tokens import DocBin
def test_issue5141(en_vocab):
    """Ensure an empty DocBin does not crash on serialization."""
    doc_bin = DocBin(attrs=["DEP", "HEAD"])
    # Nothing was added, so the DocBin must yield no docs.
    assert list(doc_bin.get_docs(en_vocab)) == []

    # Round-trip the empty DocBin through bytes; the restored DocBin must
    # also yield no docs.
    doc_bin_bytes = doc_bin.to_bytes()
    doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
    assert list(doc_bin_2.get_docs(en_vocab)) == []

View File

@@ -135,10 +135,13 @@ class DocBin(object):
for tokens in self.tokens: for tokens in self.tokens:
assert len(tokens.shape) == 2, tokens.shape # this should never happen assert len(tokens.shape) == 2, tokens.shape # this should never happen
lengths = [len(tokens) for tokens in self.tokens] lengths = [len(tokens) for tokens in self.tokens]
tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([])
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
msg = { msg = {
"attrs": self.attrs, "attrs": self.attrs,
"tokens": numpy.vstack(self.tokens).tobytes("C"), "tokens": tokens.tobytes("C"),
"spaces": numpy.vstack(self.spaces).tobytes("C"), "spaces": spaces.tobytes("C"),
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
"strings": list(self.strings), "strings": list(self.strings),
"cats": self.cats, "cats": self.cats,