mirror of https://github.com/explosion/spaCy.git
Merge pull request #5148 from svlandeg/fix/empty-docbin
Fix serialization of empty doc
This commit is contained in:
commit
3944c1a65d
|
@ -0,0 +1,11 @@
|
|||
from spacy.tokens import DocBin
|
||||
|
||||
|
||||
def test_issue5141(en_vocab):
    """Check that a DocBin holding no docs survives a bytes round trip."""
    # No docs have been added, so nothing should be reconstructed.
    empty_bin = DocBin(attrs=["DEP", "HEAD"])
    docs_before = list(empty_bin.get_docs(en_vocab))
    assert docs_before == []

    # Serializing the empty container must not raise.
    payload = empty_bin.to_bytes()

    # A fresh DocBin loaded from those bytes should yield no docs either.
    restored_bin = DocBin().from_bytes(payload)
    docs_after = list(restored_bin.get_docs(en_vocab))
    assert docs_after == []
|
|
@ -135,10 +135,13 @@ class DocBin(object):
|
|||
for tokens in self.tokens:
|
||||
assert len(tokens.shape) == 2, tokens.shape # this should never happen
|
||||
lengths = [len(tokens) for tokens in self.tokens]
|
||||
tokens = numpy.vstack(self.tokens) if self.tokens else numpy.asarray([])
|
||||
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
|
||||
|
||||
msg = {
|
||||
"attrs": self.attrs,
|
||||
"tokens": numpy.vstack(self.tokens).tobytes("C"),
|
||||
"spaces": numpy.vstack(self.spaces).tobytes("C"),
|
||||
"tokens": tokens.tobytes("C"),
|
||||
"spaces": spaces.tobytes("C"),
|
||||
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
|
||||
"strings": list(self.strings),
|
||||
"cats": self.cats,
|
||||
|
|
Loading…
Reference in New Issue