Miscellaneous Minor SpanGroups/DocBin Improvements (#10250)

* MultiHashEmbed vector docs correction

* doc copy span test

* ignore empty lists in DocBin.span_groups

* serialized empty list const + SpanGroups.is_empty

* add conditional deserialization in from_bytes

* clean up + reorganize

* rm test

* add constant as class attribute

* rename to _EMPTY_BYTES

* Update spacy/tests/doc/test_span.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
Peter Baumgartner 2022-02-21 04:24:15 -05:00 committed by GitHub
parent f4c74764b8
commit 3358fb9bdd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 21 additions and 2 deletions

View File

@ -655,3 +655,16 @@ def test_span_sents(doc, start, end, expected_sentences, expected_sentences_with
def test_span_sents_not_parsed(doc_not_parsed):
    """Listing `Span.sents` on the `doc_not_parsed` fixture must raise ValueError."""
    with pytest.raises(ValueError):
        span = Span(doc_not_parsed, 0, 3)
        list(span.sents)
def test_span_group_copy(doc):
    """Check that `doc.copy()` carries span groups over and keeps them independent."""
    key = "test"
    doc.spans[key] = [doc[0:1], doc[2:4]]
    assert len(doc.spans[key]) == 2

    copied = doc.copy()
    # the copy must contain both spans from the original
    assert len(copied.spans[key]) == 2

    # grow the group on the original doc only
    doc.spans[key].append(doc[3:4])
    assert len(doc.spans[key]) == 3

    # the copy's group is unchanged, i.e. it does not share state with the original
    assert len(copied.spans[key]) == 2

View File

@ -6,6 +6,7 @@ import srsly
from .span_group import SpanGroup
from ..errors import Errors
if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports
from .doc import Doc # noqa: F401
@ -19,6 +20,8 @@ if TYPE_CHECKING:
class SpanGroups(UserDict):
"""A dict-like proxy held by the Doc, to control access to span groups."""
_EMPTY_BYTES = srsly.msgpack_dumps([])
def __init__(
self, doc: "Doc", items: Iterable[Tuple[str, SpanGroup]] = tuple()
) -> None:
@ -43,11 +46,13 @@ class SpanGroups(UserDict):
def to_bytes(self) -> bytes:
    """Serialize all span groups in this container to msgpack bytes.

    The groups are written as a plain list rather than a dict, because
    each group knows its own name. An empty container returns the
    class-level `_EMPTY_BYTES` constant instead of building a new payload.
    """
    if len(self) > 0:
        group_payloads = []
        for group in self.values():
            group_payloads.append(group.to_bytes())
        return srsly.msgpack_dumps(group_payloads)
    return self._EMPTY_BYTES
def from_bytes(self, bytes_data: bytes) -> "SpanGroups":
msg = srsly.msgpack_loads(bytes_data)
msg = [] if bytes_data == self._EMPTY_BYTES else srsly.msgpack_loads(bytes_data)
self.clear()
doc = self._ensure_doc()
for value_bytes in msg:

View File

@ -12,6 +12,7 @@ from ..compat import copy_reg
from ..attrs import SPACY, ORTH, intify_attr, IDS
from ..errors import Errors
from ..util import ensure_path, SimpleFrozenList
from ._dict_proxies import SpanGroups
# fmt: off
ALL_ATTRS = ("ORTH", "NORM", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "ENT_ID", "LEMMA", "MORPH", "POS", "SENT_START")
@ -146,7 +147,7 @@ class DocBin:
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) # type: ignore
doc = doc.from_array(self.attrs, tokens) # type: ignore
doc.cats = self.cats[i]
if self.span_groups[i]:
if self.span_groups[i] != SpanGroups._EMPTY_BYTES:
doc.spans.from_bytes(self.span_groups[i])
else:
doc.spans.clear()