displaCy Spans Vertical Alignment Fix 2 (#11092)

* add in span render slot fix
* fix spacing off by one
* rm demo
* adjust comments
* fix whitespace and overlap issue
2022-07-08 13:20:13 -04:00 · 2022-07-08 13:20:13 -04:00 · 36cb2029a9
parent dc38a0f079
commit 36cb2029a9
1 changed files with 53 additions and 8 deletions
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@ -130,26 +130,56 @@ class SpanRenderer:
        title (str / None): Document title set in Doc.user_data['title'].
        """
        per_token_info = []
        # Sort the spans so that we can correctly determine when they need to "stack".
        # The order is by start token, then by span length (longer spans on top),
        # with any remaining ties broken by the span label.
        spans = sorted(
            spans,
            key=lambda s: (
                s["start_token"],
                -(s["end_token"] - s["start_token"]),
                s["label"],
            ),
        )
        for s in spans:
            # This is the vertical 'slot' that the span will be rendered in:
            # vertical_position = top_offset + (offset_step * (slot - 1))
            s["render_slot"] = 0
        for idx, token in enumerate(tokens):
            # Identify if a token belongs to a Span (and which) and if it's a
            # start token of said Span. We'll use this for the final HTML render
            token_markup: Dict[str, Any] = {}
            token_markup["text"] = token
            concurrent_spans = 0
            entities = []
            for span in spans:
                ent = {}
                if span["start_token"] <= idx < span["end_token"]:
                    concurrent_spans += 1
                    span_start = idx == span["start_token"]
                    ent["label"] = span["label"]
-                    ent["is_start"] = True if idx == span["start_token"] else False
+                    ent["is_start"] = span_start
                    if span_start:
                        # When the span starts, we need to know how many other
                        # spans are on the 'span stack' and will be rendered.
                        # This value becomes the vertical render slot for this entire span
                        span["render_slot"] = concurrent_spans
                    ent["render_slot"] = span["render_slot"]
                    kb_id = span.get("kb_id", "")
                    kb_url = span.get("kb_url", "#")
                    ent["kb_link"] = (
                        TPL_KB_LINK.format(kb_id=kb_id, kb_url=kb_url) if kb_id else ""
                    )
                    entities.append(ent)
                else:
                    # We don't specifically need to do this since we loop
                    # over tokens and spans sorted by their start_token,
                    # so we'll never use a span again after the last token it appears in,
                    # but if we were to use these spans again we'd want to make sure
                    # this value was reset correctly.
                    span["render_slot"] = 0
            token_markup["entities"] = entities
            per_token_info.append(token_markup)
        markup = self._render_markup(per_token_info)
        markup = TPL_SPANS.format(content=markup, dir=self.direction)
        if title:
@ -160,8 +190,12 @@ class SpanRenderer:
        """Render the markup from per-token information"""
        markup = ""
        for token in per_token_info:
-            entities = sorted(token["entities"], key=lambda d: d["label"])
+            entities = sorted(token["entities"], key=lambda d: d["render_slot"])
-            if entities:
+            # Whitespace tokens disrupt the vertical space (no line height) so that the
            # span indicators get misaligned. We don't render them as individual
            # tokens anyway, so we'll just not display a span indicator either.
            is_whitespace = token["text"].strip() == ""
            if entities and not is_whitespace:
                slices = self._get_span_slices(token["entities"])
                starts = self._get_span_starts(token["entities"])
                total_height = (
@ -182,10 +216,18 @@ class SpanRenderer:
    def _get_span_slices(self, entities: List[Dict]) -> str:
        """Get the rendered markup of all Span slices"""
        span_slices = []
-        for entity, step in zip(entities, itertools.count(step=self.offset_step)):
+        for entity in entities:
            # rather than iterate over multiples of offset_step, we use entity['render_slot']
            # to determine the vertical position, since that tells where
            # the span starts vertically so we can extend it horizontally,
            # past other spans that might have already ended
            color = self.colors.get(entity["label"].upper(), self.default_color)
            top_offset = self.top_offset + (
                self.offset_step * (entity["render_slot"] - 1)
            )
            span_slice = self.span_slice_template.format(
-                bg=color, top_offset=self.top_offset + step
+                bg=color,
                top_offset=top_offset,
            )
            span_slices.append(span_slice)
        return "".join(span_slices)
@ -193,12 +235,15 @@ class SpanRenderer:
    def _get_span_starts(self, entities: List[Dict]) -> str:
        """Get the rendered markup of all Span start tokens"""
        span_starts = []
-        for entity, step in zip(entities, itertools.count(step=self.offset_step)):
+        for entity in entities:
            color = self.colors.get(entity["label"].upper(), self.default_color)
            top_offset = self.top_offset + (
                self.offset_step * (entity["render_slot"] - 1)
            )
            span_start = (
                self.span_start_template.format(
                    bg=color,
-                    top_offset=self.top_offset + step,
+                    top_offset=top_offset,
                    label=entity["label"],
                    kb_link=entity["kb_link"],
                )