🐛 Escape annotated HTML tags in span renderer (#12817)

These changes add a missing call to `escape_html` in the displaCy span
renderer. Previously span-annotated tokens would be inserted into the
page markup without being escaped, resulting in potentially incorrect
rendering. When I encountered this issue, it resulted in some docs and
span underlines being superimposed on top of properly rendered docs and
span underlines near the beginning of the visualization (due to an
unescaped `<span>` tag).
This commit is contained in:
Connor Brinton 2023-07-13 11:33:05 -04:00 committed by GitHub
parent ddffd09602
commit 0566c3a166
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 2 deletions

View File

@ -1,4 +1,3 @@
import itertools
import uuid
from typing import Any, Dict, List, Optional, Tuple, Union
@ -218,7 +217,7 @@ class SpanRenderer:
+ (self.offset_step * (len(entities) - 1))
)
markup += self.span_template.format(
text=token["text"],
text=escape_html(token["text"]),
span_slices=slices,
span_starts=starts,
total_height=total_height,

View File

@ -377,3 +377,22 @@ def test_displacy_manual_sorted_entities():
html = displacy.render(doc, style="ent", manual=True)
assert html.find("FIRST") < html.find("SECOND")
@pytest.mark.issue(12816)
def test_issue12816(en_vocab) -> None:
"""Test that displaCy's span visualizer escapes annotated HTML tags correctly."""
# Create a doc containing an annotated word and an unannotated HTML tag
doc = Doc(en_vocab, words=["test", "<TEST>"])
doc.spans["sc"] = [Span(doc, 0, 1, label="test")]
# Verify that the HTML tag is escaped when unannotated
html = displacy.render(doc, style="span")
assert "&lt;TEST&gt;" in html
# Annotate the HTML tag
doc.spans["sc"].append(Span(doc, 1, 2, label="test"))
# Verify that the HTML tag is still escaped
html = displacy.render(doc, style="span")
assert "&lt;TEST&gt;" in html