From 4bd2688eac26842576bab8918bbf610963e642d3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 18:52:50 +0100 Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Fix=20displaCy=20support=20for?= =?UTF-8?q?=20RTL=20languages=20(#3393)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #2091. ## Description With the new `vocab.writing_system` property introduced in #3390 (exposed via the language defaults), I was able to finally fix this (I think!). Based on the `Doc`, displaCy now detects whether it's an RTL or LTR language and adjusts the visualization accordingly. Wherever possible, I've also added `direction` and `lang` attributes. Entity visualization now looks like this: Screenshot 2019-03-11 at 16 06 51 And dependencies like this (ignore the most likely incorrect tags and dependencies): Screenshot 2019-03-11 at 16 51 59 ### Types of change enhancement, bug fix ## Checklist - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 
--- spacy/displacy/__init__.py | 12 ++++++-- spacy/displacy/render.py | 58 +++++++++++++++++++++++++++--------- spacy/displacy/templates.py | 17 +++++++---- spacy/tests/test_displacy.py | 20 +++++++++++++ 4 files changed, 86 insertions(+), 21 deletions(-) diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index 6c5509b14..fadbaaa7e 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -161,7 +161,7 @@ def parse_deps(orig_doc, options={}): "dir": "right", } ) - return {"words": words, "arcs": arcs} + return {"words": words, "arcs": arcs, "settings": get_doc_settings(orig_doc)} def parse_ents(doc, options={}): @@ -177,7 +177,8 @@ def parse_ents(doc, options={}): if not ents: user_warning(Warnings.W006) title = doc.user_data.get("title", None) if hasattr(doc, "user_data") else None - return {"text": doc.text, "ents": ents, "title": title} + settings = get_doc_settings(doc) + return {"text": doc.text, "ents": ents, "title": title, "settings": settings} def set_render_wrapper(func): @@ -195,3 +196,10 @@ def set_render_wrapper(func): if not hasattr(func, "__call__"): raise ValueError(Errors.E110.format(obj=type(func))) RENDER_WRAPPER = func + + +def get_doc_settings(doc): + return { + "lang": doc.lang_, + "direction": doc.vocab.writing_system.get("direction", "ltr"), + } diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 2b8e0c7d2..500e49989 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals import uuid -from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS -from .templates import TPL_ENT, TPL_ENTS, TPL_FIGURE, TPL_TITLE, TPL_PAGE +from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS +from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE from ..util import minify_html, escape_html +DEFAULT_LANG = "en" +DEFAULT_DIR = "ltr" + class DependencyRenderer(object): """Render dependency parses 
as SVGs.""" @@ -30,6 +33,8 @@ class DependencyRenderer(object): self.color = options.get("color", "#000000") self.bg = options.get("bg", "#ffffff") self.font = options.get("font", "Arial") + self.direction = DEFAULT_DIR + self.lang = DEFAULT_LANG def render(self, parsed, page=False, minify=False): """Render complete markup. @@ -42,13 +47,19 @@ class DependencyRenderer(object): # Create a random ID prefix to make sure parses don't receive the # same ID, even if they're identical id_prefix = uuid.uuid4().hex - rendered = [ - self.render_svg("{}-{}".format(id_prefix, i), p["words"], p["arcs"]) - for i, p in enumerate(parsed) - ] + rendered = [] + for i, p in enumerate(parsed): + if i == 0: + self.direction = p["settings"].get("direction", DEFAULT_DIR) + self.lang = p["settings"].get("lang", DEFAULT_LANG) + render_id = "{}-{}".format(id_prefix, i) + svg = self.render_svg(render_id, p["words"], p["arcs"]) + rendered.append(svg) if page: content = "".join([TPL_FIGURE.format(content=svg) for svg in rendered]) - markup = TPL_PAGE.format(content=content) + markup = TPL_PAGE.format( + content=content, lang=self.lang, dir=self.direction + ) else: markup = "".join(rendered) if minify: @@ -83,6 +94,8 @@ class DependencyRenderer(object): bg=self.bg, font=self.font, content=content, + dir=self.direction, + lang=self.lang, ) def render_word(self, text, tag, i): @@ -95,11 +108,13 @@ class DependencyRenderer(object): """ y = self.offset_y + self.word_spacing x = self.offset_x + i * self.distance + if self.direction == "rtl": + x = self.width - x html_text = escape_html(text) return TPL_DEP_WORDS.format(text=html_text, tag=tag, x=x, y=y) def render_arrow(self, label, start, end, direction, i): - """Render indivicual arrow. + """Render individual arrow. label (unicode): Dependency label. start (int): Index of start word. 
@@ -110,6 +125,8 @@ class DependencyRenderer(object): """ level = self.levels.index(end - start) + 1 x_start = self.offset_x + start * self.distance + self.arrow_spacing + if self.direction == "rtl": + x_start = self.width - x_start y = self.offset_y x_end = ( self.offset_x @@ -117,6 +134,8 @@ class DependencyRenderer(object): + start * self.distance - self.arrow_spacing * (self.highest_level - level) / 4 ) + if self.direction == "rtl": + x_end = self.width - x_end y_curve = self.offset_y - level * self.distance / 2 if self.compact: y_curve = self.offset_y - level * self.distance / 6 @@ -124,12 +143,14 @@ class DependencyRenderer(object): y_curve = -self.distance arrowhead = self.get_arrowhead(direction, x_start, y, x_end) arc = self.get_arc(x_start, y, y_curve, x_end) + label_side = "right" if self.direction == "rtl" else "left" return TPL_DEP_ARCS.format( id=self.id, i=i, stroke=self.arrow_stroke, head=arrowhead, label=label, + label_side=label_side, arc=arc, ) @@ -219,6 +240,8 @@ class EntityRenderer(object): self.default_color = "#ddd" self.colors = colors self.ents = options.get("ents", None) + self.direction = DEFAULT_DIR + self.lang = DEFAULT_LANG def render(self, parsed, page=False, minify=False): """Render complete markup. @@ -228,12 +251,15 @@ class EntityRenderer(object): minify (bool): Minify HTML markup. RETURNS (unicode): Rendered HTML markup. 
""" - rendered = [ - self.render_ents(p["text"], p["ents"], p.get("title", None)) for p in parsed - ] + rendered = [] + for i, p in enumerate(parsed): + if i == 0: + self.direction = p["settings"].get("direction", DEFAULT_DIR) + self.lang = p["settings"].get("lang", DEFAULT_LANG) + rendered.append(self.render_ents(p["text"], p["ents"], p["title"])) if page: docs = "".join([TPL_FIGURE.format(content=doc) for doc in rendered]) - markup = TPL_PAGE.format(content=docs) + markup = TPL_PAGE.format(content=docs, lang=self.lang, dir=self.direction) else: markup = "".join(rendered) if minify: @@ -261,12 +287,16 @@ class EntityRenderer(object): markup += "
" if self.ents is None or label.upper() in self.ents: color = self.colors.get(label.upper(), self.default_color) - markup += TPL_ENT.format(label=label, text=entity, bg=color) + ent_settings = {"label": label, "text": entity, "bg": color} + if self.direction == "rtl": + markup += TPL_ENT_RTL.format(**ent_settings) + else: + markup += TPL_ENT.format(**ent_settings) else: markup += entity offset = end markup += escape_html(text[offset:]) - markup = TPL_ENTS.format(content=markup, colors=self.colors) + markup = TPL_ENTS.format(content=markup, dir=self.direction) if title: markup = TPL_TITLE.format(title=title) + markup return markup diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index f0922b1e3..4a7c596d8 100644 --- a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -6,7 +6,7 @@ from __future__ import unicode_literals # Jupyter to render it properly in a cell TPL_DEP_SVG = """ -{content} +{content} """ @@ -22,7 +22,7 @@ TPL_DEP_ARCS = """ - {label} + {label} @@ -39,7 +39,7 @@ TPL_TITLE = """ TPL_ENTS = """ -
{content}
+
{content}
""" @@ -50,14 +50,21 @@ TPL_ENT = """ """ +TPL_ENT_RTL = """ + + {text} + {label} + +""" + TPL_PAGE = """ - + displaCy - {content} + {content} """ diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index a65060ea7..24e45bfc1 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import pytest from spacy import displacy from spacy.tokens import Span +from spacy.lang.fa import Persian from .util import get_doc @@ -66,3 +67,22 @@ def test_displacy_render_wrapper(en_vocab): def test_displacy_raises_for_wrong_type(en_vocab): with pytest.raises(ValueError): displacy.render("hello world") + + +def test_displacy_rtl(): + # Source: http://www.sobhe.ir/hazm/ – is this correct? + words = ["ما", "بسیار", "کتاب", "می\u200cخوانیم"] + # These are (likely) wrong, but it's just for testing + pos = ["PRO", "ADV", "N_PL", "V_SUB"] # needs to match lang.fa.tag_map + deps = ["foo", "bar", "foo", "baz"] + heads = [1, 0, 1, -2] + nlp = Persian() + doc = get_doc(nlp.vocab, words=words, pos=pos, tags=pos, heads=heads, deps=deps) + doc.ents = [Span(doc, 1, 3, label="TEST")] + html = displacy.render(doc, page=True, style="dep") + assert "direction: rtl" in html + assert 'direction="rtl"' in html + assert 'lang="{}"'.format(nlp.lang) in html + html = displacy.render(doc, page=True, style="ent") + assert "direction: rtl" in html + assert 'lang="{}"'.format(nlp.lang) in html