From 598ffdace41c4ed96eceb98cfcd27388d2f11657 Mon Sep 17 00:00:00 2001 From: szweep Date: Thu, 23 Apr 2015 14:33:17 -0400 Subject: [PATCH] Modify xhtml_unescape to handle hex numeric refs While the existing code handles references like &#32; properly, it doesn't handle the valid case where the number is specified in hex. For example 'foo&#32;bar', 'foo&#x20;bar' and 'foo&#X20;bar' should all decode to 'foo bar'. The changes in escape.py check for the 'x' and decode appropriately. Also added unit tests for unescaping strings with numeric references. --- tornado/escape.py | 5 ++++- tornado/test/escape_test.py | 13 +++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/tornado/escape.py b/tornado/escape.py index 2852cf51..2f04b468 100644 --- a/tornado/escape.py +++ b/tornado/escape.py @@ -378,7 +378,10 @@ def linkify(text, shorten=False, extra_params="", def _convert_entity(m): if m.group(1) == "#": try: - return unichr(int(m.group(2))) + if m.group(2)[:1].lower() == 'x': + return unichr(int(m.group(2)[1:], 16)) + else: + return unichr(int(m.group(2))) except ValueError: return "&#%s;" % m.group(2) try: diff --git a/tornado/test/escape_test.py b/tornado/test/escape_test.py index 98a23463..65765b68 100644 --- a/tornado/test/escape_test.py +++ b/tornado/test/escape_test.py @@ -154,6 +154,19 @@ class EscapeTestCase(unittest.TestCase): self.assertEqual(utf8(xhtml_escape(unescaped)), utf8(escaped)) self.assertEqual(utf8(unescaped), utf8(xhtml_unescape(escaped))) + def test_xhtml_unescape_numeric(self): + tests = [ + ('foo&#32;bar', 'foo bar'), + ('foo&#x20;bar', 'foo bar'), + ('foo&#X20;bar', 'foo bar'), + ('foo&#xabc;bar', u('foo\u0abcbar')), + ('foo&#xyz;bar', 'foo&#xyz;bar'), # invalid encoding + ('foo&#;bar', 'foo&#;bar'), # invalid encoding + ('foo&#x;bar', 'foo&#x;bar'), # invalid encoding + ] + for escaped, unescaped in tests: + self.assertEqual(unescaped, xhtml_unescape(escaped)) + def test_url_escape_unicode(self): tests = [ # byte strings are passed through as-is