From 598ffdace41c4ed96eceb98cfcd27388d2f11657 Mon Sep 17 00:00:00 2001
From: szweep <szweep@gmail.com>
Date: Thu, 23 Apr 2015 14:33:17 -0400
Subject: [PATCH] Modify xhtml_unescape to handle hex numeric refs

While the existing code handles decimal references like &#32; properly,
it doesn't handle the valid case where the number is specified
in hex. For example, 'foo&#32;bar', 'foo&#x20;bar' and 'foo&#X20;bar'
should all decode to 'foo bar'. The changes in escape.py check
for a leading 'x' or 'X' and decode the remainder as base 16. Also
added unit tests for unescaping strings with numeric references.
---
 tornado/escape.py           |  5 ++++-
 tornado/test/escape_test.py | 13 +++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tornado/escape.py b/tornado/escape.py
index 2852cf51..2f04b468 100644
--- a/tornado/escape.py
+++ b/tornado/escape.py
@@ -378,7 +378,10 @@ def linkify(text, shorten=False, extra_params="",
 def _convert_entity(m):
     if m.group(1) == "#":
         try:
-            return unichr(int(m.group(2)))
+            if m.group(2)[:1].lower() == 'x':
+                return unichr(int(m.group(2)[1:], 16))
+            else:
+                return unichr(int(m.group(2)))
         except ValueError:
             return "&#%s;" % m.group(2)
     try:
diff --git a/tornado/test/escape_test.py b/tornado/test/escape_test.py
index 98a23463..65765b68 100644
--- a/tornado/test/escape_test.py
+++ b/tornado/test/escape_test.py
@@ -154,6 +154,19 @@ class EscapeTestCase(unittest.TestCase):
             self.assertEqual(utf8(xhtml_escape(unescaped)), utf8(escaped))
             self.assertEqual(utf8(unescaped), utf8(xhtml_unescape(escaped)))
 
+    def test_xhtml_unescape_numeric(self):
+        tests = [
+            ('foo&#32;bar', 'foo bar'),  # decimal reference
+            ('foo&#x20;bar', 'foo bar'),  # hex reference, lowercase 'x'
+            ('foo&#X20;bar', 'foo bar'),  # hex reference, uppercase 'X'
+            ('foo&#xabc;bar', u('foo\u0abcbar')),  # hex digits a-f
+            ('foo&#xyz;bar', 'foo&#xyz;bar'),  # invalid: 'yz' are not hex digits
+            ('foo&#;bar', 'foo&#;bar'),        # invalid: no digits after '#'
+            ('foo&#x;bar', 'foo&#x;bar'),      # invalid: no hex digits after 'x'
+        ]
+        for escaped, unescaped in tests:
+            self.assertEqual(unescaped, xhtml_unescape(escaped))
+
     def test_url_escape_unicode(self):
         tests = [
             # byte strings are passed through as-is