Add tornado.util.u and a fixer script to start using it.

2013-01-13 18:10:01 -05:00 · 2013-01-13 18:10:01 -05:00 · 8b40fafec7
parent 5cb63c9ca8
commit 8b40fafec7
3 changed files with 35 additions and 1 deletions
--- a/maint/scripts/custom_fixers/fix_unicode_literal.py
+++ b/maint/scripts/custom_fixers/fix_unicode_literal.py
@ -0,0 +1,17 @@
+import re
+from lib2to3.pgen2 import token
+from lib2to3 import fixer_base
+from lib2to3.fixer_util import Name, Call
+
+_literal_re = re.compile(ur"[uU][rR]?[\'\"]")
+
+class FixUnicodeLiteral(fixer_base.BaseFix):  # lib2to3 fixer: rewrites u'...' literals as u('...') calls
+    BM_compatible = True
+    PATTERN = """STRING"""  # visit string-literal tokens; transform() keeps only the u-prefixed ones
+
+    def transform(self, node, results):
+        if node.type == token.STRING and _literal_re.match(node.value):
+            new = node.clone()
+            new.value = new.value[1:]  # drop the leading u/U; an r/R that follows it is kept
+            new.prefix = ''  # the original prefix is attached to the u name instead
+            node.replace(Call(Name(u'u', prefix=node.prefix), [new]))
--- a/tornado/test/util_test.py
+++ b/tornado/test/util_test.py
@ -1,7 +1,9 @@
+# coding: utf-8
 from __future__ import absolute_import, division, with_statement
 import sys

-from tornado.util import raise_exc_info, Configurable
+from tornado.escape import utf8
+from tornado.util import raise_exc_info, Configurable, u, b
 from tornado.test.util import unittest


@ -112,3 +114,8 @@ class ConfigurableTest(unittest.TestCase):
        # args bound in configure don't apply when using the subclass directly
        obj = TestConfig2()
        self.assertIs(obj.b, None)
+
+
+class UnicodeLiteralTest(unittest.TestCase):  # exercises u() on a literal containing a \u escape
+    def test_unicode_escapes(self):
+        self.assertEqual(utf8(u('\u00e9')), b('\xc3\xa9'))  # U+00E9 is the byte pair C3 A9 in UTF-8
--- a/tornado/util.py
+++ b/tornado/util.py
@ -68,13 +68,23 @@ def import_object(name):
 # to convert our string literals.  b() should only be applied to literal
 # latin1 strings.  Once we drop support for 2.5, we can remove this function
 # and just use byte literals.
+#
+# Fake unicode literal support:  Python 3.2 doesn't have the u'' marker for
+# literal strings, and alternative solutions like "from __future__ import
+# unicode_literals" have other problems (see PEP 414).  u() can be applied
+# to ascii strings that include \u escapes (but they must not contain
+# literal non-ascii characters).
 if str is unicode:
    def b(s):
        return s.encode('latin1')
+    def u(s):  # str is already the unicode type in this branch
+        return s  # so the literal is handed back unchanged
    bytes_type = bytes
 else:
    def b(s):
        return s
+    def u(s):  # str is not the unicode type in this branch
+        return s.decode('unicode_escape')  # decode to unicode, expanding \u escapes
    bytes_type = str