diff --git a/maint/scripts/custom_fixers/fix_unicode_literal.py b/maint/scripts/custom_fixers/fix_unicode_literal.py
new file mode 100644
index 00000000..cfc6b3ad
--- /dev/null
+++ b/maint/scripts/custom_fixers/fix_unicode_literal.py
@@ -0,0 +1,30 @@
+"""lib2to3 fixer that rewrites u'...' string literals as u('...') calls."""
+import re
+from lib2to3.pgen2 import token
+from lib2to3 import fixer_base
+from lib2to3.fixer_util import Name, Call
+
+# Matches the start of a unicode literal: a u/U prefix, an optional r/R, then
+# the opening quote.  The pattern itself is a plain raw string because the
+# ur"" prefix is a syntax error on Python 3.
+_literal_re = re.compile(r"[uU][rR]?[\'\"]")
+
+class FixUnicodeLiteral(fixer_base.BaseFix):
+    """Wrap every u-prefixed string literal in a call to ``u()``."""
+    BM_compatible = True
+    PATTERN = """STRING"""
+
+    def transform(self, node, results):
+        """Replace a u-prefixed STRING leaf with ``u(<literal>)`` in place.
+
+        Only the leading ``u``/``U`` character is stripped, so a
+        ``ur'...'`` literal becomes ``u(r'...')``.
+        """
+        if node.type == token.STRING and _literal_re.match(node.value):
+            new = node.clone()
+            # Drop the u/U prefix character from the literal text.
+            new.value = new.value[1:]
+            # The literal's prefix is handed to the ``u`` name instead, so
+            # the call starts exactly where the literal used to.
+            new.prefix = ''
+            node.replace(Call(Name('u', prefix=node.prefix), [new]))
diff --git a/tornado/test/util_test.py b/tornado/test/util_test.py
index 581c4d81..aa1bf09b 100644
--- a/tornado/test/util_test.py
+++ b/tornado/test/util_test.py
@@ -1,7 +1,9 @@
+# coding: utf-8
 from __future__ import absolute_import, division, with_statement
 import sys
 
-from tornado.util import raise_exc_info, Configurable
+from tornado.escape import utf8
+from tornado.util import raise_exc_info, Configurable, u, b
 from tornado.test.util import unittest
 
 
@@ -112,3 +114,8 @@ class ConfigurableTest(unittest.TestCase):
         # args bound in configure don't apply when using the subclass directly
         obj = TestConfig2()
         self.assertIs(obj.b, None)
+
+
+class UnicodeLiteralTest(unittest.TestCase):
+    def test_unicode_escapes(self):
+        self.assertEqual(utf8(u('\u00e9')), b('\xc3\xa9'))
diff --git a/tornado/util.py b/tornado/util.py
index f550449a..0dda1650 100644
--- a/tornado/util.py
+++ b/tornado/util.py
@@ -68,13 +68,23 @@ def import_object(name):
 # to convert our string literals. b() should only be applied to literal
 # latin1 strings. Once we drop support for 2.5, we can remove this function
 # and just use byte literals.
+#
+# Fake unicode literal support: Python 3.2 doesn't have the u'' marker for
+# literal strings, and alternative solutions like "from __future__ import
+# unicode_literals" have other problems (see PEP 414). u() can be applied
+# to ascii strings that include \u escapes (but they must not contain
+# literal non-ascii characters).
-if str is unicode:
+if type('') is not type(''.encode('latin1')):  # True only on Python 3
     def b(s):
         return s.encode('latin1')
+    def u(s):
+        return s
     bytes_type = bytes
 else:
     def b(s):
         return s
+    def u(s):
+        return s.decode('unicode_escape')
     bytes_type = str
 
 