From f933e1ab6fdea76973384e38ea95520de422c340 Mon Sep 17 00:00:00 2001
From: Victor Stinner
Date: Wed, 20 Oct 2010 22:58:25 +0000
Subject: [PATCH] Issue #4388: On Mac OS X, decode command line arguments from
 UTF-8, instead of the locale encoding. If the LANG (and LC_ALL and LC_CTYPE)
 environment variable is not set, the locale encoding is ISO-8859-1, whereas
 most programs (including Python) expect UTF-8. Python already uses UTF-8 for
 the filesystem encoding and to encode command line arguments on this OS.

---
 Lib/test/test_cmd_line.py |  32 ++++++++++
 Misc/NEWS                 |   6 ++
 Modules/python.c          |   8 +++
 Objects/unicodeobject.c   | 123 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 169 insertions(+)

diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py
index 73acb9fdfa9..c864cdd4bb4 100644
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@@ -148,6 +148,38 @@ def test_undecodable_code(self):
         if not stdout.startswith(pattern):
             raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
 
+    @unittest.skipUnless(sys.platform == 'darwin', 'test specific to Mac OS X')
+    def test_osx_utf8(self):
+        def check_output(text):
+            decoded = text.decode('utf8', 'surrogateescape')
+            expected = ascii(decoded).encode('ascii') + b'\n'
+
+            env = os.environ.copy()
+            # C locale gives ASCII locale encoding, but Python uses UTF-8
+            # to parse the command line arguments on Mac OS X
+            env['LC_ALL'] = 'C'
+
+            p = subprocess.Popen(
+                (sys.executable, "-c", "import sys; print(ascii(sys.argv[1]))", text),
+                stdout=subprocess.PIPE,
+                env=env)
+            stdout, stderr = p.communicate()
+            self.assertEqual(stdout, expected)
+            self.assertEqual(p.returncode, 0)
+
+        # test valid utf-8
+        text = 'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8')
+        check_output(text)
+
+        # test invalid utf-8
+        text = (
+            b'\xff'         # invalid byte
+            b'\xc3\xa9'     # valid utf-8 character
+            b'\xc3\xff'     # invalid byte sequence
+            b'\xed\xa0\x80' # lone surrogate character (invalid)
+        )
+        check_output(text)
+
     def test_unbuffered_output(self):
         # Test expected operation of the '-u' switch
         for stream in ('stdout', 'stderr'):
diff --git a/Misc/NEWS b/Misc/NEWS
index 60d546f5368..265b88147e1 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,12 @@ What's New in Python 3.2 Beta 1?
 Core and Builtins
 -----------------
 
+- Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead
+  of the locale encoding. If the LANG (and LC_ALL and LC_CTYPE) environment
+  variable is not set, the locale encoding is ISO-8859-1, whereas most programs
+  (including Python) expect UTF-8. Python already uses UTF-8 for the filesystem
+  encoding and to encode command line arguments on this OS.
+
 - Issue #9713, #10114: Parser functions (eg. PyParser_ASTFromFile) expects
   filenames encoded to the filesystem encoding with surrogateescape error
   handler (to support undecodable bytes), instead of UTF-8 in strict mode.
diff --git a/Modules/python.c b/Modules/python.c
index 47685a419a4..18f9b3dd668 100644
--- a/Modules/python.c
+++ b/Modules/python.c
@@ -15,6 +15,10 @@ wmain(int argc, wchar_t **argv)
 }
 #else
 
+#ifdef __APPLE__
+extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
+#endif
+
 int
 main(int argc, char **argv)
 {
@@ -41,7 +45,11 @@ main(int argc, char **argv)
     oldloc = strdup(setlocale(LC_ALL, NULL));
     setlocale(LC_ALL, "");
     for (i = 0; i < argc; i++) {
+#ifdef __APPLE__
+        argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i]));
+#else
         argv_copy[i] = _Py_char2wchar(argv[i], NULL);
+#endif
         if (!argv_copy[i])
             return 1;
         argv_copy2[i] = argv_copy[i];
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 7564b67a215..f5c09dd7f8e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2716,6 +2716,129 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
 
 #undef ASCII_CHAR_MASK
 
+#ifdef __APPLE__
+
+/* Simplified UTF-8 decoder using the surrogateescape error handler,
+   used to decode the command line arguments on Mac OS X.
+
+   Decode the size bytes starting at s. Return a null-terminated wide
+   character string allocated with PyMem_Malloc(), or NULL if the buffer
+   cannot be allocated. A byte that does not belong to a valid UTF-8
+   sequence is decoded to the lone surrogate 0xDC00 + byte; a non-BMP
+   character is stored as a surrogate pair if wchar_t is not 4 bytes
+   wide. */
+
+wchar_t*
+_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
+{
+    int n;
+    const char *e;
+    wchar_t *unicode, *p;
+
+    /* Note: the result never holds more wchar_t units than size, so
+       size + 1 units (with the terminator) are always enough */
+    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
+        /* NOTE(review): main() calls this function while preparing argv;
+           confirm that raising an exception is safe that early. */
+        PyErr_NoMemory();
+        return NULL;
+    }
+    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
+    if (!unicode)
+        return NULL;
+
+    /* Unpack UTF-8 encoded data */
+    p = unicode;
+    e = s + size;
+    while (s < e) {
+        Py_UCS4 ch = (unsigned char)*s;
+
+        if (ch < 0x80) {
+            *p++ = (wchar_t)ch;
+            s++;
+            continue;
+        }
+
+        n = utf8_code_length[ch];
+        if (s + n > e) {
+            goto surrogateescape;
+        }
+
+        switch (n) {
+        case 0:
+        case 1:
+            goto surrogateescape;
+
+        case 2:
+            if ((s[1] & 0xc0) != 0x80)
+                goto surrogateescape;
+            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
+            assert ((ch > 0x007F) && (ch <= 0x07FF));
+            *p++ = (wchar_t)ch;
+            break;
+
+        case 3:
+            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
+               will result in surrogates in range d800-dfff. Surrogates are
+               not valid UTF-8 so they are rejected.
+               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
+               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xE0 &&
+                 (unsigned char)s[1] < 0xA0) ||
+                ((unsigned char)s[0] == 0xED &&
+                 (unsigned char)s[1] > 0x9F)) {
+
+                goto surrogateescape;
+            }
+            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
+            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
+            *p++ = (wchar_t)ch;
+            break;
+
+        case 4:
+            if ((s[1] & 0xc0) != 0x80 ||
+                (s[2] & 0xc0) != 0x80 ||
+                (s[3] & 0xc0) != 0x80 ||
+                ((unsigned char)s[0] == 0xF0 &&
+                 (unsigned char)s[1] < 0x90) ||
+                ((unsigned char)s[0] == 0xF4 &&
+                 (unsigned char)s[1] > 0x8F)) {
+                goto surrogateescape;
+            }
+            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
+                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
+            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+
+#if SIZEOF_WCHAR_T == 4
+            *p++ = (wchar_t)ch;
+#else
+            /* compute and append the two surrogates: */
+
+            /* translate from 10000..10FFFF to 0..FFFF */
+            ch -= 0x10000;
+
+            /* high surrogate = top 10 bits added to D800 */
+            *p++ = (wchar_t)(0xD800 + (ch >> 10));
+
+            /* low surrogate = bottom 10 bits added to DC00 */
+            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
+#endif
+            break;
+        }
+        s += n;
+        continue;
+
+      surrogateescape:
+        *p++ = (wchar_t)(0xDC00 + ch);
+        s++;
+    }
+    *p = L'\0';
+    return unicode;
+}
+
+#endif /* __APPLE__ */
 
 /* Allocation strategy:  if the string is short, convert into a stack buffer
    and allocate exactly as much space needed at the end.  Else allocate the