Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead of

the locale encoding. If none of the LANG, LC_ALL and LC_CTYPE environment variables is set, the locale encoding is ISO-8859-1, whereas most programs (including Python) expect UTF-8. Python already uses UTF-8 for the filesystem encoding and to encode command line arguments on this OS.
2010-10-20 22:58:25 +00:00 · 2010-10-20 22:58:25 +00:00 · f933e1ab6f
parent 073f759d65
commit f933e1ab6f
4 changed files with 160 additions and 0 deletions
--- a/Lib/test/test_cmd_line.py
+++ b/Lib/test/test_cmd_line.py
@ -148,6 +148,38 @@ def test_undecodable_code(self):
        if not stdout.startswith(pattern):
            raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
    @unittest.skipUnless(sys.platform == 'darwin', 'test specific to Mac OS X')
    def test_osx_utf8(self):
        # Verify that a child interpreter decodes its command line from
        # UTF-8 with the surrogateescape handler, even under the C locale.
        # Child program: print the ASCII repr of its first argument.
        code = "import sys; print(ascii(sys.argv[1]))"
        def check_output(text):
            # What the child is expected to print for these raw bytes.
            as_str = text.decode('utf8', 'surrogateescape')
            wanted = ascii(as_str).encode('ascii') + b'\n'
            # C locale gives ASCII locale encoding, but Python uses UTF-8
            # to parse the command line arguments on Mac OS X
            child_env = os.environ.copy()
            child_env['LC_ALL'] = 'C'
            proc = subprocess.Popen(
                (sys.executable, "-c", code, text),
                stdout=subprocess.PIPE,
                env=child_env)
            out, _ = proc.communicate()
            self.assertEqual(out, wanted)
            self.assertEqual(proc.returncode, 0)
        samples = (
            # valid utf-8
            'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8'),
            # invalid utf-8
            (
                b'\xff'         # invalid byte
                b'\xc3\xa9'     # valid utf-8 character
                b'\xc3\xff'     # invalid byte sequence
                b'\xed\xa0\x80' # lone surrogate character (invalid)
            ),
        )
        for sample in samples:
            check_output(sample)
    def test_unbuffered_output(self):
        # Test expected operation of the '-u' switch
        for stream in ('stdout', 'stderr'):
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -10,6 +10,12 @@ What's New in Python 3.2 Beta 1?
 Core and Builtins
 -----------------
 - Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead
  of the locale encoding. If none of the LANG, LC_ALL and LC_CTYPE environment
  variables is set, the locale encoding is ISO-8859-1, whereas most programs
  (including Python) expect UTF-8. Python already uses UTF-8 for the filesystem
  encoding and to encode command line arguments on this OS.
 - Issue #9713, #10114: Parser functions (e.g. PyParser_ASTFromFile) expect
  filenames encoded to the filesystem encoding with surrogateescape error
  handler (to support undecodable bytes), instead of UTF-8 in strict mode.
--- a/Modules/python.c
+++ b/Modules/python.c
@ -15,6 +15,10 @@ wmain(int argc, wchar_t **argv)
 }
 #else
 #ifdef __APPLE__
 extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
 #endif
 int
 main(int argc, char **argv)
 {
@ -41,7 +45,11 @@ main(int argc, char **argv)
    oldloc = strdup(setlocale(LC_ALL, NULL));
    setlocale(LC_ALL, "");
    for (i = 0; i < argc; i++) {
 #ifdef __APPLE__
        argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i]));
 #else
        argv_copy[i] = _Py_char2wchar(argv[i], NULL);
 #endif
        if (!argv_copy[i])
            return 1;
        argv_copy2[i] = argv_copy[i];
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -2716,6 +2716,120 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
 #undef ASCII_CHAR_MASK
 #ifdef __APPLE__
 /* Simplified UTF-8 decoder using surrogateescape error handler,
    used to decode the command line arguments on Mac OS X.

    Decodes the `size` bytes starting at `s` into a newly allocated,
    NUL-terminated wchar_t string.  Every byte that is not part of a
    well-formed UTF-8 sequence is emitted as the single code unit
    0xDC00 + byte, and decoding resumes at the following byte.

    Returns the buffer obtained from PyMem_Malloc(), or NULL if the size
    computation would overflow or the allocation fails.
    (Presumably the caller releases the buffer with PyMem_Free() --
    confirm against the call sites.) */
 wchar_t*
 _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
 {
    int n;
    const char *e;
    wchar_t *unicode, *p;
    /* Note: size is an upper bound on the number of wchar_t units
       produced: each input byte yields at most one unit, and a 4-byte
       sequence yields at most two.  One extra slot holds the terminator. */
    /* NOTE(review): this overflow path sets an exception through
       PyErr_NoMemory(), while the allocation failure below returns NULL
       without setting one; the visible caller in Modules/python.c only
       tests for NULL.  Confirm that raising is safe at that point. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;
    /* Unpack UTF-8 encoded data */
    p = unicode;
    e = s + size;
    while (s < e) {
        /* ch holds the lead byte until a sequence has been fully
           validated, so the surrogateescape label below always sees
           the raw byte value. */
        Py_UCS4 ch = (unsigned char)*s;
        if (ch < 0x80) {
            /* ASCII fast path */
            *p++ = (wchar_t)ch;
            s++;
            continue;
        }
        /* Expected sequence length for this lead byte */
        n = utf8_code_length[ch];
        if (s + n > e) {
            /* Truncated sequence at the end of the input */
            goto surrogateescape;
        }
        switch (n) {
        case 0:
        case 1:
            /* Not a valid lead byte for a multi-byte sequence */
            goto surrogateescape;
        case 2:
            if ((s[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (wchar_t)ch;
            break;
        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0) ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            /* The destination is wchar_t, so cast to wchar_t like every
               other store in this function. */
            *p++ = (wchar_t)ch;
            break;
        case 4:
            /* Reject bad continuation bytes, lead byte 0xF0 with a second
               byte below 0x90, and lead byte 0xF4 with a second byte
               above 0x8F (which would exceed 0x10FFFF). */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
 #if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
 #else
            /*  compute and append the two surrogates: */
            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;
            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (wchar_t)(0xD800 + (ch >> 10));
            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
 #endif
            break;
        }
        s += n;
        continue;
      surrogateescape:
        /* Escape only the offending byte and retry from the next one */
        *p++ = 0xDC00 + ch;
        s++;
    }
    *p = L'\0';
    return unicode;
 }
 #endif /* __APPLE__ */
 /* Allocation strategy:  if the string is short, convert into a stack buffer
   and allocate exactly as much space needed at the end.  Else allocate the