mirror of https://github.com/python/cpython.git
Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead of
the locale encoding. If none of the LANG, LC_ALL and LC_CTYPE environment variables is set, the locale encoding is ISO-8859-1, whereas most programs (including Python) expect UTF-8. Python already uses UTF-8 for the filesystem encoding and to encode command line arguments on this OS.
This commit is contained in:
parent
073f759d65
commit
f933e1ab6f
|
@ -148,6 +148,38 @@ def test_undecodable_code(self):
|
||||||
if not stdout.startswith(pattern):
|
if not stdout.startswith(pattern):
|
||||||
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
|
raise AssertionError("%a doesn't start with %a" % (stdout, pattern))
|
||||||
|
|
||||||
|
@unittest.skipUnless(sys.platform == 'darwin', 'test specific to Mac OS X')
def test_osx_utf8(self):
    """Check how a child interpreter decodes a bytes command line argument.

    For each sample, a child Python is started with LC_ALL=C and the raw
    bytes as its first argument.  The child must print ascii() of that
    argument as if it had been decoded from UTF-8 with the surrogateescape
    error handler, and exit with status 0.
    """
    def check_output(text):
        # Expected child output: ascii() of the UTF-8/surrogateescape
        # decoding of the argument, plus the newline added by print().
        as_str = text.decode('utf8', 'surrogateescape')
        expected = ascii(as_str).encode('ascii') + b'\n'

        # C locale gives ASCII locale encoding, but Python uses UTF-8
        # to parse the command line arguments on Mac OS X
        child_env = os.environ.copy()
        child_env['LC_ALL'] = 'C'

        child_argv = (
            sys.executable,
            "-c",
            "import sys; print(ascii(sys.argv[1]))",
            text,
        )
        child = subprocess.Popen(
            child_argv,
            stdout=subprocess.PIPE,
            env=child_env)
        out, _ = child.communicate()
        self.assertEqual(out, expected)
        self.assertEqual(child.returncode, 0)

    samples = (
        # valid utf-8
        'e:\xe9, euro:\u20ac, non-bmp:\U0010ffff'.encode('utf-8'),
        # invalid utf-8
        (
            b'\xff'         # invalid byte
            b'\xc3\xa9'     # valid utf-8 character
            b'\xc3\xff'     # invalid byte sequence
            b'\xed\xa0\x80' # lone surrogate character (invalid)
        ),
    )
    for sample in samples:
        check_output(sample)
|
||||||
|
|
||||||
def test_unbuffered_output(self):
|
def test_unbuffered_output(self):
|
||||||
# Test expected operation of the '-u' switch
|
# Test expected operation of the '-u' switch
|
||||||
for stream in ('stdout', 'stderr'):
|
for stream in ('stdout', 'stderr'):
|
||||||
|
|
|
@ -10,6 +10,12 @@ What's New in Python 3.2 Beta 1?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #4388: On Mac OS X, decode command line arguments from UTF-8, instead
|
||||||
|
of the locale encoding. If the LANG (and LC_ALL and LC_CTYPE) environment
|
||||||
|
variable is not set, the locale encoding is ISO-8859-1, whereas most programs
|
||||||
|
(including Python) expect UTF-8. Python already uses UTF-8 for the filesystem
|
||||||
|
encoding and to encode command line arguments on this OS.
|
||||||
|
|
||||||
- Issue #9713, #10114: Parser functions (e.g. PyParser_ASTFromFile) expect
|
- Issue #9713, #10114: Parser functions (e.g. PyParser_ASTFromFile) expect
|
||||||
filenames encoded to the filesystem encoding with surrogateescape error
|
filenames encoded to the filesystem encoding with surrogateescape error
|
||||||
handler (to support undecodable bytes), instead of UTF-8 in strict mode.
|
handler (to support undecodable bytes), instead of UTF-8 in strict mode.
|
||||||
|
|
|
@ -15,6 +15,10 @@ wmain(int argc, wchar_t **argv)
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size);
|
||||||
|
#endif
|
||||||
|
|
||||||
int
|
int
|
||||||
main(int argc, char **argv)
|
main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
|
@ -41,7 +45,11 @@ main(int argc, char **argv)
|
||||||
oldloc = strdup(setlocale(LC_ALL, NULL));
|
oldloc = strdup(setlocale(LC_ALL, NULL));
|
||||||
setlocale(LC_ALL, "");
|
setlocale(LC_ALL, "");
|
||||||
for (i = 0; i < argc; i++) {
|
for (i = 0; i < argc; i++) {
|
||||||
|
#ifdef __APPLE__
|
||||||
|
argv_copy[i] = _Py_DecodeUTF8_surrogateescape(argv[i], strlen(argv[i]));
|
||||||
|
#else
|
||||||
argv_copy[i] = _Py_char2wchar(argv[i], NULL);
|
argv_copy[i] = _Py_char2wchar(argv[i], NULL);
|
||||||
|
#endif
|
||||||
if (!argv_copy[i])
|
if (!argv_copy[i])
|
||||||
return 1;
|
return 1;
|
||||||
argv_copy2[i] = argv_copy[i];
|
argv_copy2[i] = argv_copy[i];
|
||||||
|
|
|
@ -2716,6 +2716,120 @@ PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||||
|
|
||||||
#undef ASCII_CHAR_MASK
|
#undef ASCII_CHAR_MASK
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
|
||||||
|
/* Simplified UTF-8 decoder using surrogateescape error handler,
|
||||||
|
used to decode the command line arguments on Mac OS X. */
|
||||||
|
|
||||||
|
wchar_t*
|
||||||
|
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
|
||||||
|
{
|
||||||
|
int n;
|
||||||
|
const char *e;
|
||||||
|
wchar_t *unicode, *p;
|
||||||
|
|
||||||
|
/* Note: size will always be longer than the resulting Unicode
|
||||||
|
character count */
|
||||||
|
if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
|
||||||
|
PyErr_NoMemory();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
|
||||||
|
if (!unicode)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
/* Unpack UTF-8 encoded data */
|
||||||
|
p = unicode;
|
||||||
|
e = s + size;
|
||||||
|
while (s < e) {
|
||||||
|
Py_UCS4 ch = (unsigned char)*s;
|
||||||
|
|
||||||
|
if (ch < 0x80) {
|
||||||
|
*p++ = (wchar_t)ch;
|
||||||
|
s++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
n = utf8_code_length[ch];
|
||||||
|
if (s + n > e) {
|
||||||
|
goto surrogateescape;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (n) {
|
||||||
|
case 0:
|
||||||
|
case 1:
|
||||||
|
goto surrogateescape;
|
||||||
|
|
||||||
|
case 2:
|
||||||
|
if ((s[1] & 0xc0) != 0x80)
|
||||||
|
goto surrogateescape;
|
||||||
|
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
|
||||||
|
assert ((ch > 0x007F) && (ch <= 0x07FF));
|
||||||
|
*p++ = (wchar_t)ch;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
/* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
|
||||||
|
will result in surrogates in range d800-dfff. Surrogates are
|
||||||
|
not valid UTF-8 so they are rejected.
|
||||||
|
See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
|
||||||
|
(table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
|
||||||
|
if ((s[1] & 0xc0) != 0x80 ||
|
||||||
|
(s[2] & 0xc0) != 0x80 ||
|
||||||
|
((unsigned char)s[0] == 0xE0 &&
|
||||||
|
(unsigned char)s[1] < 0xA0) ||
|
||||||
|
((unsigned char)s[0] == 0xED &&
|
||||||
|
(unsigned char)s[1] > 0x9F)) {
|
||||||
|
|
||||||
|
goto surrogateescape;
|
||||||
|
}
|
||||||
|
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
|
||||||
|
assert ((ch > 0x07FF) && (ch <= 0xFFFF));
|
||||||
|
*p++ = (Py_UNICODE)ch;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
if ((s[1] & 0xc0) != 0x80 ||
|
||||||
|
(s[2] & 0xc0) != 0x80 ||
|
||||||
|
(s[3] & 0xc0) != 0x80 ||
|
||||||
|
((unsigned char)s[0] == 0xF0 &&
|
||||||
|
(unsigned char)s[1] < 0x90) ||
|
||||||
|
((unsigned char)s[0] == 0xF4 &&
|
||||||
|
(unsigned char)s[1] > 0x8F)) {
|
||||||
|
goto surrogateescape;
|
||||||
|
}
|
||||||
|
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
|
||||||
|
((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
|
||||||
|
assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
|
||||||
|
|
||||||
|
#if SIZEOF_WCHAR_T == 4
|
||||||
|
*p++ = (wchar_t)ch;
|
||||||
|
#else
|
||||||
|
/* compute and append the two surrogates: */
|
||||||
|
|
||||||
|
/* translate from 10000..10FFFF to 0..FFFF */
|
||||||
|
ch -= 0x10000;
|
||||||
|
|
||||||
|
/* high surrogate = top 10 bits added to D800 */
|
||||||
|
*p++ = (wchar_t)(0xD800 + (ch >> 10));
|
||||||
|
|
||||||
|
/* low surrogate = bottom 10 bits added to DC00 */
|
||||||
|
*p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
s += n;
|
||||||
|
continue;
|
||||||
|
|
||||||
|
surrogateescape:
|
||||||
|
*p++ = 0xDC00 + ch;
|
||||||
|
s++;
|
||||||
|
}
|
||||||
|
*p = L'\0';
|
||||||
|
return unicode;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* __APPLE__ */
|
||||||
|
|
||||||
/* Allocation strategy: if the string is short, convert into a stack buffer
|
/* Allocation strategy: if the string is short, convert into a stack buffer
|
||||||
and allocate exactly as much space needed at the end. Else allocate the
|
and allocate exactly as much space needed at the end. Else allocate the
|
||||||
|
|
Loading…
Reference in New Issue