mirror of https://github.com/python/cpython.git
Add PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale()
* PyUnicode_DecodeLocaleAndSize() and PyUnicode_DecodeLocale() decode a string from the current locale encoding.
* _Py_char2wchar() writes an "error code" in the size argument to indicate whether the function failed because of a memory allocation failure or because of a decoding error. The function doesn't write the error message directly to stderr.
* Fix time.strftime() (if wcsftime() is missing): decode the strftime() result from the current locale encoding, not from the filesystem encoding.
This commit is contained in:
parent
3607e3de27
commit
af02e1c85a
|
@ -699,6 +699,39 @@ Extension modules can continue using them, as they will not be removed in Python
|
|||
throughout the interpreter whenever coercion to Unicode is needed.
|
||||
|
||||
|
||||
Locale Encoding
|
||||
"""""""""""""""
|
||||
|
||||
The current locale encoding can be used to decode text from the operating
|
||||
system.
|
||||
|
||||
.. c:function:: PyObject* PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len, int surrogateescape)
|
||||
|
||||
Decode a string from the current locale encoding. The decoder is strict if
|
||||
*surrogateescape* is equal to zero, otherwise it uses the
|
||||
``'surrogateescape'`` error handler (:pep:`383`) to escape undecodable
|
||||
bytes. If a byte sequence can be decoded as a surrogate character and
|
||||
*surrogateescape* is not equal to zero, the byte sequence is escaped using
|
||||
the ``'surrogateescape'`` error handler instead of being decoded. *str*
|
||||
must end with a null character but cannot contain embedded null characters.
|
||||
|
||||
.. seealso::
|
||||
|
||||
Use :c:func:`PyUnicode_DecodeFSDefaultAndSize` to decode a string from
|
||||
:c:data:`Py_FileSystemDefaultEncoding` (the locale encoding read at
|
||||
Python startup).
|
||||
|
||||
.. versionadded:: 3.3
|
||||
|
||||
|
||||
.. c:function:: PyObject* PyUnicode_DecodeLocale(const char *str, int surrogateescape)
|
||||
|
||||
Similar to :c:func:`PyUnicode_DecodeLocaleAndSize`, but compute the string
|
||||
length using :c:func:`strlen`.
|
||||
|
||||
.. versionadded:: 3.3
|
||||
|
||||
|
||||
File System Encoding
|
||||
""""""""""""""""""""
|
||||
|
||||
|
@ -739,6 +772,13 @@ used, passing :c:func:`PyUnicode_FSDecoder` as the conversion function:
|
|||
If :c:data:`Py_FileSystemDefaultEncoding` is not set, fall back to the
|
||||
locale encoding.
|
||||
|
||||
.. seealso::
|
||||
|
||||
:c:data:`Py_FileSystemDefaultEncoding` is initialized at startup from the
|
||||
locale encoding and cannot be modified later. If you need to decode a
|
||||
string from the current locale encoding, use
|
||||
:c:func:`PyUnicode_DecodeLocaleAndSize`.
|
||||
|
||||
.. versionchanged:: 3.2
|
||||
Use ``'strict'`` error handler on Windows.
|
||||
|
||||
|
|
|
@ -1595,6 +1595,28 @@ PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
|
|||
);
|
||||
#endif
|
||||
|
||||
/* --- Locale encoding --------------------------------------------------- */
|
||||
|
||||
/* Decode a string from the current locale encoding. The decoder is strict if
|
||||
*surrogateescape* is equal to zero, otherwise it uses the 'surrogateescape'
|
||||
error handler (PEP 383) to escape undecodable bytes. If a byte sequence can
|
||||
be decoded as a surrogate character and *surrogateescape* is not equal to
|
||||
zero, the byte sequence is escaped using the 'surrogateescape' error handler
|
||||
instead of being decoded. *str* must end with a null character but cannot
|
||||
contain embedded null characters. */
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
|
||||
const char *str,
|
||||
Py_ssize_t len,
|
||||
int surrogateescape);
|
||||
|
||||
/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
|
||||
length using strlen(). */
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
|
||||
const char *str,
|
||||
int surrogateescape);
|
||||
|
||||
/* --- File system encoding ---------------------------------------------- */
|
||||
|
||||
/* ParseTuple converter: encode str objects to bytes using
|
||||
|
|
|
@ -42,43 +42,6 @@ PyDoc_STRVAR(locale__doc__, "Support for POSIX locales.");
|
|||
|
||||
static PyObject *Error;
|
||||
|
||||
/* Convert a char* to a Unicode object according to the current locale */
|
||||
static PyObject*
|
||||
str2uni(const char* s)
|
||||
{
|
||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
||||
size_t needed = strlen(s);
|
||||
#else
|
||||
size_t needed = mbstowcs(NULL, s, 0);
|
||||
#endif
|
||||
size_t res1;
|
||||
wchar_t smallbuf[30];
|
||||
wchar_t *dest;
|
||||
PyObject *res2;
|
||||
if (needed == (size_t)-1) {
|
||||
PyErr_SetString(PyExc_ValueError, "Cannot convert byte to string");
|
||||
return NULL;
|
||||
}
|
||||
if (needed*sizeof(wchar_t) < sizeof(smallbuf))
|
||||
dest = smallbuf;
|
||||
else {
|
||||
dest = PyMem_Malloc((needed+1)*sizeof(wchar_t));
|
||||
if (!dest)
|
||||
return PyErr_NoMemory();
|
||||
}
|
||||
/* This shouldn't fail now */
|
||||
res1 = mbstowcs(dest, s, needed+1);
|
||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
||||
assert(res1 != (size_t)-1);
|
||||
#else
|
||||
assert(res1 == needed);
|
||||
#endif
|
||||
res2 = PyUnicode_FromWideChar(dest, res1);
|
||||
if (dest != smallbuf)
|
||||
PyMem_Free(dest);
|
||||
return res2;
|
||||
}
|
||||
|
||||
/* support functions for formatting floating point numbers */
|
||||
|
||||
PyDoc_STRVAR(setlocale__doc__,
|
||||
|
@ -149,7 +112,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
|
|||
PyErr_SetString(Error, "unsupported locale setting");
|
||||
return NULL;
|
||||
}
|
||||
result_object = str2uni(result);
|
||||
result_object = PyUnicode_DecodeLocale(result, 0);
|
||||
if (!result_object)
|
||||
return NULL;
|
||||
} else {
|
||||
|
@ -159,7 +122,7 @@ PyLocale_setlocale(PyObject* self, PyObject* args)
|
|||
PyErr_SetString(Error, "locale query failed");
|
||||
return NULL;
|
||||
}
|
||||
result_object = str2uni(result);
|
||||
result_object = PyUnicode_DecodeLocale(result, 0);
|
||||
}
|
||||
return result_object;
|
||||
}
|
||||
|
@ -185,7 +148,7 @@ PyLocale_localeconv(PyObject* self)
|
|||
involved herein */
|
||||
|
||||
#define RESULT_STRING(s)\
|
||||
x = str2uni(l->s); \
|
||||
x = PyUnicode_DecodeLocale(l->s, 0); \
|
||||
if (!x) goto failed;\
|
||||
PyDict_SetItemString(result, #s, x);\
|
||||
Py_XDECREF(x)
|
||||
|
@ -476,7 +439,7 @@ PyLocale_nl_langinfo(PyObject* self, PyObject* args)
|
|||
instead of an empty string for nl_langinfo(ERA). */
|
||||
const char *result = nl_langinfo(item);
|
||||
result = result != NULL ? result : "";
|
||||
return str2uni(result);
|
||||
return PyUnicode_DecodeLocale(result, 0);
|
||||
}
|
||||
PyErr_SetString(PyExc_ValueError, "unsupported langinfo constant");
|
||||
return NULL;
|
||||
|
@ -495,7 +458,7 @@ PyIntl_gettext(PyObject* self, PyObject *args)
|
|||
char *in;
|
||||
if (!PyArg_ParseTuple(args, "s", &in))
|
||||
return 0;
|
||||
return str2uni(gettext(in));
|
||||
return PyUnicode_DecodeLocale(gettext(in), 0);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(dgettext__doc__,
|
||||
|
@ -508,7 +471,7 @@ PyIntl_dgettext(PyObject* self, PyObject *args)
|
|||
char *domain, *in;
|
||||
if (!PyArg_ParseTuple(args, "zs", &domain, &in))
|
||||
return 0;
|
||||
return str2uni(dgettext(domain, in));
|
||||
return PyUnicode_DecodeLocale(dgettext(domain, in), 0);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(dcgettext__doc__,
|
||||
|
@ -522,7 +485,7 @@ PyIntl_dcgettext(PyObject *self, PyObject *args)
|
|||
int category;
|
||||
if (!PyArg_ParseTuple(args, "zsi", &domain, &msgid, &category))
|
||||
return 0;
|
||||
return str2uni(dcgettext(domain,msgid,category));
|
||||
return PyUnicode_DecodeLocale(dcgettext(domain,msgid,category), 0);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(textdomain__doc__,
|
||||
|
@ -540,7 +503,7 @@ PyIntl_textdomain(PyObject* self, PyObject* args)
|
|||
PyErr_SetFromErrno(PyExc_OSError);
|
||||
return NULL;
|
||||
}
|
||||
return str2uni(domain);
|
||||
return PyUnicode_DecodeLocale(domain, 0);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(bindtextdomain__doc__,
|
||||
|
@ -572,7 +535,7 @@ PyIntl_bindtextdomain(PyObject* self,PyObject*args)
|
|||
PyErr_SetFromErrno(PyExc_OSError);
|
||||
return NULL;
|
||||
}
|
||||
result = str2uni(current_dirname);
|
||||
result = PyUnicode_DecodeLocale(current_dirname, 0);
|
||||
Py_XDECREF(dirname_bytes);
|
||||
return result;
|
||||
}
|
||||
|
@ -590,7 +553,7 @@ PyIntl_bind_textdomain_codeset(PyObject* self,PyObject*args)
|
|||
return NULL;
|
||||
codeset = bind_textdomain_codeset(domain, codeset);
|
||||
if (codeset)
|
||||
return str2uni(codeset);
|
||||
return PyUnicode_DecodeLocale(codeset, 0);
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -495,16 +495,13 @@ Py_Main(int argc, wchar_t **argv)
|
|||
/* Use utf-8 on Mac OS X */
|
||||
unicode = PyUnicode_FromString(p);
|
||||
#else
|
||||
wchar_t *wchar;
|
||||
size_t len;
|
||||
wchar = _Py_char2wchar(p, &len);
|
||||
if (wchar == NULL)
|
||||
continue;
|
||||
unicode = PyUnicode_FromWideChar(wchar, len);
|
||||
PyMem_Free(wchar);
|
||||
unicode = PyUnicode_DecodeLocale(p, 1);
|
||||
#endif
|
||||
if (unicode == NULL)
|
||||
if (unicode == NULL) {
|
||||
/* ignore errors */
|
||||
PyErr_Clear();
|
||||
continue;
|
||||
}
|
||||
PySys_AddWarnOptionUnicode(unicode);
|
||||
Py_DECREF(unicode);
|
||||
}
|
||||
|
|
|
@ -532,7 +532,7 @@ time_strftime(PyObject *self, PyObject *args)
|
|||
#ifdef HAVE_WCSFTIME
|
||||
ret = PyUnicode_FromWideChar(outbuf, buflen);
|
||||
#else
|
||||
ret = PyUnicode_DecodeFSDefaultAndSize(outbuf, buflen);
|
||||
ret = PyUnicode_DecodeLocaleAndSize(outbuf, buflen, 1);
|
||||
#endif
|
||||
PyMem_Free(outbuf);
|
||||
break;
|
||||
|
@ -764,8 +764,8 @@ PyInit_timezone(PyObject *m) {
|
|||
#endif /* PYOS_OS2 */
|
||||
#endif
|
||||
PyModule_AddIntConstant(m, "daylight", daylight);
|
||||
otz0 = PyUnicode_DecodeFSDefaultAndSize(tzname[0], strlen(tzname[0]));
|
||||
otz1 = PyUnicode_DecodeFSDefaultAndSize(tzname[1], strlen(tzname[1]));
|
||||
otz0 = PyUnicode_DecodeLocale(tzname[0], 1);
|
||||
otz1 = PyUnicode_DecodeLocale(tzname[1], 1);
|
||||
PyModule_AddObject(m, "tzname", Py_BuildValue("(NN)", otz0, otz1));
|
||||
#else /* !HAVE_TZNAME || __GLIBC__ || __CYGWIN__*/
|
||||
#ifdef HAVE_STRUCT_TM_TM_ZONE
|
||||
|
|
|
@ -3234,6 +3234,83 @@ PyUnicode_AsEncodedUnicode(PyObject *unicode,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
PyObject*
|
||||
PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
|
||||
int surrogateescape)
|
||||
{
|
||||
wchar_t smallbuf[256];
|
||||
size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
|
||||
wchar_t *wstr;
|
||||
size_t wlen, wlen2;
|
||||
PyObject *unicode;
|
||||
|
||||
if (str[len] != '\0' || len != strlen(str)) {
|
||||
PyErr_SetString(PyExc_TypeError, "embedded null character");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (surrogateescape)
|
||||
{
|
||||
wstr = _Py_char2wchar(str, &wlen);
|
||||
if (wstr == NULL) {
|
||||
if (wlen == (size_t)-1)
|
||||
PyErr_NoMemory();
|
||||
else
|
||||
PyErr_SetFromErrno(PyExc_OSError);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
unicode = PyUnicode_FromWideChar(wstr, wlen);
|
||||
PyMem_Free(wstr);
|
||||
}
|
||||
else {
|
||||
#ifndef HAVE_BROKEN_MBSTOWCS
|
||||
wlen = mbstowcs(NULL, str, 0);
|
||||
#else
|
||||
wlen = len;
|
||||
#endif
|
||||
if (wlen == (size_t)-1) {
|
||||
PyErr_SetFromErrno(PyExc_OSError);
|
||||
return NULL;
|
||||
}
|
||||
if (wlen+1 <= smallbuf_len) {
|
||||
wstr = smallbuf;
|
||||
}
|
||||
else {
|
||||
if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
|
||||
return PyErr_NoMemory();
|
||||
|
||||
wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
|
||||
if (!wstr)
|
||||
return PyErr_NoMemory();
|
||||
}
|
||||
|
||||
/* This shouldn't fail now */
|
||||
wlen2 = mbstowcs(wstr, str, wlen+1);
|
||||
if (wlen2 == (size_t)-1) {
|
||||
if (wstr != smallbuf)
|
||||
PyMem_Free(wstr);
|
||||
PyErr_SetFromErrno(PyExc_OSError);
|
||||
return NULL;
|
||||
}
|
||||
#ifdef HAVE_BROKEN_MBSTOWCS
|
||||
assert(wlen2 == wlen);
|
||||
#endif
|
||||
unicode = PyUnicode_FromWideChar(wstr, wlen2);
|
||||
if (wstr != smallbuf)
|
||||
PyMem_Free(wstr);
|
||||
}
|
||||
return unicode;
|
||||
}
|
||||
|
||||
PyObject*
|
||||
PyUnicode_DecodeLocale(const char *str, int surrogateescape)
|
||||
{
|
||||
Py_ssize_t size = (Py_ssize_t)strlen(str);
|
||||
return PyUnicode_DecodeLocaleAndSize(str, size, surrogateescape);
|
||||
}
|
||||
|
||||
|
||||
PyObject*
|
||||
PyUnicode_DecodeFSDefault(const char *s) {
|
||||
Py_ssize_t size = (Py_ssize_t)strlen(s);
|
||||
|
@ -3264,23 +3341,7 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
|
|||
"surrogateescape");
|
||||
}
|
||||
else {
|
||||
/* locale encoding with surrogateescape */
|
||||
wchar_t *wchar;
|
||||
PyObject *unicode;
|
||||
size_t len;
|
||||
|
||||
if (s[size] != '\0' || size != strlen(s)) {
|
||||
PyErr_SetString(PyExc_TypeError, "embedded NUL character");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
wchar = _Py_char2wchar(s, &len);
|
||||
if (wchar == NULL)
|
||||
return PyErr_NoMemory();
|
||||
|
||||
unicode = PyUnicode_FromWideChar(wchar, len);
|
||||
PyMem_Free(wchar);
|
||||
return unicode;
|
||||
return PyUnicode_DecodeLocaleAndSize(s, size, 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -16,7 +16,9 @@
|
|||
Return a pointer to a newly allocated wide character string (use
|
||||
PyMem_Free() to free the memory) and write the number of written wide
|
||||
characters excluding the null character into *size if size is not NULL, or
|
||||
NULL on error (conversion or memory allocation error).
|
||||
NULL on error (decoding or memory allocation error). If size is not NULL,
|
||||
*size is set to (size_t)-1 on memory error and (size_t)-2 on decoding
|
||||
error.
|
||||
|
||||
Conversion errors should never happen, unless there is a bug in the C
|
||||
library. */
|
||||
|
@ -82,8 +84,9 @@ _Py_char2wchar(const char* arg, size_t *size)
|
|||
since we provide everything that we have -
|
||||
unless there is a bug in the C library, or I
|
||||
misunderstood how mbrtowc works. */
|
||||
fprintf(stderr, "unexpected mbrtowc result -2\n");
|
||||
PyMem_Free(res);
|
||||
if (size != NULL)
|
||||
*size = (size_t)-2;
|
||||
return NULL;
|
||||
}
|
||||
if (converted == (size_t)-1) {
|
||||
|
@ -112,7 +115,8 @@ _Py_char2wchar(const char* arg, size_t *size)
|
|||
is ASCII (i.e. escape all bytes >= 128). This will still roundtrip
|
||||
correctly in the locale's charset, which must be an ASCII superset. */
|
||||
res = PyMem_Malloc((strlen(arg)+1)*sizeof(wchar_t));
|
||||
if (!res) goto oom;
|
||||
if (!res)
|
||||
goto oom;
|
||||
in = (unsigned char*)arg;
|
||||
out = res;
|
||||
while(*in)
|
||||
|
@ -126,7 +130,8 @@ _Py_char2wchar(const char* arg, size_t *size)
|
|||
*size = out - res;
|
||||
return res;
|
||||
oom:
|
||||
fprintf(stderr, "out of memory\n");
|
||||
if (size != NULL)
|
||||
*size = (size_t)-1;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -137,10 +142,10 @@ _Py_char2wchar(const char* arg, size_t *size)
|
|||
This function is the reverse of _Py_char2wchar().
|
||||
|
||||
Return a pointer to a newly allocated byte string (use PyMem_Free() to free
|
||||
the memory), or NULL on conversion or memory allocation error.
|
||||
the memory), or NULL on encoding or memory allocation error.
|
||||
|
||||
If error_pos is not NULL: *error_pos is the index of the invalid character
|
||||
on conversion error, or (size_t)-1 otherwise. */
|
||||
on encoding error, or (size_t)-1 otherwise. */
|
||||
char*
|
||||
_Py_wchar2char(const wchar_t *text, size_t *error_pos)
|
||||
{
|
||||
|
@ -328,7 +333,7 @@ _Py_fopen(PyObject *path, const char *mode)
|
|||
#ifdef HAVE_READLINK
|
||||
|
||||
/* Read value of symbolic link. Encode the path to the locale encoding, decode
|
||||
the result from the locale encoding. */
|
||||
the result from the locale encoding. Return -1 on error. */
|
||||
|
||||
int
|
||||
_Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
|
||||
|
@ -372,7 +377,8 @@ _Py_wreadlink(const wchar_t *path, wchar_t *buf, size_t bufsiz)
|
|||
#ifdef HAVE_REALPATH
|
||||
|
||||
/* Return the canonicalized absolute pathname. Encode path to the locale
|
||||
encoding, decode the result from the locale encoding. */
|
||||
encoding, decode the result from the locale encoding.
|
||||
Return NULL on error. */
|
||||
|
||||
wchar_t*
|
||||
_Py_wrealpath(const wchar_t *path,
|
||||
|
@ -410,7 +416,8 @@ _Py_wrealpath(const wchar_t *path,
|
|||
#endif
|
||||
|
||||
/* Get the current directory. size is the buffer size in wide characters
|
||||
including the null character. Decode the path from the locale encoding. */
|
||||
including the null character. Decode the path from the locale encoding.
|
||||
Return NULL on error. */
|
||||
|
||||
wchar_t*
|
||||
_Py_wgetcwd(wchar_t *buf, size_t size)
|
||||
|
|
Loading…
Reference in New Issue