Issue #11303: Added shortcuts for utf8 and latin1 encodings.

Documented the list of optimized encodings as a CPython implementation
detail.
This commit is contained in:
Alexander Belopolsky 2011-02-25 19:19:57 +00:00
parent eea22d2d66
commit 1d52146a25
2 changed files with 19 additions and 4 deletions

View File

@ -904,6 +904,15 @@ is meant to be exhaustive. Notice that spelling alternatives that only differ in
case or use a hyphen instead of an underscore are also valid aliases; therefore,
e.g. ``'utf-8'`` is a valid alias for the ``'utf_8'`` codec.
.. impl-detail::
Some common encodings can bypass the codecs lookup machinery to
improve performance. These optimization opportunities are only
recognized by CPython for a limited set of aliases: utf-8, utf8,
latin-1, latin1, iso-8859-1, mbcs (Windows only), ascii, utf-16,
and utf-32. Using alternative spellings for these encodings may
result in slower execution.
Many of the character sets support the same languages. They vary in individual
characters (e.g. whether the EURO SIGN is supported or not), and in the
assignment of characters to code positions. For the European languages in

View File

@ -1462,13 +1462,15 @@ PyObject *PyUnicode_Decode(const char *s,
char lower[11]; /* Enough for any encoding shortcut */
if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
return PyUnicode_DecodeUTF8(s, size, errors);
/* Shortcuts for common default encodings */
if (normalize_encoding(encoding, lower, sizeof(lower))) {
if (strcmp(lower, "utf-8") == 0)
if ((strcmp(lower, "utf-8") == 0) ||
(strcmp(lower, "utf8") == 0))
return PyUnicode_DecodeUTF8(s, size, errors);
else if ((strcmp(lower, "latin-1") == 0) ||
(strcmp(lower, "latin1") == 0) ||
(strcmp(lower, "iso-8859-1") == 0))
return PyUnicode_DecodeLatin1(s, size, errors);
#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
@ -1670,15 +1672,19 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
}
if (encoding == NULL)
encoding = PyUnicode_GetDefaultEncoding();
return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
errors);
/* Shortcuts for common default encodings */
if (normalize_encoding(encoding, lower, sizeof(lower))) {
if (strcmp(lower, "utf-8") == 0)
if ((strcmp(lower, "utf-8") == 0) ||
(strcmp(lower, "utf8") == 0))
return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
errors);
else if ((strcmp(lower, "latin-1") == 0) ||
(strcmp(lower, "latin1") == 0) ||
(strcmp(lower, "iso-8859-1") == 0))
return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),