From 63065d761e6c545216b9621982d16dd459abb1f8 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Tue, 15 May 2012 23:48:04 +0200 Subject: [PATCH] Issue #14624: UTF-16 decoding is now 3x to 4x faster on various inputs. Patch by Serhiy Storchaka. --- Misc/NEWS | 3 + Objects/stringlib/codecs.h | 149 +++++++++++++++++++- Objects/unicodeobject.c | 277 +++++++++++-------------------------- 3 files changed, 230 insertions(+), 199 deletions(-) diff --git a/Misc/NEWS b/Misc/NEWS index 48709799e34..2ff55e8d905 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.3.0 Alpha 4? Core and Builtins ----------------- +- Issue #14624: UTF-16 decoding is now 3x to 4x faster on various inputs. + Patch by Serhiy Storchaka. + - asdl_seq and asdl_int_seq are now Py_ssize_t sized. - Issue #14133 (PEP 415): Implement suppression of __context__ display with an diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 366011c7a85..07627d6ff99 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -215,7 +215,6 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end, goto Return; } -#undef LONG_PTR_MASK #undef ASCII_CHAR_MASK @@ -415,4 +414,152 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, #undef MAX_SHORT_UNICHARS } +/* The pattern for constructing UCS2-repeated masks. */ +#if SIZEOF_LONG == 8 +# define UCS2_REPEAT_MASK 0x0001000100010001ul +#elif SIZEOF_LONG == 4 +# define UCS2_REPEAT_MASK 0x00010001ul +#else +# error C 'long' size should be either 4 or 8! +#endif + +/* The mask for fast checking. */ +#if STRINGLIB_SIZEOF_CHAR == 1 +/* The mask for fast checking of whether a C 'long' contains any + non-ASCII or non-Latin1 UTF16-encoded characters. */ +# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR)) +#else +/* The mask for fast checking of whether a C 'long' may contain + UTF16-encoded surrogate characters. 
This is an efficient heuristic, + assuming that non-surrogate characters with a code point >= 0x8000 are + rare in most input. +*/ +# define FAST_CHAR_MASK (UCS2_REPEAT_MASK * 0x8000u) +#endif +/* The mask for fast byte-swapping. */ +#define STRIPPED_MASK (UCS2_REPEAT_MASK * 0x00FFu) +/* Swap the two bytes of every 16-bit unit packed in value. */ +#define SWAB(value) ((((value) >> 8) & STRIPPED_MASK) | \ + (((value) & STRIPPED_MASK) << 8)) +/* Decode UTF-16 units from *inptr up to e into dest starting at *outpos; both are updated on return. Returns 0 when fewer than two bytes remain, 1 when a surrogate is the last full unit, 2 on a low surrogate where a high one was expected, 3 when a high surrogate is not followed by a low one, otherwise a decoded code point that was not stored because it is out of range for STRINGLIB_CHAR. */ +Py_LOCAL_INLINE(Py_UCS4) +STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e, + STRINGLIB_CHAR *dest, Py_ssize_t *outpos, + int native_ordering) +{ + Py_UCS4 ch; + const unsigned char *aligned_end = + (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); + const unsigned char *q = *inptr; + STRINGLIB_CHAR *p = dest + *outpos; + /* Offsets from q for retrieving byte pairs in the right order. */ +#ifdef BYTEORDER_IS_LITTLE_ENDIAN + int ihi = !!native_ordering, ilo = !native_ordering; +#else + int ihi = !native_ordering, ilo = !!native_ordering; +#endif + --e; /* from here on q < e means at least two bytes are left */ + + while (q < e) { + Py_UCS4 ch2; + /* First check for possible aligned read of a C 'long'. Unaligned + reads are more expensive, better to defer to another iteration. */ + if (!((size_t) q & LONG_PTR_MASK)) { + /* Fast path for runs of in-range non-surrogate chars. 
*/ + register const unsigned char *_q = q; + while (_q < aligned_end) { + unsigned long block = * (unsigned long *) _q; + if (native_ordering) { + /* Can use buffer directly unless some unit has a masked bit set */ + if (block & FAST_CHAR_MASK) + break; + } + else { + /* Need to byte-swap; test against the byte-swapped mask first */ + if (block & SWAB(FAST_CHAR_MASK)) + break; +#if STRINGLIB_SIZEOF_CHAR == 1 + block >>= 8; /* presumably the test above left the low byte of every unit clear, so a shift is enough */ +#else + block = SWAB(block); +#endif + } +#ifdef BYTEORDER_IS_LITTLE_ENDIAN +# if SIZEOF_LONG == 4 + p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); + p[1] = (STRINGLIB_CHAR)(block >> 16); +# elif SIZEOF_LONG == 8 + p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu); + p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); + p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); + p[3] = (STRINGLIB_CHAR)(block >> 48); +# endif +#else +# if SIZEOF_LONG == 4 + p[0] = (STRINGLIB_CHAR)(block >> 16); + p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu); +# elif SIZEOF_LONG == 8 + p[0] = (STRINGLIB_CHAR)(block >> 48); + p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu); + p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu); + p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu); +# endif +#endif + _q += SIZEOF_LONG; + p += SIZEOF_LONG / 2; + } + q = _q; + if (q >= e) + break; + } + + ch = (q[ihi] << 8) | q[ilo]; + q += 2; + if (!Py_UNICODE_IS_SURROGATE(ch)) { +#if STRINGLIB_SIZEOF_CHAR < 2 + if (ch > STRINGLIB_MAX_CHAR) + /* Out-of-range for STRINGLIB_CHAR: return ch, already consumed from the input */ + goto Return; +#endif + *p++ = (STRINGLIB_CHAR)ch; + continue; + } + + /* UTF-16 code pair: */ + if (q >= e) + goto UnexpectedEnd; + if (!Py_UNICODE_IS_HIGH_SURROGATE(ch)) + goto IllegalEncoding; + ch2 = (q[ihi] << 8) | q[ilo]; + q += 2; + if (!Py_UNICODE_IS_LOW_SURROGATE(ch2)) + goto IllegalSurrogate; + ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2); +#if STRINGLIB_SIZEOF_CHAR < 4 + /* Out-of-range: the joined code point is returned instead of stored */ + goto Return; +#else + *p++ = (STRINGLIB_CHAR)ch; +#endif + } + ch = 0; +Return: + *inptr = q; + *outpos = p - dest; + return ch; +UnexpectedEnd: + ch = 1; + goto Return; +IllegalEncoding: + ch = 2; + goto Return; +IllegalSurrogate: + ch = 3; + goto 
Return; +} +#undef UCS2_REPEAT_MASK +#undef FAST_CHAR_MASK +#undef STRIPPED_MASK +#undef SWAB +#undef LONG_PTR_MASK #endif /* STRINGLIB_IS_UNICODE */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 2e1e0bd3eea..8fbc203c530 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5195,25 +5195,6 @@ PyUnicode_DecodeUTF16(const char *s, return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL); } -/* Two masks for fast checking of whether a C 'long' may contain - UTF16-encoded surrogate characters. This is an efficient heuristic, - assuming that non-surrogate characters with a code point >= 0x8000 are - rare in most input. - FAST_CHAR_MASK is used when the input is in native byte ordering, - SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering. -*/ -#if (SIZEOF_LONG == 8) -# define FAST_CHAR_MASK 0x8000800080008000L -# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L -# define STRIPPED_MASK 0x00FF00FF00FF00FFL -#elif (SIZEOF_LONG == 4) -# define FAST_CHAR_MASK 0x80008000L -# define SWAPPED_FAST_CHAR_MASK 0x00800080L -# define STRIPPED_MASK 0x00FF00FFL -#else -# error C 'long' size should be either 4 or 8! -#endif - PyObject * PyUnicode_DecodeUTF16Stateful(const char *s, Py_ssize_t size, @@ -5226,30 +5207,15 @@ PyUnicode_DecodeUTF16Stateful(const char *s, Py_ssize_t endinpos; Py_ssize_t outpos; PyObject *unicode; - const unsigned char *q, *e, *aligned_end; + const unsigned char *q, *e; int bo = 0; /* assume native ordering by default */ - int native_ordering = 0; + int native_ordering; const char *errmsg = ""; - /* Offsets from q for retrieving byte pairs in the right order. 
*/ -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - int ihi = 1, ilo = 0; -#else - int ihi = 0, ilo = 1; -#endif PyObject *errorHandler = NULL; PyObject *exc = NULL; - /* Note: size will always be longer than the resulting Unicode - character count */ - unicode = PyUnicode_New(size, 127); - if (!unicode) - return NULL; - if (size == 0) - return unicode; - outpos = 0; - q = (unsigned char *)s; - e = q + size - 1; + e = q + size; if (byteorder) bo = *byteorder; @@ -5258,155 +5224,98 @@ PyUnicode_DecodeUTF16Stateful(const char *s, byte order setting accordingly. In native mode, the leading BOM mark is skipped, in all other modes, it is copied to the output stream as-is (giving a ZWNBSP character). */ - if (bo == 0) { - if (size >= 2) { - const Py_UCS4 bom = (q[ihi] << 8) | q[ilo]; -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - if (bom == 0xFEFF) { - q += 2; - bo = -1; - } - else if (bom == 0xFFFE) { - q += 2; - bo = 1; - } -#else - if (bom == 0xFEFF) { - q += 2; - bo = 1; - } - else if (bom == 0xFFFE) { - q += 2; - bo = -1; - } -#endif + if (bo == 0 && size >= 2) { + const Py_UCS4 bom = (q[1] << 8) | q[0]; + if (bom == 0xFEFF) { + q += 2; + bo = -1; } + else if (bom == 0xFFFE) { + q += 2; + bo = 1; + } + if (byteorder) + *byteorder = bo; } - if (bo == -1) { - /* force LE */ - ihi = 1; - ilo = 0; - } - else if (bo == 1) { - /* force BE */ - ihi = 0; - ilo = 1; + if (q == e) { + if (consumed) + *consumed = size; + Py_INCREF(unicode_empty); + return unicode_empty; } + #ifdef BYTEORDER_IS_LITTLE_ENDIAN - native_ordering = ilo < ihi; + native_ordering = bo <= 0; #else - native_ordering = ilo > ihi; + native_ordering = bo >= 0; #endif - aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK); - while (q < e) { - Py_UCS4 ch; - /* First check for possible aligned read of a C 'long'. Unaligned - reads are more expensive, better to defer to another iteration. */ - if (!((size_t) q & LONG_PTR_MASK)) { - /* Fast path for runs of non-surrogate chars. 
*/ - register const unsigned char *_q = q; + /* Note: size will always be longer than the resulting Unicode + character count */ + unicode = PyUnicode_New((e - q + 1) / 2, 127); + if (!unicode) + return NULL; + + outpos = 0; + while (1) { + Py_UCS4 ch = 0; + if (e - q >= 2) { int kind = PyUnicode_KIND(unicode); - void *data = PyUnicode_DATA(unicode); - while (_q < aligned_end) { - unsigned long block = * (unsigned long *) _q; - Py_UCS4 maxch; - if (native_ordering) { - /* Can use buffer directly */ - if (block & FAST_CHAR_MASK) - break; - } - else { - /* Need to byte-swap */ - if (block & SWAPPED_FAST_CHAR_MASK) - break; - block = ((block >> 8) & STRIPPED_MASK) | - ((block & STRIPPED_MASK) << 8); - } - maxch = (Py_UCS2)(block & 0xFFFF); -#if SIZEOF_LONG == 8 - ch = (Py_UCS2)((block >> 16) & 0xFFFF); - maxch = MAX_MAXCHAR(maxch, ch); - ch = (Py_UCS2)((block >> 32) & 0xFFFF); - maxch = MAX_MAXCHAR(maxch, ch); - ch = (Py_UCS2)(block >> 48); - maxch = MAX_MAXCHAR(maxch, ch); -#else - ch = (Py_UCS2)(block >> 16); - maxch = MAX_MAXCHAR(maxch, ch); -#endif - if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) { - if (unicode_widen(&unicode, outpos, maxch) < 0) - goto onError; - kind = PyUnicode_KIND(unicode); - data = PyUnicode_DATA(unicode); - } -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF)); -#if SIZEOF_LONG == 8 - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF)); - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF)); - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48))); -#else - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16)); -#endif -#else -#if SIZEOF_LONG == 8 - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 48))); - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 32) & 0xFFFF)); - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)((block >> 16) & 0xFFFF)); -#else - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block >> 16)); 
-#endif - PyUnicode_WRITE(kind, data, outpos++, (Py_UCS2)(block & 0xFFFF)); -#endif - _q += SIZEOF_LONG; + if (kind == PyUnicode_1BYTE_KIND) { + if (PyUnicode_IS_ASCII(unicode)) + ch = asciilib_utf16_decode(&q, e, + PyUnicode_1BYTE_DATA(unicode), &outpos, + native_ordering); + else + ch = ucs1lib_utf16_decode(&q, e, + PyUnicode_1BYTE_DATA(unicode), &outpos, + native_ordering); + } else if (kind == PyUnicode_2BYTE_KIND) { + ch = ucs2lib_utf16_decode(&q, e, + PyUnicode_2BYTE_DATA(unicode), &outpos, + native_ordering); + } else { + assert(kind == PyUnicode_4BYTE_KIND); + ch = ucs4lib_utf16_decode(&q, e, + PyUnicode_4BYTE_DATA(unicode), &outpos, + native_ordering); } - q = _q; - if (q >= e) - break; } - ch = (q[ihi] << 8) | q[ilo]; - q += 2; - - if (!Py_UNICODE_IS_SURROGATE(ch)) { + switch (ch) + { + case 0: + /* remaining byte at the end? (size should be even) */ + if (q == e || consumed) + goto End; + errmsg = "truncated data"; + startinpos = ((const char *)q) - starts; + endinpos = ((const char *)e) - starts; + break; + /* The remaining input chars are ignored if the callback + chooses to skip the input */ + case 1: + errmsg = "unexpected end of data"; + startinpos = ((const char *)q) - 2 - starts; + endinpos = ((const char *)e) - starts; + break; + case 2: + errmsg = "illegal encoding"; + startinpos = ((const char *)q) - 2 - starts; + endinpos = startinpos + 2; + break; + case 3: + errmsg = "illegal UTF-16 surrogate"; + startinpos = ((const char *)q) - 4 - starts; + endinpos = startinpos + 2; + break; + default: if (unicode_putchar(&unicode, &outpos, ch) < 0) goto onError; continue; } - /* UTF-16 code pair: */ - if (q > e) { - errmsg = "unexpected end of data"; - startinpos = (((const char *)q) - 2) - starts; - endinpos = ((const char *)e) + 1 - starts; - goto utf16Error; - } - if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) { - Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo]; - q += 2; - if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) { - if (unicode_putchar(&unicode, &outpos, - 
Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0) - goto onError; - continue; - } - else { - errmsg = "illegal UTF-16 surrogate"; - startinpos = (((const char *)q)-4)-starts; - endinpos = startinpos+2; - goto utf16Error; - } - - } - errmsg = "illegal encoding"; - startinpos = (((const char *)q)-2)-starts; - endinpos = startinpos+2; - /* Fall through to report the error */ - - utf16Error: if (unicode_decode_call_errorhandler( errors, &errorHandler, @@ -5421,33 +5330,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s, &outpos)) goto onError; } - /* remaining byte at the end? (size should be even) */ - if (e == q) { - if (!consumed) { - errmsg = "truncated data"; - startinpos = ((const char *)q) - starts; - endinpos = ((const char *)e) + 1 - starts; - if (unicode_decode_call_errorhandler( - errors, - &errorHandler, - "utf16", errmsg, - &starts, - (const char **)&e, - &startinpos, - &endinpos, - &exc, - (const char **)&q, - &unicode, - &outpos)) - goto onError; - /* The remaining input chars are ignored if the callback - chooses to skip the input */ - } - } - - if (byteorder) - *byteorder = bo; +End: if (consumed) *consumed = (const char *)q-starts; @@ -5466,9 +5350,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s, return NULL; } -#undef FAST_CHAR_MASK -#undef SWAPPED_FAST_CHAR_MASK - PyObject * _PyUnicode_EncodeUTF16(PyObject *str, const char *errors,