From e7bf86cd7d7c9a3924501875a08c4ef4a0063103 Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 9 Oct 2015 01:39:28 +0200 Subject: [PATCH] Optimize backslashreplace error handler Issue #25318: Optimize backslashreplace and xmlcharrefreplace error handlers in the UTF-8 encoder. Also optimize the backslashreplace error handler for the ASCII and Latin1 encoders. Use the new _PyBytesWriter API to optimize these error handlers for the encoders. This avoids creating an exception and calling the slow implementation of the error handler. --- Objects/stringlib/codecs.h | 18 +++- Objects/unicodeobject.c | 193 +++++++++++++++++++++++++++---------- 2 files changed, 160 insertions(+), 51 deletions(-) diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index d7a991855bd..ae99d1a82df 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -334,7 +334,6 @@ STRINGLIB(utf8_encoder)(PyObject *unicode, i += (endpos - startpos - 1); break; - case _Py_ERROR_SURROGATEPASS: for (k=startpos; k PY_SSIZE_T_MAX - incr) { + PyErr_SetString(PyExc_OverflowError, + "encoded result is too long for a Python string"); + return NULL; + } + size += incr; + } + + prealloc = prealloc_per_char * (collend - collstart); + if (size > prealloc) { + str = _PyBytesWriter_Prepare(writer, str, size - prealloc); + if (str == NULL) + return NULL; + } + + /* generate replacement */ + for (i = collstart; i < collend; ++i) { + ch = PyUnicode_READ(kind, data, i); + if (ch < 0x100) + str += sprintf(str, "\\x%02x", ch); + else if (ch < 0x10000) + str += sprintf(str, "\\u%04x", ch); + else { + assert(ch <= MAX_UNICODE); + str += sprintf(str, "\\U%08x", ch); + } + } + return str; +} + +/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings: ASCII, Latin1, UTF-8, etc. 
*/
+/* xmlcharrefreplace() contract:
+
+   Replace each character of unicode[collstart:collend] with the XML numeric
+   character reference "&#N;", where N is the code point in decimal, and
+   write the result at 'str'.
+
+   'prealloc_per_char' is the number of output bytes per input character
+   that is treated as already reserved in 'writer'; only the excess over
+   prealloc_per_char * (collend - collstart) is requested through
+   _PyBytesWriter_Prepare().
+
+   Return the write position just past the last byte written.  Return NULL
+   with OverflowError set if the replacement size does not fit in a
+   Py_ssize_t, or NULL if _PyBytesWriter_Prepare() fails.
+
+   The digits are emitted by hand rather than with sprintf(): sprintf() also
+   stores a terminating null byte, i.e. one byte more than the size computed
+   and reserved below, and "%d" does not match the unsigned Py_UCS4
+   argument. */
+static char*
+xmlcharrefreplace(_PyBytesWriter *writer, Py_ssize_t prealloc_per_char,
+                  char *str,
+                  PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
+{
+    Py_ssize_t size, i, prealloc;
+    Py_UCS4 ch;
+    enum PyUnicode_Kind kind;
+    void *data;
+
+    assert(PyUnicode_IS_READY(unicode));
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    size = 0;
+    /* determine replacement size: "&#" + decimal digits + ";" */
+    for (i = collstart; i < collend; ++i) {
+        Py_ssize_t incr;
+
+        ch = PyUnicode_READ(kind, data, i);
+        if (ch < 10)
+            incr = 2+1+1;
+        else if (ch < 100)
+            incr = 2+2+1;
+        else if (ch < 1000)
+            incr = 2+3+1;
+        else if (ch < 10000)
+            incr = 2+4+1;
+        else if (ch < 100000)
+            incr = 2+5+1;
+        else if (ch < 1000000)
+            incr = 2+6+1;
+        else {
+            assert(ch <= MAX_UNICODE);
+            incr = 2+7+1;
+        }
+        if (size > PY_SSIZE_T_MAX - incr) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "encoded result is too long for a Python string");
+            return NULL;
+        }
+        size += incr;
+    }
+
+    /* only ask the writer for the bytes not covered by the preallocation */
+    prealloc = prealloc_per_char * (collend - collstart);
+    if (size > prealloc) {
+        str = _PyBytesWriter_Prepare(writer, str, size - prealloc);
+        if (str == NULL)
+            return NULL;
+    }
+
+    /* generate replacement */
+    for (i = collstart; i < collend; ++i) {
+        /* 10 digits hold any 32-bit value, so the scratch buffer cannot
+           overflow even if the assertion below is compiled out */
+        char digits[10];
+        int ndigits = 0;
+
+        ch = PyUnicode_READ(kind, data, i);
+        assert(ch <= MAX_UNICODE);
+
+        /* collect the decimal digits, least significant first */
+        do {
+            digits[ndigits++] = (char)('0' + ch % 10);
+            ch /= 10;
+        } while (ch != 0);
+
+        *str++ = '&';
+        *str++ = '#';
+        while (ndigits > 0)
+            *str++ = digits[--ndigits];
+        *str++ = ';';
+    }
+    return str;
+}
+
 /* --- Bloom Filters ----------------------------------------------------- */
 
 /* stuff to implement simple "bloom filters" for Unicode characters.
@@ -6713,7 +6834,6 @@ unicode_encode_ucs1(PyObject *unicode, ++pos; } else { - Py_ssize_t requiredsize; PyObject *repunicode; Py_ssize_t repsize, newpos, i; /* startpos for collecting unencodable chars */ @@ -6744,42 +6864,19 @@ unicode_encode_ucs1(PyObject *unicode, pos = collend; break; - case _Py_ERROR_XMLCHARREFREPLACE: - requiredsize = 0; - /* determine replacement size */ - for (i = collstart; i < collend; ++i) { - Py_ssize_t incr; - - ch = PyUnicode_READ(kind, data, i); - if (ch < 10) - incr = 2+1+1; - else if (ch < 100) - incr = 2+2+1; - else if (ch < 1000) - incr = 2+3+1; - else if (ch < 10000) - incr = 2+4+1; - else if (ch < 100000) - incr = 2+5+1; - else if (ch < 1000000) - incr = 2+6+1; - else { - assert(ch <= MAX_UNICODE); - incr = 2+7+1; - } - if (requiredsize > PY_SSIZE_T_MAX - incr) - goto overflow; - requiredsize += incr; - } - - str = _PyBytesWriter_Prepare(&writer, str, requiredsize-1); + case _Py_ERROR_BACKSLASHREPLACE: + str = backslashreplace(&writer, 1, str, + unicode, collstart, collend); if (str == NULL) goto onError; + pos = collend; + break; - /* generate replacement */ - for (i = collstart; i < collend; ++i) { - str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i)); - } + case _Py_ERROR_XMLCHARREFREPLACE: + str = xmlcharrefreplace(&writer, 1, str, + unicode, collstart, collend); + if (str == NULL) + goto onError; pos = collend; break; @@ -6810,9 +6907,11 @@ unicode_encode_ucs1(PyObject *unicode, if (PyBytes_Check(repunicode)) { /* Directly copy bytes result to output. 
*/ repsize = PyBytes_Size(repunicode); - str = _PyBytesWriter_Prepare(&writer, str, repsize-1); - if (str == NULL) - goto onError; + if (repsize > 1) { + str = _PyBytesWriter_Prepare(&writer, str, repsize-1); + if (str == NULL) + goto onError; + } memcpy(str, PyBytes_AsString(repunicode), repsize); str += repsize; pos = newpos; @@ -6856,10 +6955,6 @@ unicode_encode_ucs1(PyObject *unicode, Py_XDECREF(exc); return _PyBytesWriter_Finish(&writer, str); - overflow: - PyErr_SetString(PyExc_OverflowError, - "encoded result is too long for a Python string"); - onError: _PyBytesWriter_Dealloc(&writer); Py_XDECREF(error_handler_obj);