From b2750b5d334e9c8d262009069bce41c15803eca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lemburg?= Date: Fri, 6 Jun 2008 12:18:17 +0000 Subject: [PATCH] Move the codec decode type checks to bytes/bytearray.decode(). Use faster PyUnicode_FromEncodedObject() for bytes/bytearray.decode(). Add new PyCodec_KnownEncoding() API. Add new PyUnicode_AsDecodedUnicode() and PyUnicode_AsEncodedUnicode() APIs. Add missing PyUnicode_AsDecodedObject() to unicodeobject.h. Fix punycode codec to also work on memoryviews. --- Include/codecs.h | 13 +++++- Include/unicodeobject.h | 36 ++++++++++++++- Lib/encodings/punycode.py | 2 + Objects/bytearrayobject.c | 4 +- Objects/bytesobject.c | 4 +- Objects/unicodeobject.c | 96 ++++++++++++++++++++++++++++++++++++--- Python/codecs.c | 45 ++++++++++-------- Python/pythonrun.c | 12 ++--- 8 files changed, 171 insertions(+), 41 deletions(-) diff --git a/Include/codecs.h b/Include/codecs.h index 0d76241dbf5..c979e86a2fc 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -27,7 +27,7 @@ PyAPI_FUNC(int) PyCodec_Register( PyObject *search_function ); -/* Codec register lookup API. +/* Codec registry lookup API. Looks up the given encoding and returns a CodecInfo object with function attributes which implement the different aspects of @@ -49,6 +49,17 @@ PyAPI_FUNC(PyObject *) _PyCodec_Lookup( const char *encoding ); +/* Codec registry encoding check API. + + Returns 1/0 depending on whether there is a registered codec for + the given encoding. + +*/ + +PyAPI_FUNC(int) PyCodec_KnownEncoding( + const char *encoding + ); + /* Generic codec based encoding API. 
object is passed through the encoder function found for the given diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 384cd5519f6..7af2eba7880 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -139,8 +139,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_AsASCIIString PyUnicodeUCS2_AsASCIIString # define PyUnicode_AsCharmapString PyUnicodeUCS2_AsCharmapString +# define PyUnicode_AsDecodedObject PyUnicodeUCS2_AsDecodedObject +# define PyUnicode_AsDecodedUnicode PyUnicodeUCS2_AsDecodedUnicode # define PyUnicode_AsEncodedObject PyUnicodeUCS2_AsEncodedObject # define PyUnicode_AsEncodedString PyUnicodeUCS2_AsEncodedString +# define PyUnicode_AsEncodedUnicode PyUnicodeUCS2_AsEncodedUnicode # define PyUnicode_AsLatin1String PyUnicodeUCS2_AsLatin1String # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS2_AsRawUnicodeEscapeString # define PyUnicode_AsUTF32String PyUnicodeUCS2_AsUTF32String @@ -233,8 +236,11 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_AsASCIIString PyUnicodeUCS4_AsASCIIString # define PyUnicode_AsCharmapString PyUnicodeUCS4_AsCharmapString +# define PyUnicode_AsDecodedObject PyUnicodeUCS4_AsDecodedObject +# define PyUnicode_AsDecodedUnicode PyUnicodeUCS4_AsDecodedUnicode # define PyUnicode_AsEncodedObject PyUnicodeUCS4_AsEncodedObject # define PyUnicode_AsEncodedString PyUnicodeUCS4_AsEncodedString +# define PyUnicode_AsEncodedUnicode PyUnicodeUCS4_AsEncodedUnicode # define PyUnicode_AsLatin1String PyUnicodeUCS4_AsLatin1String # define PyUnicode_AsRawUnicodeEscapeString PyUnicodeUCS4_AsRawUnicodeEscapeString # define PyUnicode_AsUTF32String PyUnicodeUCS4_AsUTF32String @@ -744,6 +750,24 @@ PyAPI_FUNC(PyObject*) PyUnicode_Decode( const char *errors /* error handling */ ); +/* Decode a Unicode object unicode and return the result as Python + object. 
*/ + +PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Decode a Unicode object unicode and return the result as Unicode + object. */ + +PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + /* Encodes a Py_UNICODE buffer of the given size and returns a Python string object. */ @@ -772,11 +796,21 @@ PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString( const char *errors /* error handling */ ); +/* Encodes a Unicode object and returns the result as Unicode + object. */ + +PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode( + PyObject *unicode, /* Unicode object */ + const char *encoding, /* encoding */ + const char *errors /* error handling */ + ); + +/* Build an encoding map. */ + PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap( PyObject* string /* 256 character map */ ); - /* --- UTF-7 Codecs ------------------------------------------------------- */ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7( diff --git a/Lib/encodings/punycode.py b/Lib/encodings/punycode.py index b801a46092f..8129af2543a 100644 --- a/Lib/encodings/punycode.py +++ b/Lib/encodings/punycode.py @@ -183,6 +183,8 @@ def insertion_sort(base, extended, errors): def punycode_decode(text, errors): if isinstance(text, str): text = text.encode("ascii") + if isinstance(text, memoryview): + text = bytes(text) pos = text.rfind(b"-") if pos == -1: base = "" diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index 75a8eef9c0b..70921c095e8 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -725,7 +725,7 @@ bytes_init(PyByteArrayObject *self, PyObject *args, PyObject *kwds) "string argument without an encoding"); return -1; } - encoded = PyCodec_Encode(arg, encoding, errors); + encoded = PyUnicode_AsEncodedString(arg, 
encoding, errors); if (encoded == NULL) return -1; assert(PyBytes_Check(encoded)); @@ -2854,7 +2854,7 @@ bytes_decode(PyObject *self, PyObject *args) return NULL; if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); - return PyCodec_Decode(self, encoding, errors); + return PyUnicode_FromEncodedObject(self, encoding, errors); } PyDoc_STRVAR(alloc_doc, diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index ab6207be94c..471d09c7ec5 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -2713,7 +2713,7 @@ string_decode(PyObject *self, PyObject *args) return NULL; if (encoding == NULL) encoding = PyUnicode_GetDefaultEncoding(); - return PyCodec_Decode(self, encoding, errors); + return PyUnicode_FromEncodedObject(self, encoding, errors); } @@ -2899,7 +2899,7 @@ string_new(PyTypeObject *type, PyObject *args, PyObject *kwds) "string argument without an encoding"); return NULL; } - new = PyCodec_Encode(x, encoding, errors); + new = PyUnicode_AsEncodedString(x, encoding, errors); if (new == NULL) return NULL; assert(PyBytes_Check(new)); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 78e38b5e384..fc8c8a9dd2a 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1099,14 +1099,18 @@ PyObject *PyUnicode_FromEncodedObject(register PyObject *obj, /* Coerce object */ if (PyBytes_Check(obj)) { - s = PyBytes_AS_STRING(obj); - len = PyBytes_GET_SIZE(obj); - } + s = PyBytes_AS_STRING(obj); + len = PyBytes_GET_SIZE(obj); + } + else if (PyByteArray_Check(obj)) { + s = PyByteArray_AS_STRING(obj); + len = PyByteArray_GET_SIZE(obj); + } else if (PyObject_AsCharBuffer(obj, &s, &len)) { /* Overwrite the error message with something more useful in case of a TypeError. 
*/ if (PyErr_ExceptionMatches(PyExc_TypeError)) - PyErr_Format(PyExc_TypeError, + PyErr_Format(PyExc_TypeError, "coercing to Unicode: need string or buffer, " "%.80s found", Py_TYPE(obj)->tp_name); @@ -1188,7 +1192,7 @@ PyObject *PyUnicode_Decode(const char *s, goto onError; if (!PyUnicode_Check(unicode)) { PyErr_Format(PyExc_TypeError, - "decoder did not return an unicode object (type=%.400s)", + "decoder did not return a unicode object (type=%.400s)", Py_TYPE(unicode)->tp_name); Py_DECREF(unicode); goto onError; @@ -1225,6 +1229,37 @@ PyObject *PyUnicode_AsDecodedObject(PyObject *unicode, return NULL; } +PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode, + const char *encoding, + const char *errors) +{ + PyObject *v; + + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + goto onError; + } + + if (encoding == NULL) + encoding = PyUnicode_GetDefaultEncoding(); + + /* Decode via the codec registry */ + v = PyCodec_Decode(unicode, encoding, errors); + if (v == NULL) + goto onError; + if (!PyUnicode_Check(v)) { + PyErr_Format(PyExc_TypeError, + "decoder did not return a unicode object (type=%.400s)", + Py_TYPE(v)->tp_name); + Py_DECREF(v); + goto onError; + } + return v; + + onError: + return NULL; +} + PyObject *PyUnicode_Encode(const Py_UNICODE *s, Py_ssize_t size, const char *encoding, @@ -1296,7 +1331,60 @@ PyObject *PyUnicode_AsEncodedString(PyObject *unicode, v = PyCodec_Encode(unicode, encoding, errors); if (v == NULL) goto onError; - assert(PyBytes_Check(v)); + /* v is a new reference from PyCodec_Encode(): release it on every path that does not return it. */ + if (PyByteArray_Check(v)) { + char msg[100]; + PyObject *b; + PyOS_snprintf(msg, sizeof(msg), + "encoder %s returned buffer instead of bytes", + encoding); + if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) { + Py_DECREF(v); + goto onError; + } + /* Copy the buffer into an immutable bytes object, then drop the bytearray. */ + b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); + Py_DECREF(v); + return b; + } + else if (!PyBytes_Check(v)) { + PyErr_Format(PyExc_TypeError, + "encoder did not return a bytes object (type=%.400s)", + Py_TYPE(v)->tp_name); + Py_DECREF(v); + goto onError; + } + return v; + + onError: +
return NULL; +} + +PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode, + const char *encoding, + const char *errors) +{ + PyObject *v; + + if (!PyUnicode_Check(unicode)) { + PyErr_BadArgument(); + goto onError; + } + + if (encoding == NULL) + encoding = PyUnicode_GetDefaultEncoding(); + + /* Encode via the codec registry */ + v = PyCodec_Encode(unicode, encoding, errors); + if (v == NULL) + goto onError; + if (!PyUnicode_Check(v)) { + PyErr_Format(PyExc_TypeError, + "encoder did not return a unicode object (type=%.400s)", + Py_TYPE(v)->tp_name); + Py_DECREF(v); + goto onError; + } return v; onError: @@ -6617,7 +6705,7 @@ unicode_encode(PyUnicodeObject *self, PyObject *args) if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors)) return NULL; - v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors); + v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors); if (v == NULL) goto onError; if (!PyBytes_Check(v)) { diff --git a/Python/codecs.c b/Python/codecs.c index 33f0733e20c..66576c481c9 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -183,6 +183,23 @@ PyObject *_PyCodec_Lookup(const char *encoding) return NULL; } +/* Codec registry encoding check API. 
*/ + +int PyCodec_KnownEncoding(const char *encoding) +{ + PyObject *codecs; + + codecs = _PyCodec_Lookup(encoding); + if (!codecs) { + PyErr_Clear(); + return 0; + } + else { + Py_DECREF(codecs); + return 1; + } +} + static PyObject *args_tuple(PyObject *object, const char *errors) @@ -344,32 +361,20 @@ PyObject *PyCodec_Encode(PyObject *object, "encoder must return a tuple (object, integer)"); goto onError; } - v = PyTuple_GET_ITEM(result, 0); - if (PyByteArray_Check(v)) { - char msg[100]; - PyOS_snprintf(msg, sizeof(msg), - "encoder %s returned buffer instead of bytes", - encoding); - if (PyErr_WarnEx(PyExc_RuntimeWarning, msg, 1) < 0) { - v = NULL; - goto onError; - } - v = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); - } - else if (PyBytes_Check(v)) - Py_INCREF(v); - else { - PyErr_SetString(PyExc_TypeError, - "encoding must return a tuple(bytes, integer)"); - v = NULL; - } + v = PyTuple_GET_ITEM(result,0); + Py_INCREF(v); /* We don't check or use the second (integer) entry. */ + Py_DECREF(args); + Py_DECREF(encoder); + Py_DECREF(result); + return v; + onError: Py_XDECREF(result); Py_XDECREF(args); Py_XDECREF(encoder); - return v; + return NULL; } /* Decode an object (usually a Python string) using the given encoding diff --git a/Python/pythonrun.c b/Python/pythonrun.c index 7fe4ccea551..24517e4764f 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -261,14 +261,10 @@ Py_InitializeEx(int install_sigs) codeset = nl_langinfo(CODESET); if (codeset && *codeset) { - PyObject *enc = PyCodec_Encoder(codeset); - if (enc) { - codeset = strdup(codeset); - Py_DECREF(enc); - } else { - codeset = NULL; - PyErr_Clear(); - } + if (PyCodec_KnownEncoding(codeset)) + codeset = strdup(codeset); + else + codeset = NULL; } else codeset = NULL;