From 2fc8f773e15863fcde881e7e52a5c64896baa5df Mon Sep 17 00:00:00 2001 From: Georg Brandl Date: Sun, 2 Mar 2014 09:18:31 +0100 Subject: [PATCH] Issue #20404: reject non-text encodings early in TextIOWrapper. --- Include/codecs.h | 20 +++++++++++ Lib/_pyio.py | 5 +++ Lib/test/test_io.py | 30 ++++++++++++---- Modules/_io/textio.c | 34 +++++++++++------- Python/codecs.c | 84 +++++++++++++++++++++++++++++++++----------- 5 files changed, 134 insertions(+), 39 deletions(-) diff --git a/Include/codecs.h b/Include/codecs.h index 8f0014e2377..611964ce4b3 100644 --- a/Include/codecs.h +++ b/Include/codecs.h @@ -104,7 +104,14 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode( Please note that these APIs are internal and should not be used in Python C extensions. + XXX (ncoghlan): should we make these, or something like them, public + in Python 3.5+? + */ +PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding( + const char *encoding, + const char *alternate_command + ); PyAPI_FUNC(PyObject *) _PyCodec_EncodeText( PyObject *object, @@ -117,6 +124,19 @@ PyAPI_FUNC(PyObject *) _PyCodec_DecodeText( const char *encoding, const char *errors ); + +/* These two aren't actually text encoding specific, but _io.TextIOWrapper + * is the only current API consumer. + */ +PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder( + PyObject *codec_info, + const char *errors + ); + +PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder( + PyObject *codec_info, + const char *errors + ); #endif diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 9a2a1aa69c3..a0c4b259770 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1495,6 +1495,11 @@ def __init__(self, buffer, encoding=None, errors=None, newline=None, if not isinstance(encoding, str): raise ValueError("invalid encoding: %r" % encoding) + if not codecs.lookup(encoding)._is_text_encoding: + msg = ("%r is not a text encoding; " + "use codecs.open() to handle arbitrary codecs") + raise LookupError(msg % encoding) + if errors is None: errors = "strict" else: diff --git a/Lib/test/test_io.py b/Lib/test/test_io.py index ac6d478e32f..2a96b7b8e79 100644 --- a/Lib/test/test_io.py +++ b/Lib/test/test_io.py @@ -1955,6 +1955,15 @@ def test_constructor(self): self.assertRaises(TypeError, t.__init__, b, newline=42) self.assertRaises(ValueError, t.__init__, b, newline='xyzzy') + def test_non_text_encoding_codecs_are_rejected(self): + # Ensure the constructor complains if passed a codec that isn't + # marked as a text encoding + # http://bugs.python.org/issue20404 + r = self.BytesIO() + b = self.BufferedWriter(r) + with self.assertRaisesRegex(LookupError, "is not a text encoding"): + self.TextIOWrapper(b, encoding="hex_codec") + def test_detach(self): r = self.BytesIO() b = self.BufferedWriter(r) @@ -2607,15 +2616,22 @@ def test_read_nonbytes(self): def test_illegal_decoder(self): # Issue #17106 + # Bypass the early encoding check added in issue 20404 + def _make_illegal_wrapper(): + quopri = codecs.lookup("quopri_codec") + quopri._is_text_encoding = True + try: + t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), + newline='\n', encoding="quopri_codec") + finally: + quopri._is_text_encoding = False + return t # Crash when decoder returns non-string - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.read, 1) - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.readline) - t = self.TextIOWrapper(self.BytesIO(b'aaaaaa'), newline='\n', - encoding='quopri_codec') + t = _make_illegal_wrapper() self.assertRaises(TypeError, t.read) @@ -3053,6 +3069,7 @@ def test_open_allargs(self): class CMiscIOTest(MiscIOTest): io = io + shutdown_error = "RuntimeError: could not find io module state" def test_readinto_buffer_overflow(self): # Issue #18025 @@ -3065,6 +3082,7 @@ def read(self, n=-1): class PyMiscIOTest(MiscIOTest): io = pyio + shutdown_error = "LookupError: unknown encoding: ascii" @unittest.skipIf(os.name == 'nt', 'POSIX signals required for this test.') diff --git a/Modules/_io/textio.c b/Modules/_io/textio.c index 603ee60fa93..e8f9984c909 100644 --- a/Modules/_io/textio.c +++ b/Modules/_io/textio.c @@ -836,7 +836,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) char *kwlist[] = {"buffer", "encoding", "errors", "newline", "line_buffering", "write_through", NULL}; - PyObject *buffer, *raw; + PyObject *buffer, *raw, *codec_info = NULL; char *encoding = NULL; char *errors = NULL; char *newline = NULL; @@ -951,6 +951,17 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) "could not determine default encoding"); } + /* Check we have been asked for a real text encoding */ + codec_info = _PyCodec_LookupTextEncoding(encoding, "codecs.open()"); + if (codec_info == NULL) { + Py_CLEAR(self->encoding); + goto error; + } + + /* XXX: Failures beyond this point have the potential to leak elements + * of the partially constructed object (like self->encoding) + */ + if (errors == NULL) errors = "strict"; self->errors = PyBytes_FromString(errors); @@ -965,7 +976,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (newline) { self->readnl = PyUnicode_FromString(newline); if (self->readnl == NULL) - return -1; + goto error; } self->writetranslate = (newline == NULL || newline[0] != '\0'); if (!self->readuniversal && self->readnl) { @@ -989,8 +1000,8 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (r == -1) goto error; if (r == 1) { - self->decoder = PyCodec_IncrementalDecoder( - encoding, errors); + self->decoder = _PyCodecInfo_GetIncrementalDecoder(codec_info, + errors); if (self->decoder == NULL) goto error; @@ -1014,17 +1025,12 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) if (r == -1) goto error; if (r == 1) { - PyObject *ci; - self->encoder = PyCodec_IncrementalEncoder( - encoding, errors); + self->encoder = _PyCodecInfo_GetIncrementalEncoder(codec_info, + errors); if (self->encoder == NULL) goto error; /* Get the normalized named of the codec */ - ci = _PyCodec_Lookup(encoding); - if (ci == NULL) - goto error; - res = _PyObject_GetAttrId(ci, &PyId_name); - Py_DECREF(ci); + res = _PyObject_GetAttrId(codec_info, &PyId_name); if (res == NULL) { if (PyErr_ExceptionMatches(PyExc_AttributeError)) PyErr_Clear(); @@ -1044,6 +1050,9 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) Py_XDECREF(res); } + /* Finished sorting out the codec details */ + Py_DECREF(codec_info); + self->buffer = buffer; Py_INCREF(buffer); @@ -1106,6 +1115,7 @@ textiowrapper_init(textio *self, PyObject *args, PyObject *kwds) return 0; error: + Py_XDECREF(codec_info); return -1; } diff --git a/Python/codecs.c b/Python/codecs.c index 5ebc4cb5f6e..0b736c1715b 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -243,20 +243,15 @@ PyObject *codec_getitem(const char *encoding, int index) return v; } -/* Helper function to create an incremental codec. */ - +/* Helper functions to create an incremental codec. */ static -PyObject *codec_getincrementalcodec(const char *encoding, - const char *errors, - const char *attrname) +PyObject *codec_makeincrementalcodec(PyObject *codec_info, + const char *errors, + const char *attrname) { - PyObject *codecs, *ret, *inccodec; + PyObject *ret, *inccodec; - codecs = _PyCodec_Lookup(encoding); - if (codecs == NULL) - return NULL; - inccodec = PyObject_GetAttrString(codecs, attrname); - Py_DECREF(codecs); + inccodec = PyObject_GetAttrString(codec_info, attrname); if (inccodec == NULL) return NULL; if (errors) @@ -267,6 +262,21 @@ PyObject *codec_getincrementalcodec(const char *encoding, return ret; } +static +PyObject *codec_getincrementalcodec(const char *encoding, + const char *errors, + const char *attrname) +{ + PyObject *codec_info, *ret; + + codec_info = _PyCodec_Lookup(encoding); + if (codec_info == NULL) + return NULL; + ret = codec_makeincrementalcodec(codec_info, errors, attrname); + Py_DECREF(codec_info); + return ret; +} + /* Helper function to create a stream codec. */ static @@ -290,6 +300,24 @@ PyObject *codec_getstreamcodec(const char *encoding, return streamcodec; } +/* Helpers to work with the result of _PyCodec_Lookup + + */ +PyObject *_PyCodecInfo_GetIncrementalDecoder(PyObject *codec_info, + const char *errors) +{ + return codec_makeincrementalcodec(codec_info, errors, + "incrementaldecoder"); +} + +PyObject *_PyCodecInfo_GetIncrementalEncoder(PyObject *codec_info, + const char *errors) +{ + return codec_makeincrementalcodec(codec_info, errors, + "incrementalencoder"); +} + + /* Convenience APIs to query the Codec registry. All APIs return a codec object with incremented refcount. @@ -447,15 +475,12 @@ PyObject *PyCodec_Decode(PyObject *object, } /* Text encoding/decoding API */ -static -PyObject *codec_getitem_checked(const char *encoding, - const char *operation_name, - int index) +PyObject * _PyCodec_LookupTextEncoding(const char *encoding, + const char *alternate_command) { _Py_IDENTIFIER(_is_text_encoding); PyObject *codec; PyObject *attr; - PyObject *v; int is_text_codec; codec = _PyCodec_Lookup(encoding); @@ -482,27 +507,44 @@ PyObject *codec_getitem_checked(const char *encoding, Py_DECREF(codec); PyErr_Format(PyExc_LookupError, "'%.400s' is not a text encoding; " - "use codecs.%s() to handle arbitrary codecs", - encoding, operation_name); + "use %s to handle arbitrary codecs", + encoding, alternate_command); return NULL; } } } + /* This appears to be a valid text encoding */ + return codec; +} + + +static +PyObject *codec_getitem_checked(const char *encoding, + const char *alternate_command, + int index) +{ + PyObject *codec; + PyObject *v; + + codec = _PyCodec_LookupTextEncoding(encoding, alternate_command); + if (codec == NULL) + return NULL; + v = PyTuple_GET_ITEM(codec, index); - Py_DECREF(codec); Py_INCREF(v); + Py_DECREF(codec); return v; } static PyObject * _PyCodec_TextEncoder(const char *encoding) { - return codec_getitem_checked(encoding, "encode", 0); + return codec_getitem_checked(encoding, "codecs.encode()", 0); } static PyObject * _PyCodec_TextDecoder(const char *encoding) { - return codec_getitem_checked(encoding, "decode", 1); + return codec_getitem_checked(encoding, "codecs.decode()", 1); } PyObject *_PyCodec_EncodeText(PyObject *object,