From d00f7b1b9d12dd6f29d7616217900785c4f6674d Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 15 Nov 2024 13:48:57 +0100 Subject: [PATCH] gh-125063: marshal: Add version 5, improve documentation (GH-126829) * Document that slices can be marshalled * Deduplicate and organize the list of supported types in docs * Organize the type code list in marshal.c, to make it more obvious that this is a versioned format * Back-fill some historical info Co-authored-by: Michael Droettboom --- Doc/c-api/marshal.rst | 9 +-- Doc/library/marshal.rst | 64 +++++++++++++------ Include/marshal.h | 2 +- Lib/test/test_marshal.py | 27 +++++++- ...-11-14-13-16-20.gh-issue-125063.kJ-WnH.rst | 2 + Programs/_freeze_module.c | 1 + Python/marshal.c | 46 ++++++++----- 7 files changed, 110 insertions(+), 41 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-11-14-13-16-20.gh-issue-125063.kJ-WnH.rst diff --git a/Doc/c-api/marshal.rst b/Doc/c-api/marshal.rst index b9085ad3ec3..61218a1bf6f 100644 --- a/Doc/c-api/marshal.rst +++ b/Doc/c-api/marshal.rst @@ -13,11 +13,12 @@ binary mode. Numeric values are stored with the least significant byte first. -The module supports two versions of the data format: version 0 is the -historical version, version 1 shares interned strings in the file, and upon -unmarshalling. Version 2 uses a binary format for floating-point numbers. -``Py_MARSHAL_VERSION`` indicates the current file format (currently 2). +The module supports several versions of the data format; see +the :py:mod:`Python module documentation <marshal>` for details. +.. c:macro:: Py_MARSHAL_VERSION + + The current format version. See :py:data:`marshal.version`. .. c:function:: void PyMarshal_WriteLongToFile(long value, FILE *file, int version) diff --git a/Doc/library/marshal.rst b/Doc/library/marshal.rst index 9e4606df0f7..8b14ee449d4 100644 --- a/Doc/library/marshal.rst +++ b/Doc/library/marshal.rst @@ -38,23 +38,39 @@ supports a substantially wider range of objects than marshal.
maliciously constructed data. Never unmarshal data received from an untrusted or unauthenticated source. +There are functions that read/write files as well as functions operating on +bytes-like objects. + .. index:: object; code, code object Not all Python object types are supported; in general, only objects whose value is independent from a particular invocation of Python can be written and read by -this module. The following types are supported: booleans, integers, floating-point -numbers, complex numbers, strings, bytes, bytearrays, tuples, lists, sets, -frozensets, dictionaries, and code objects (if *allow_code* is true), -where it should be understood that -tuples, lists, sets, frozensets and dictionaries are only supported as long as -the values contained therein are themselves supported. The -singletons :const:`None`, :const:`Ellipsis` and :exc:`StopIteration` can also be -marshalled and unmarshalled. -For format *version* lower than 3, recursive lists, sets and dictionaries cannot -be written (see below). +this module. The following types are supported: + +* Numeric types: :class:`int`, :class:`bool`, :class:`float`, :class:`complex`. +* Strings (:class:`str`) and :class:`bytes`. + :term:`Bytes-like objects <bytes-like object>` like :class:`bytearray` are + marshalled as :class:`!bytes`. +* Containers: :class:`tuple`, :class:`list`, :class:`dict`, :class:`set`, + :class:`frozenset`, and (since :data:`version` 5), :class:`slice`. + It should be understood that these are supported only if the values contained + therein are themselves supported. + Recursive containers are supported since :data:`version` 3. +* The singletons :const:`None`, :const:`Ellipsis` and :exc:`StopIteration`. +* :class:`code` objects, if *allow_code* is true. See note above about + version dependence. + +.. versionchanged:: 3.4 + + * Added format version 3, which supports marshalling recursive lists, sets + and dictionaries. + * Added format version 4, which supports efficient representations + of short strings. + +..
versionchanged:: next + + Added format version 5, which allows marshalling slices. -There are functions that read/write files as well as functions operating on -bytes-like objects. The module defines these functions: @@ -140,11 +156,24 @@ In addition, the following constants are defined: .. data:: version - Indicates the format that the module uses. Version 0 is the historical - format, version 1 shares interned strings and version 2 uses a binary format - for floating-point numbers. - Version 3 adds support for object instancing and recursion. - The current version is 4. + Indicates the format that the module uses. + Version 0 is the historical first version; subsequent versions + add new features. + Generally, a new version becomes the default when it is introduced. + + ======= =============== ==================================================== + Version Available since New features + ======= =============== ==================================================== + 1 Python 2.4 Sharing interned strings + ------- --------------- ---------------------------------------------------- + 2 Python 2.5 Binary representation of floats + ------- --------------- ---------------------------------------------------- + 3 Python 3.4 Support for object instancing and recursion + ------- --------------- ---------------------------------------------------- + 4 Python 3.4 Efficient representation of short strings + ------- --------------- ---------------------------------------------------- + 5 Python 3.14 Support for :class:`slice` objects + ======= =============== ==================================================== .. rubric:: Footnotes @@ -154,4 +183,3 @@ In addition, the following constants are defined: around in a self-contained form. Strictly speaking, "to marshal" means to convert some data from internal to external form (in an RPC buffer for instance) and "unmarshalling" for the reverse process. 
- diff --git a/Include/marshal.h b/Include/marshal.h index f8b0de80cfc..f773587bdd0 100644 --- a/Include/marshal.h +++ b/Include/marshal.h @@ -13,7 +13,7 @@ PyAPI_FUNC(PyObject *) PyMarshal_ReadObjectFromString(const char *, Py_ssize_t); PyAPI_FUNC(PyObject *) PyMarshal_WriteObjectToString(PyObject *, int); -#define Py_MARSHAL_VERSION 4 +#define Py_MARSHAL_VERSION 5 PyAPI_FUNC(long) PyMarshal_ReadLongFromFile(FILE *); PyAPI_FUNC(int) PyMarshal_ReadShortFromFile(FILE *); diff --git a/Lib/test/test_marshal.py b/Lib/test/test_marshal.py index 64ee1ba867d..93b8684c725 100644 --- a/Lib/test/test_marshal.py +++ b/Lib/test/test_marshal.py @@ -28,6 +28,13 @@ def helper(self, sample, *extra): finally: os_helper.unlink(os_helper.TESTFN) +def omit_last_byte(data): + """return data[:-1]""" + # This file's code is used in CompatibilityTestCase, + # but slices need marshal version 5. + # Avoid the slice literal. + return data[slice(0, -1)] + class IntTestCase(unittest.TestCase, HelperMixin): def test_ints(self): # Test a range of Python ints larger than the machine word size. 
@@ -241,7 +248,8 @@ def test_bug_5888452(self): def test_patch_873224(self): self.assertRaises(Exception, marshal.loads, b'0') self.assertRaises(Exception, marshal.loads, b'f') - self.assertRaises(Exception, marshal.loads, marshal.dumps(2**65)[:-1]) + self.assertRaises(Exception, marshal.loads, + omit_last_byte(marshal.dumps(2**65))) def test_version_argument(self): # Python 2.4.0 crashes for any call to marshal.dumps(x, y) @@ -594,6 +602,19 @@ def testNoIntern(self): s2 = sys.intern(s) self.assertNotEqual(id(s2), id(s)) +class SliceTestCase(unittest.TestCase, HelperMixin): + def test_slice(self): + for obj in ( + slice(None), slice(1), slice(1, 2), slice(1, 2, 3), + slice({'set'}, ('tuple', {'with': 'dict'}, ), self.helper.__code__) + ): + with self.subTest(obj=str(obj)): + self.helper(obj) + + for version in range(5): + with self.assertRaises(ValueError): + marshal.dumps(obj, version) + @support.cpython_only @unittest.skipUnless(_testcapi, 'requires _testcapi') class CAPI_TestCase(unittest.TestCase, HelperMixin): @@ -654,7 +675,7 @@ def test_read_last_object_from_file(self): self.assertEqual(r, obj) with open(os_helper.TESTFN, 'wb') as f: - f.write(data[:1]) + f.write(omit_last_byte(data)) with self.assertRaises(EOFError): _testcapi.pymarshal_read_last_object_from_file(os_helper.TESTFN) os_helper.unlink(os_helper.TESTFN) @@ -671,7 +692,7 @@ def test_read_object_from_file(self): self.assertEqual(p, len(data)) with open(os_helper.TESTFN, 'wb') as f: - f.write(data[:1]) + f.write(omit_last_byte(data)) with self.assertRaises(EOFError): _testcapi.pymarshal_read_object_from_file(os_helper.TESTFN) os_helper.unlink(os_helper.TESTFN) diff --git a/Misc/NEWS.d/next/Library/2024-11-14-13-16-20.gh-issue-125063.kJ-WnH.rst b/Misc/NEWS.d/next/Library/2024-11-14-13-16-20.gh-issue-125063.kJ-WnH.rst new file mode 100644 index 00000000000..5ddf41206db --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-11-14-13-16-20.gh-issue-125063.kJ-WnH.rst @@ -0,0 +1,2 @@ +:mod:`marshal` now
supports :class:`slice` objects. The marshal format +version was increased to 5. diff --git a/Programs/_freeze_module.c b/Programs/_freeze_module.c index 891e4256e89..06d1ee016dc 100644 --- a/Programs/_freeze_module.c +++ b/Programs/_freeze_module.c @@ -121,6 +121,7 @@ compile_and_marshal(const char *name, const char *text) return NULL; } + assert(Py_MARSHAL_VERSION >= 5); PyObject *marshalled = PyMarshal_WriteObjectToString(code, Py_MARSHAL_VERSION); Py_CLEAR(code); if (marshalled == NULL) { diff --git a/Python/marshal.c b/Python/marshal.c index a280fbfd078..72afa4ff894 100644 --- a/Python/marshal.c +++ b/Python/marshal.c @@ -50,41 +50,52 @@ module marshal # define MAX_MARSHAL_STACK_DEPTH 2000 #endif +/* Supported types */ #define TYPE_NULL '0' #define TYPE_NONE 'N' #define TYPE_FALSE 'F' #define TYPE_TRUE 'T' #define TYPE_STOPITER 'S' #define TYPE_ELLIPSIS '.' -#define TYPE_INT 'i' -/* TYPE_INT64 is not generated anymore. - Supported for backward compatibility only. */ -#define TYPE_INT64 'I' -#define TYPE_FLOAT 'f' -#define TYPE_BINARY_FLOAT 'g' -#define TYPE_COMPLEX 'x' -#define TYPE_BINARY_COMPLEX 'y' -#define TYPE_LONG 'l' -#define TYPE_STRING 's' -#define TYPE_INTERNED 't' -#define TYPE_REF 'r' -#define TYPE_TUPLE '(' +#define TYPE_BINARY_FLOAT 'g' // Versions 0 and 1 use TYPE_FLOAT instead. +#define TYPE_BINARY_COMPLEX 'y' // Versions 0 and 1 use TYPE_COMPLEX instead. +#define TYPE_LONG 'l' // See also TYPE_INT. +#define TYPE_STRING 's' // Bytes. (Name comes from Python 2.) +#define TYPE_TUPLE '(' // See also TYPE_SMALL_TUPLE. #define TYPE_LIST '[' #define TYPE_DICT '{' #define TYPE_CODE 'c' #define TYPE_UNICODE 'u' #define TYPE_UNKNOWN '?' +// added in version 2: #define TYPE_SET '<' #define TYPE_FROZENSET '>' +// added in version 5: #define TYPE_SLICE ':' -#define FLAG_REF '\x80' /* with a type, add obj to index */ +// Remember to update the version and documentation when adding new types.
+/* Special cases for unicode strings (added in version 4) */ +#define TYPE_INTERNED 't' // Version 1+ #define TYPE_ASCII 'a' #define TYPE_ASCII_INTERNED 'A' -#define TYPE_SMALL_TUPLE ')' #define TYPE_SHORT_ASCII 'z' #define TYPE_SHORT_ASCII_INTERNED 'Z' +/* Special cases for small objects */ +#define TYPE_INT 'i' // All versions. 32-bit encoding. +#define TYPE_SMALL_TUPLE ')' // Version 4+ + +/* Supported for backwards compatibility */ +#define TYPE_COMPLEX 'x' // Generated for versions 0 and 1 only. +#define TYPE_FLOAT 'f' // Generated for versions 0 and 1 only. +#define TYPE_INT64 'I' // Not generated any more. + +/* References (added in version 3) */ +#define TYPE_REF 'r' +#define FLAG_REF '\x80' /* with a type, add obj to index */ + + +// Error codes: #define WFERR_OK 0 #define WFERR_UNMARSHALLABLE 1 #define WFERR_NESTEDTOODEEP 2 @@ -615,6 +626,11 @@ w_complex_object(PyObject *v, char flag, WFILE *p) PyBuffer_Release(&view); } else if (PySlice_Check(v)) { + if (p->version < 5) { + w_byte(TYPE_UNKNOWN, p); + p->error = WFERR_UNMARSHALLABLE; + return; + } PySliceObject *slice = (PySliceObject *)v; W_TYPE(TYPE_SLICE, p); w_object(slice->start, p);