diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 74ebc144ab5..755d69a873c 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -350,6 +350,14 @@ struct _Py_global_strings { STRUCT_FOR_ID(write) STRUCT_FOR_ID(zipimporter) } identifiers; + struct { + PyASCIIObject _ascii; + uint8_t _data[2]; + } ascii[128]; + struct { + PyCompactUnicodeObject _latin1; + uint8_t _data[2]; + } latin1[128]; }; /* End auto-generated code */ diff --git a/Include/internal/pycore_runtime_init.h b/Include/internal/pycore_runtime_init.h index 8b1abcde11f..5ba18267aeb 100644 --- a/Include/internal/pycore_runtime_init.h +++ b/Include/internal/pycore_runtime_init.h @@ -93,26 +93,34 @@ extern "C" { _PyBytes_SIMPLE_INIT(CH, 1) \ } +#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII) \ + { \ + .ob_base = _PyObject_IMMORTAL_INIT(&PyUnicode_Type), \ + .length = sizeof(LITERAL) - 1, \ + .hash = -1, \ + .state = { \ + .kind = 1, \ + .compact = 1, \ + .ascii = ASCII, \ + .ready = 1, \ + }, \ + } #define _PyASCIIObject_INIT(LITERAL) \ { \ - ._ascii = { \ - .ob_base = _PyObject_IMMORTAL_INIT(&PyUnicode_Type), \ - .length = sizeof(LITERAL) - 1, \ - .hash = -1, \ - .state = { \ - .kind = 1, \ - .compact = 1, \ - .ascii = 1, \ - .ready = 1, \ - }, \ - }, \ - ._data = LITERAL, \ + ._ascii = _PyUnicode_ASCII_BASE_INIT(LITERAL, 1), \ + ._data = LITERAL \ } #define INIT_STR(NAME, LITERAL) \ ._ ## NAME = _PyASCIIObject_INIT(LITERAL) #define INIT_ID(NAME) \ ._ ## NAME = _PyASCIIObject_INIT(#NAME) - +#define _PyUnicode_LATIN1_INIT(LITERAL) \ + { \ + ._latin1 = { \ + ._base = _PyUnicode_ASCII_BASE_INIT(LITERAL, 0), \ + }, \ + ._data = LITERAL, \ + } /* The following is auto-generated by Tools/scripts/generate_global_objects.py. */ #define _Py_global_objects_INIT { \ @@ -965,6 +973,266 @@ extern "C" { INIT_ID(write), \ INIT_ID(zipimporter), \ }, \ + .ascii = { \ + _PyASCIIObject_INIT("\x00"), \ + _PyASCIIObject_INIT("\x01"), \ + _PyASCIIObject_INIT("\x02"), \ + _PyASCIIObject_INIT("\x03"), \ + _PyASCIIObject_INIT("\x04"), \ + _PyASCIIObject_INIT("\x05"), \ + _PyASCIIObject_INIT("\x06"), \ + _PyASCIIObject_INIT("\x07"), \ + _PyASCIIObject_INIT("\x08"), \ + _PyASCIIObject_INIT("\x09"), \ + _PyASCIIObject_INIT("\x0a"), \ + _PyASCIIObject_INIT("\x0b"), \ + _PyASCIIObject_INIT("\x0c"), \ + _PyASCIIObject_INIT("\x0d"), \ + _PyASCIIObject_INIT("\x0e"), \ + _PyASCIIObject_INIT("\x0f"), \ + _PyASCIIObject_INIT("\x10"), \ + _PyASCIIObject_INIT("\x11"), \ + _PyASCIIObject_INIT("\x12"), \ + _PyASCIIObject_INIT("\x13"), \ + _PyASCIIObject_INIT("\x14"), \ + _PyASCIIObject_INIT("\x15"), \ + _PyASCIIObject_INIT("\x16"), \ + _PyASCIIObject_INIT("\x17"), \ + _PyASCIIObject_INIT("\x18"), \ + _PyASCIIObject_INIT("\x19"), \ + _PyASCIIObject_INIT("\x1a"), \ + _PyASCIIObject_INIT("\x1b"), \ + _PyASCIIObject_INIT("\x1c"), \ + _PyASCIIObject_INIT("\x1d"), \ + _PyASCIIObject_INIT("\x1e"), \ + _PyASCIIObject_INIT("\x1f"), \ + _PyASCIIObject_INIT("\x20"), \ + _PyASCIIObject_INIT("\x21"), \ + _PyASCIIObject_INIT("\x22"), \ + _PyASCIIObject_INIT("\x23"), \ + _PyASCIIObject_INIT("\x24"), \ + _PyASCIIObject_INIT("\x25"), \ + _PyASCIIObject_INIT("\x26"), \ + _PyASCIIObject_INIT("\x27"), \ + _PyASCIIObject_INIT("\x28"), \ + _PyASCIIObject_INIT("\x29"), \ + _PyASCIIObject_INIT("\x2a"), \ + _PyASCIIObject_INIT("\x2b"), \ + _PyASCIIObject_INIT("\x2c"), \ + _PyASCIIObject_INIT("\x2d"), \ + _PyASCIIObject_INIT("\x2e"), \ + _PyASCIIObject_INIT("\x2f"), \ + _PyASCIIObject_INIT("\x30"), \ + _PyASCIIObject_INIT("\x31"), \ + _PyASCIIObject_INIT("\x32"), \ + _PyASCIIObject_INIT("\x33"), \ + _PyASCIIObject_INIT("\x34"), \ + _PyASCIIObject_INIT("\x35"), \ + _PyASCIIObject_INIT("\x36"), \ + _PyASCIIObject_INIT("\x37"), \ + _PyASCIIObject_INIT("\x38"), \ + _PyASCIIObject_INIT("\x39"), \ + _PyASCIIObject_INIT("\x3a"), \ + _PyASCIIObject_INIT("\x3b"), \ + _PyASCIIObject_INIT("\x3c"), \ + _PyASCIIObject_INIT("\x3d"), \ + _PyASCIIObject_INIT("\x3e"), \ + _PyASCIIObject_INIT("\x3f"), \ + _PyASCIIObject_INIT("\x40"), \ + _PyASCIIObject_INIT("\x41"), \ + _PyASCIIObject_INIT("\x42"), \ + _PyASCIIObject_INIT("\x43"), \ + _PyASCIIObject_INIT("\x44"), \ + _PyASCIIObject_INIT("\x45"), \ + _PyASCIIObject_INIT("\x46"), \ + _PyASCIIObject_INIT("\x47"), \ + _PyASCIIObject_INIT("\x48"), \ + _PyASCIIObject_INIT("\x49"), \ + _PyASCIIObject_INIT("\x4a"), \ + _PyASCIIObject_INIT("\x4b"), \ + _PyASCIIObject_INIT("\x4c"), \ + _PyASCIIObject_INIT("\x4d"), \ + _PyASCIIObject_INIT("\x4e"), \ + _PyASCIIObject_INIT("\x4f"), \ + _PyASCIIObject_INIT("\x50"), \ + _PyASCIIObject_INIT("\x51"), \ + _PyASCIIObject_INIT("\x52"), \ + _PyASCIIObject_INIT("\x53"), \ + _PyASCIIObject_INIT("\x54"), \ + _PyASCIIObject_INIT("\x55"), \ + _PyASCIIObject_INIT("\x56"), \ + _PyASCIIObject_INIT("\x57"), \ + _PyASCIIObject_INIT("\x58"), \ + _PyASCIIObject_INIT("\x59"), \ + _PyASCIIObject_INIT("\x5a"), \ + _PyASCIIObject_INIT("\x5b"), \ + _PyASCIIObject_INIT("\x5c"), \ + _PyASCIIObject_INIT("\x5d"), \ + _PyASCIIObject_INIT("\x5e"), \ + _PyASCIIObject_INIT("\x5f"), \ + _PyASCIIObject_INIT("\x60"), \ + _PyASCIIObject_INIT("\x61"), \ + _PyASCIIObject_INIT("\x62"), \ + _PyASCIIObject_INIT("\x63"), \ + _PyASCIIObject_INIT("\x64"), \ + _PyASCIIObject_INIT("\x65"), \ + _PyASCIIObject_INIT("\x66"), \ + _PyASCIIObject_INIT("\x67"), \ + _PyASCIIObject_INIT("\x68"), \ + _PyASCIIObject_INIT("\x69"), \ + _PyASCIIObject_INIT("\x6a"), \ + _PyASCIIObject_INIT("\x6b"), \ + _PyASCIIObject_INIT("\x6c"), \ + _PyASCIIObject_INIT("\x6d"), \ + _PyASCIIObject_INIT("\x6e"), \ + _PyASCIIObject_INIT("\x6f"), \ + _PyASCIIObject_INIT("\x70"), \ + _PyASCIIObject_INIT("\x71"), \ + _PyASCIIObject_INIT("\x72"), \ + _PyASCIIObject_INIT("\x73"), \ + _PyASCIIObject_INIT("\x74"), \ + _PyASCIIObject_INIT("\x75"), \ + _PyASCIIObject_INIT("\x76"), \ + _PyASCIIObject_INIT("\x77"), \ + _PyASCIIObject_INIT("\x78"), \ + _PyASCIIObject_INIT("\x79"), \ + _PyASCIIObject_INIT("\x7a"), \ + _PyASCIIObject_INIT("\x7b"), \ + _PyASCIIObject_INIT("\x7c"), \ + _PyASCIIObject_INIT("\x7d"), \ + _PyASCIIObject_INIT("\x7e"), \ + _PyASCIIObject_INIT("\x7f"), \ + }, \ + .latin1 = { \ + _PyUnicode_LATIN1_INIT("\x80"), \ + _PyUnicode_LATIN1_INIT("\x81"), \ + _PyUnicode_LATIN1_INIT("\x82"), \ + _PyUnicode_LATIN1_INIT("\x83"), \ + _PyUnicode_LATIN1_INIT("\x84"), \ + _PyUnicode_LATIN1_INIT("\x85"), \ + _PyUnicode_LATIN1_INIT("\x86"), \ + _PyUnicode_LATIN1_INIT("\x87"), \ + _PyUnicode_LATIN1_INIT("\x88"), \ + _PyUnicode_LATIN1_INIT("\x89"), \ + _PyUnicode_LATIN1_INIT("\x8a"), \ + _PyUnicode_LATIN1_INIT("\x8b"), \ + _PyUnicode_LATIN1_INIT("\x8c"), \ + _PyUnicode_LATIN1_INIT("\x8d"), \ + _PyUnicode_LATIN1_INIT("\x8e"), \ + _PyUnicode_LATIN1_INIT("\x8f"), \ + _PyUnicode_LATIN1_INIT("\x90"), \ + _PyUnicode_LATIN1_INIT("\x91"), \ + _PyUnicode_LATIN1_INIT("\x92"), \ + _PyUnicode_LATIN1_INIT("\x93"), \ + _PyUnicode_LATIN1_INIT("\x94"), \ + _PyUnicode_LATIN1_INIT("\x95"), \ + _PyUnicode_LATIN1_INIT("\x96"), \ + _PyUnicode_LATIN1_INIT("\x97"), \ + _PyUnicode_LATIN1_INIT("\x98"), \ + _PyUnicode_LATIN1_INIT("\x99"), \ + _PyUnicode_LATIN1_INIT("\x9a"), \ + _PyUnicode_LATIN1_INIT("\x9b"), \ + _PyUnicode_LATIN1_INIT("\x9c"), \ + _PyUnicode_LATIN1_INIT("\x9d"), \ + _PyUnicode_LATIN1_INIT("\x9e"), \ + _PyUnicode_LATIN1_INIT("\x9f"), \ + _PyUnicode_LATIN1_INIT("\xa0"), \ + _PyUnicode_LATIN1_INIT("\xa1"), \ + _PyUnicode_LATIN1_INIT("\xa2"), \ + _PyUnicode_LATIN1_INIT("\xa3"), \ + _PyUnicode_LATIN1_INIT("\xa4"), \ + _PyUnicode_LATIN1_INIT("\xa5"), \ + _PyUnicode_LATIN1_INIT("\xa6"), \ + _PyUnicode_LATIN1_INIT("\xa7"), \ + _PyUnicode_LATIN1_INIT("\xa8"), \ + _PyUnicode_LATIN1_INIT("\xa9"), \ + _PyUnicode_LATIN1_INIT("\xaa"), \ + _PyUnicode_LATIN1_INIT("\xab"), \ + _PyUnicode_LATIN1_INIT("\xac"), \ + _PyUnicode_LATIN1_INIT("\xad"), \ + _PyUnicode_LATIN1_INIT("\xae"), \ + _PyUnicode_LATIN1_INIT("\xaf"), \ + _PyUnicode_LATIN1_INIT("\xb0"), \ + _PyUnicode_LATIN1_INIT("\xb1"), \ + _PyUnicode_LATIN1_INIT("\xb2"), \ + _PyUnicode_LATIN1_INIT("\xb3"), \ + _PyUnicode_LATIN1_INIT("\xb4"), \ + _PyUnicode_LATIN1_INIT("\xb5"), \ + _PyUnicode_LATIN1_INIT("\xb6"), \ + _PyUnicode_LATIN1_INIT("\xb7"), \ + _PyUnicode_LATIN1_INIT("\xb8"), \ + _PyUnicode_LATIN1_INIT("\xb9"), \ + _PyUnicode_LATIN1_INIT("\xba"), \ + _PyUnicode_LATIN1_INIT("\xbb"), \ + _PyUnicode_LATIN1_INIT("\xbc"), \ + _PyUnicode_LATIN1_INIT("\xbd"), \ + _PyUnicode_LATIN1_INIT("\xbe"), \ + _PyUnicode_LATIN1_INIT("\xbf"), \ + _PyUnicode_LATIN1_INIT("\xc0"), \ + _PyUnicode_LATIN1_INIT("\xc1"), \ + _PyUnicode_LATIN1_INIT("\xc2"), \ + _PyUnicode_LATIN1_INIT("\xc3"), \ + _PyUnicode_LATIN1_INIT("\xc4"), \ + _PyUnicode_LATIN1_INIT("\xc5"), \ + _PyUnicode_LATIN1_INIT("\xc6"), \ + _PyUnicode_LATIN1_INIT("\xc7"), \ + _PyUnicode_LATIN1_INIT("\xc8"), \ + _PyUnicode_LATIN1_INIT("\xc9"), \ + _PyUnicode_LATIN1_INIT("\xca"), \ + _PyUnicode_LATIN1_INIT("\xcb"), \ + _PyUnicode_LATIN1_INIT("\xcc"), \ + _PyUnicode_LATIN1_INIT("\xcd"), \ + _PyUnicode_LATIN1_INIT("\xce"), \ + _PyUnicode_LATIN1_INIT("\xcf"), \ + _PyUnicode_LATIN1_INIT("\xd0"), \ + _PyUnicode_LATIN1_INIT("\xd1"), \ + _PyUnicode_LATIN1_INIT("\xd2"), \ + _PyUnicode_LATIN1_INIT("\xd3"), \ + _PyUnicode_LATIN1_INIT("\xd4"), \ + _PyUnicode_LATIN1_INIT("\xd5"), \ + _PyUnicode_LATIN1_INIT("\xd6"), \ + _PyUnicode_LATIN1_INIT("\xd7"), \ + _PyUnicode_LATIN1_INIT("\xd8"), \ + _PyUnicode_LATIN1_INIT("\xd9"), \ + _PyUnicode_LATIN1_INIT("\xda"), \ + _PyUnicode_LATIN1_INIT("\xdb"), \ + _PyUnicode_LATIN1_INIT("\xdc"), \ + _PyUnicode_LATIN1_INIT("\xdd"), \ + _PyUnicode_LATIN1_INIT("\xde"), \ + _PyUnicode_LATIN1_INIT("\xdf"), \ + _PyUnicode_LATIN1_INIT("\xe0"), \ + _PyUnicode_LATIN1_INIT("\xe1"), \ + _PyUnicode_LATIN1_INIT("\xe2"), \ + _PyUnicode_LATIN1_INIT("\xe3"), \ + _PyUnicode_LATIN1_INIT("\xe4"), \ + _PyUnicode_LATIN1_INIT("\xe5"), \ + _PyUnicode_LATIN1_INIT("\xe6"), \ + _PyUnicode_LATIN1_INIT("\xe7"), \ + _PyUnicode_LATIN1_INIT("\xe8"), \ + _PyUnicode_LATIN1_INIT("\xe9"), \ + _PyUnicode_LATIN1_INIT("\xea"), \ + _PyUnicode_LATIN1_INIT("\xeb"), \ + _PyUnicode_LATIN1_INIT("\xec"), \ + _PyUnicode_LATIN1_INIT("\xed"), \ + _PyUnicode_LATIN1_INIT("\xee"), \ + _PyUnicode_LATIN1_INIT("\xef"), \ + _PyUnicode_LATIN1_INIT("\xf0"), \ + _PyUnicode_LATIN1_INIT("\xf1"), \ + _PyUnicode_LATIN1_INIT("\xf2"), \ + _PyUnicode_LATIN1_INIT("\xf3"), \ + _PyUnicode_LATIN1_INIT("\xf4"), \ + _PyUnicode_LATIN1_INIT("\xf5"), \ + _PyUnicode_LATIN1_INIT("\xf6"), \ + _PyUnicode_LATIN1_INIT("\xf7"), \ + _PyUnicode_LATIN1_INIT("\xf8"), \ + _PyUnicode_LATIN1_INIT("\xf9"), \ + _PyUnicode_LATIN1_INIT("\xfa"), \ + _PyUnicode_LATIN1_INIT("\xfb"), \ + _PyUnicode_LATIN1_INIT("\xfc"), \ + _PyUnicode_LATIN1_INIT("\xfd"), \ + _PyUnicode_LATIN1_INIT("\xfe"), \ + _PyUnicode_LATIN1_INIT("\xff"), \ + }, \ }, \ \ .tuple_empty = { \ diff --git a/Include/internal/pycore_unicodeobject.h b/Include/internal/pycore_unicodeobject.h index 977bbeb1917..4394ce939b5 100644 --- a/Include/internal/pycore_unicodeobject.h +++ b/Include/internal/pycore_unicodeobject.h @@ -44,9 +44,6 @@ struct _Py_unicode_ids { }; struct _Py_unicode_state { - /* Single character Unicode strings in the Latin-1 range are being - shared as well. */ - PyObject *latin1[256]; struct _Py_unicode_fs_codec fs_codec; // Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId() diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-03-03-09-08-17.bpo-46881.ckD4tT.rst b/Misc/NEWS.d/next/Core and Builtins/2022-03-03-09-08-17.bpo-46881.ckD4tT.rst new file mode 100644 index 00000000000..88434dd1dba --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-03-03-09-08-17.bpo-46881.ckD4tT.rst @@ -0,0 +1 @@ +Statically allocate and initialize the latin1 characters. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 908ad514925..9052c53f11b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -206,6 +206,11 @@ extern "C" { *_to++ = (to_type) *_iter++; \ } while (0) +#define LATIN1(ch) \ + (ch < 128 \ + ? (PyObject*)&_Py_SINGLETON(strings).ascii[ch] \ + : (PyObject*)&_Py_SINGLETON(strings).latin1[ch - 128]) + #ifdef MS_WINDOWS /* On Windows, overallocate by 50% is the best factor */ # define OVERALLOCATE_FACTOR 2 @@ -249,14 +254,6 @@ static int unicode_is_singleton(PyObject *unicode); #endif -static struct _Py_unicode_state* -get_unicode_state(void) -{ - PyInterpreterState *interp = _PyInterpreterState_GET(); - return &interp->unicode; -} - - // Return a borrowed reference to the empty string singleton. static inline PyObject* unicode_get_empty(void) { @@ -680,24 +677,10 @@ unicode_result_ready(PyObject *unicode) if (kind == PyUnicode_1BYTE_KIND) { const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode); Py_UCS1 ch = data[0]; - struct _Py_unicode_state *state = get_unicode_state(); - PyObject *latin1_char = state->latin1[ch]; - if (latin1_char != NULL) { - if (unicode != latin1_char) { - Py_INCREF(latin1_char); - Py_DECREF(unicode); - } - return latin1_char; + if (unicode != LATIN1(ch)) { + Py_DECREF(unicode); } - else { - assert(_PyUnicode_CheckConsistency(unicode, 1)); - Py_INCREF(unicode); - state->latin1[ch] = unicode; - return unicode; - } - } - else { - assert(PyUnicode_READ_CHAR(unicode, 0) >= 256); + return get_latin1_char(ch); } } @@ -1990,11 +1973,10 @@ unicode_is_singleton(PyObject *unicode) return 1; } - struct _Py_unicode_state *state = get_unicode_state(); PyASCIIObject *ascii = (PyASCIIObject *)unicode; if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1) { Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0); - if (ch < 256 && state->latin1[ch] == unicode) { + if (ch < 256 && LATIN1(ch) == unicode) { return 1; } } @@ -2137,25 +2119,7 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index, static PyObject* get_latin1_char(Py_UCS1 ch) { - struct _Py_unicode_state *state = get_unicode_state(); - - PyObject *unicode = state->latin1[ch]; - if (unicode) { - Py_INCREF(unicode); - return unicode; - } - - unicode = PyUnicode_New(1, ch); - if (!unicode) { - return NULL; - } - - PyUnicode_1BYTE_DATA(unicode)[0] = ch; - assert(_PyUnicode_CheckConsistency(unicode, 1)); - - Py_INCREF(unicode); - state->latin1[ch] = unicode; - return unicode; + return Py_NewRef(LATIN1(ch)); } static PyObject* @@ -15535,6 +15499,10 @@ _PyUnicode_InitGlobalObjects(PyInterpreterState *interp) #ifdef Py_DEBUG assert(_PyUnicode_CheckConsistency(&_Py_STR(empty), 1)); + + for (int i = 0; i < 256; i++) { + assert(_PyUnicode_CheckConsistency(LATIN1(i), 1)); + } #endif return _PyStatus_OK(); @@ -16113,10 +16081,6 @@ _PyUnicode_Fini(PyInterpreterState *interp) _PyUnicode_FiniEncodings(&state->fs_codec); unicode_clear_identifiers(state); - - for (Py_ssize_t i = 0; i < 256; i++) { - Py_CLEAR(state->latin1[i]); - } } diff --git a/Tools/scripts/generate_global_objects.py b/Tools/scripts/generate_global_objects.py index 867358cda89..17ddb8b324a 100644 --- a/Tools/scripts/generate_global_objects.py +++ b/Tools/scripts/generate_global_objects.py @@ -196,6 +196,13 @@ def generate_global_strings(identifiers, strings): for name in sorted(identifiers): assert name.isidentifier(), name printer.write(f'STRUCT_FOR_ID({name})') + with printer.block('struct', ' ascii[128];'): + printer.write("PyASCIIObject _ascii;") + printer.write("uint8_t _data[2];") + with printer.block('struct', ' latin1[128];'): + printer.write("PyCompactUnicodeObject _latin1;") + printer.write("uint8_t _data[2];") + printer.write(END) printer.write(after) @@ -252,6 +259,12 @@ def generate_runtime_init(identifiers, strings): for name in sorted(identifiers): assert name.isidentifier(), name printer.write(f'INIT_ID({name}),') + with printer.block('.ascii =', ','): + for i in range(128): + printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),') + with printer.block('.latin1 =', ','): + for i in range(128, 256): + printer.write(f'_PyUnicode_LATIN1_INIT("\\x{i:02x}"),') printer.write('') with printer.block('.tuple_empty =', ','): printer.write('.ob_base = _PyVarObject_IMMORTAL_INIT(&PyTuple_Type, 0)')