bpo-36142: Add _PyPreConfig.utf8_mode (GH-12174)

* Move following fields from _PyCoreConfig to _PyPreConfig:

  * coerce_c_locale
  * coerce_c_locale_warn
  * legacy_windows_fs_encoding
  * utf8_mode

* _PyPreConfig_ReadFromArgv() is now responsible for choosing the
  filesystem encoding
* _PyPreConfig_Write() now sets the LC_CTYPE locale
This commit is contained in:
Victor Stinner 2019-03-05 12:32:09 +01:00 committed by GitHub
parent 5b10b98247
commit 5a02e0d1c8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 466 additions and 367 deletions

View File

@ -60,12 +60,42 @@ typedef struct {
Set to 0 by -E command line option. If set to -1 (default), it is
set to !Py_IgnoreEnvironmentFlag. */
int use_environment;
int coerce_c_locale; /* PYTHONCOERCECLOCALE, -1 means unknown */
int coerce_c_locale_warn; /* PYTHONCOERCECLOCALE=warn */
#ifdef MS_WINDOWS
/* If greater than zero, use the "mbcs" encoding instead of the UTF-8
encoding for the filesystem encoding.
Set to 1 if the PYTHONLEGACYWINDOWSFSENCODING environment variable is
set to a non-empty string. If set to -1 (default), inherit
Py_LegacyWindowsFSEncodingFlag value.
See PEP 529 for more details. */
int legacy_windows_fs_encoding;
#endif
/* Enable UTF-8 mode?
Set by -X utf8 command line option and PYTHONUTF8 environment variable.
If set to -1 (default), inherit Py_UTF8Mode value. */
int utf8_mode;
} _PyPreConfig;
#ifdef MS_WINDOWS
# define _PyPreConfig_WINDOWS_INIT \
.legacy_windows_fs_encoding = -1,
#else
# define _PyPreConfig_WINDOWS_INIT
#endif
#define _PyPreConfig_INIT \
(_PyPreConfig){ \
_PyPreConfig_WINDOWS_INIT \
.isolated = -1, \
.use_environment = -1}
.use_environment = -1, \
.coerce_c_locale = -1, \
.utf8_mode = -1}
/* --- _PyCoreConfig ---------------------------------------------- */
@ -95,8 +125,6 @@ typedef struct {
int show_alloc_count; /* -X showalloccount */
int dump_refs; /* PYTHONDUMPREFS */
int malloc_stats; /* PYTHONMALLOCSTATS */
int coerce_c_locale; /* PYTHONCOERCECLOCALE, -1 means unknown */
int coerce_c_locale_warn; /* PYTHONCOERCECLOCALE=warn */
/* Python filesystem encoding and error handler:
sys.getfilesystemencoding() and sys.getfilesystemencodeerrors().
@ -134,11 +162,6 @@ typedef struct {
char *filesystem_encoding;
char *filesystem_errors;
/* Enable UTF-8 mode?
Set by -X utf8 command line option and PYTHONUTF8 environment variable.
If set to -1 (default), inherit Py_UTF8Mode value. */
int utf8_mode;
wchar_t *pycache_prefix; /* PYTHONPYCACHEPREFIX, -X pycache_prefix=PATH */
wchar_t *program_name; /* Program name, see also Py_GetProgramName() */
@ -277,16 +300,6 @@ typedef struct {
char *stdio_errors;
#ifdef MS_WINDOWS
/* If greater than zero, use the "mbcs" encoding instead of the UTF-8
encoding for the filesystem encoding.
Set to 1 if the PYTHONLEGACYWINDOWSFSENCODING environment variable is
set to a non-empty string. If set to -1 (default), inherit
Py_LegacyWindowsFSEncodingFlag value.
See PEP 529 for more details. */
int legacy_windows_fs_encoding;
/* If greater than zero, use io.FileIO instead of WindowsConsoleIO for sys
standard streams.
@ -340,7 +353,6 @@ typedef struct {
#ifdef MS_WINDOWS
# define _PyCoreConfig_WINDOWS_INIT \
.legacy_windows_fs_encoding = -1, \
.legacy_windows_stdio = -1,
#else
# define _PyCoreConfig_WINDOWS_INIT
@ -348,13 +360,12 @@ typedef struct {
#define _PyCoreConfig_INIT \
(_PyCoreConfig){ \
_PyCoreConfig_WINDOWS_INIT \
.preconfig = _PyPreConfig_INIT, \
.install_signal_handlers = 1, \
.use_hash_seed = -1, \
.faulthandler = -1, \
.tracemalloc = -1, \
.coerce_c_locale = -1, \
.utf8_mode = -1, \
.argc = -1, \
.nmodule_search_path = -1, \
.site_import = -1, \
@ -368,7 +379,6 @@ typedef struct {
.quiet = -1, \
.user_site_directory = -1, \
.buffered_stdio = -1, \
_PyCoreConfig_WINDOWS_INIT \
._install_importlib = 1, \
._check_hash_pycs_mode = "default", \
._frozen = -1}

View File

@ -36,11 +36,24 @@ PyAPI_FUNC(int) _Py_SetArgcArgv(int argc, wchar_t * const *argv);
/* --- _PyPreConfig ----------------------------------------------- */
/* Parse str as a base-10 integer. Return 0 and store the value in *result
   on success, return -1 if str is not a valid int. */
PyAPI_FUNC(int) _Py_str_to_int(
const char *str,
int *result);
/* Return the first of the nxoption entries of xoptions whose name (the part
   before an optional '=') is equal to name, or NULL if there is none. */
PyAPI_FUNC(const wchar_t*) _Py_get_xoption(
int nxoption,
wchar_t * const *xoptions,
const wchar_t *name);
PyAPI_FUNC(void) _PyPreConfig_Clear(_PyPreConfig *config);
PyAPI_FUNC(int) _PyPreConfig_Copy(_PyPreConfig *config,
const _PyPreConfig *config2);
PyAPI_FUNC(void) _PyPreConfig_GetGlobalConfig(_PyPreConfig *config);
PyAPI_FUNC(void) _PyPreConfig_SetGlobalConfig(const _PyPreConfig *config);
/* Return the value of the environment variable name, or NULL if
   config->use_environment is zero or if the variable is unset or empty. */
PyAPI_FUNC(const char*) _PyPreConfig_GetEnv(const _PyPreConfig *config,
const char *name);
/* Raise *flag to the integer value of the environment variable name; a value
   which is not a non-negative integer counts as 1. *flag is never lowered. */
PyAPI_FUNC(void) _Py_get_env_flag(_PyPreConfig *config,
int *flag,
const char *name);
/* Read the preconfiguration without any command line. */
PyAPI_FUNC(_PyInitError) _PyPreConfig_Read(_PyPreConfig *config);
PyAPI_FUNC(int) _PyPreConfig_AsDict(const _PyPreConfig *config,
PyObject *dict);

View File

@ -461,7 +461,7 @@ static int test_init_from_config(void)
putenv("PYTHONUTF8=0");
Py_UTF8Mode = 0;
config.utf8_mode = 1;
config.preconfig.utf8_mode = 1;
putenv("PYTHONPYCACHEPREFIX=env_pycache_prefix");
config.pycache_prefix = L"conf_pycache_prefix";
@ -610,8 +610,8 @@ static int test_init_isolated(void)
config.preconfig.isolated = 1;
/* Set coerce_c_locale and utf8_mode to not depend on the locale */
config.coerce_c_locale = 0;
config.utf8_mode = 0;
config.preconfig.coerce_c_locale = 0;
config.preconfig.utf8_mode = 0;
/* Use path starting with "./" avoids a search along the PATH */
config.program_name = L"./_testembed";

View File

@ -531,10 +531,6 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2)
COPY_ATTR(dump_refs);
COPY_ATTR(malloc_stats);
COPY_ATTR(coerce_c_locale);
COPY_ATTR(coerce_c_locale_warn);
COPY_ATTR(utf8_mode);
COPY_WSTR_ATTR(pycache_prefix);
COPY_WSTR_ATTR(module_search_path_env);
COPY_WSTR_ATTR(home);
@ -571,7 +567,6 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2)
COPY_STR_ATTR(stdio_encoding);
COPY_STR_ATTR(stdio_errors);
#ifdef MS_WINDOWS
COPY_ATTR(legacy_windows_fs_encoding);
COPY_ATTR(legacy_windows_stdio);
#endif
COPY_ATTR(skip_source_first_line);
@ -592,19 +587,7 @@ _PyCoreConfig_Copy(_PyCoreConfig *config, const _PyCoreConfig *config2)
const char*
_PyCoreConfig_GetEnv(const _PyCoreConfig *config, const char *name)
{
assert(config->preconfig.use_environment >= 0);
if (!config->preconfig.use_environment) {
return NULL;
}
const char *var = getenv(name);
if (var && var[0] != '\0') {
return var;
}
else {
return NULL;
}
return _PyPreConfig_GetEnv(&config->preconfig, name);
}
@ -670,7 +653,6 @@ _PyCoreConfig_GetGlobalConfig(_PyCoreConfig *config)
config->ATTR = !(VALUE); \
}
COPY_FLAG(utf8_mode, Py_UTF8Mode);
COPY_FLAG(bytes_warning, Py_BytesWarningFlag);
COPY_FLAG(inspect, Py_InspectFlag);
COPY_FLAG(interactive, Py_InteractiveFlag);
@ -679,7 +661,6 @@ _PyCoreConfig_GetGlobalConfig(_PyCoreConfig *config)
COPY_FLAG(verbose, Py_VerboseFlag);
COPY_FLAG(quiet, Py_QuietFlag);
#ifdef MS_WINDOWS
COPY_FLAG(legacy_windows_fs_encoding, Py_LegacyWindowsFSEncodingFlag);
COPY_FLAG(legacy_windows_stdio, Py_LegacyWindowsStdioFlag);
#endif
COPY_FLAG(_frozen, Py_FrozenFlag);
@ -709,7 +690,6 @@ _PyCoreConfig_SetGlobalConfig(const _PyCoreConfig *config)
VAR = !config->ATTR; \
}
COPY_FLAG(utf8_mode, Py_UTF8Mode);
COPY_FLAG(bytes_warning, Py_BytesWarningFlag);
COPY_FLAG(inspect, Py_InspectFlag);
COPY_FLAG(interactive, Py_InteractiveFlag);
@ -718,7 +698,6 @@ _PyCoreConfig_SetGlobalConfig(const _PyCoreConfig *config)
COPY_FLAG(verbose, Py_VerboseFlag);
COPY_FLAG(quiet, Py_QuietFlag);
#ifdef MS_WINDOWS
COPY_FLAG(legacy_windows_fs_encoding, Py_LegacyWindowsFSEncodingFlag);
COPY_FLAG(legacy_windows_stdio, Py_LegacyWindowsStdioFlag);
#endif
COPY_FLAG(_frozen, Py_FrozenFlag);
@ -838,23 +817,7 @@ config_init_executable(_PyCoreConfig *config)
static const wchar_t*
config_get_xoption(const _PyCoreConfig *config, wchar_t *name)
{
int nxoption = config->nxoption;
wchar_t **xoptions = config->xoptions;
for (int i=0; i < nxoption; i++) {
wchar_t *option = xoptions[i];
size_t len;
wchar_t *sep = wcschr(option, L'=');
if (sep != NULL) {
len = (sep - option);
}
else {
len = wcslen(option);
}
if (wcsncmp(option, name, len) == 0 && name[len] == L'\0') {
return option;
}
}
return NULL;
return _Py_get_xoption(config->nxoption, config->xoptions, name);
}
@ -915,67 +878,6 @@ config_init_hash_seed(_PyCoreConfig *config)
}
static _PyInitError
config_init_utf8_mode(_PyCoreConfig *config)
{
const wchar_t *xopt = config_get_xoption(config, L"utf8");
if (xopt) {
wchar_t *sep = wcschr(xopt, L'=');
if (sep) {
xopt = sep + 1;
if (wcscmp(xopt, L"1") == 0) {
config->utf8_mode = 1;
}
else if (wcscmp(xopt, L"0") == 0) {
config->utf8_mode = 0;
}
else {
return _Py_INIT_USER_ERR("invalid -X utf8 option value");
}
}
else {
config->utf8_mode = 1;
}
return _Py_INIT_OK();
}
const char *opt = _PyCoreConfig_GetEnv(config, "PYTHONUTF8");
if (opt) {
if (strcmp(opt, "1") == 0) {
config->utf8_mode = 1;
}
else if (strcmp(opt, "0") == 0) {
config->utf8_mode = 0;
}
else {
return _Py_INIT_USER_ERR("invalid PYTHONUTF8 environment "
"variable value");
}
return _Py_INIT_OK();
}
return _Py_INIT_OK();
}
static int
config_str_to_int(const char *str, int *result)
{
const char *endptr = str;
errno = 0;
long value = strtol(str, (char **)&endptr, 10);
if (*endptr != '\0' || errno == ERANGE) {
return -1;
}
if (value < INT_MIN || value > INT_MAX) {
return -1;
}
*result = (int)value;
return 0;
}
static int
config_wstr_to_int(const wchar_t *wstr, int *result)
{
@ -994,27 +896,12 @@ config_wstr_to_int(const wchar_t *wstr, int *result)
}
static void
get_env_flag(_PyCoreConfig *config, int *flag, const char *name)
{
const char *var = _PyCoreConfig_GetEnv(config, name);
if (!var) {
return;
}
int value;
if (config_str_to_int(var, &value) < 0 || value < 0) {
/* PYTHONDEBUG=text and PYTHONDEBUG=-2 behave as PYTHONDEBUG=1 */
value = 1;
}
if (*flag < value) {
*flag = value;
}
}
static _PyInitError
config_read_env_vars(_PyCoreConfig *config)
{
#define get_env_flag(CONFIG, ATTR, NAME) \
_Py_get_env_flag(&(CONFIG)->preconfig, (ATTR), (NAME))
/* Get environment variables */
get_env_flag(config, &config->parser_debug, "PYTHONDEBUG");
get_env_flag(config, &config->verbose, "PYTHONVERBOSE");
@ -1040,8 +927,6 @@ config_read_env_vars(_PyCoreConfig *config)
}
#ifdef MS_WINDOWS
get_env_flag(config, &config->legacy_windows_fs_encoding,
"PYTHONLEGACYWINDOWSFSENCODING");
get_env_flag(config, &config->legacy_windows_stdio,
"PYTHONLEGACYWINDOWSSTDIO");
#endif
@ -1057,23 +942,6 @@ config_read_env_vars(_PyCoreConfig *config)
config->malloc_stats = 1;
}
const char *env = _PyCoreConfig_GetEnv(config, "PYTHONCOERCECLOCALE");
if (env) {
if (strcmp(env, "0") == 0) {
if (config->coerce_c_locale < 0) {
config->coerce_c_locale = 0;
}
}
else if (strcmp(env, "warn") == 0) {
config->coerce_c_locale_warn = 1;
}
else {
if (config->coerce_c_locale < 0) {
config->coerce_c_locale = 1;
}
}
}
wchar_t *path;
int res = _PyCoreConfig_GetEnvDup(config, &path,
L"PYTHONPATH", "PYTHONPATH");
@ -1090,6 +958,8 @@ config_read_env_vars(_PyCoreConfig *config)
}
return _Py_INIT_OK();
#undef get_env_flag
}
@ -1101,7 +971,7 @@ config_init_tracemalloc(_PyCoreConfig *config)
const char *env = _PyCoreConfig_GetEnv(config, "PYTHONTRACEMALLOC");
if (env) {
if (!config_str_to_int(env, &nframe)) {
if (!_Py_str_to_int(env, &nframe)) {
valid = (nframe >= 0);
}
else {
@ -1213,37 +1083,6 @@ config_read_complex_options(_PyCoreConfig *config)
}
static void
config_init_locale(_PyCoreConfig *config)
{
/* Test also if coerce_c_locale equals 1: PYTHONCOERCECLOCALE=1 doesn't
imply that the C locale is always coerced. It is only coerced if
the LC_CTYPE locale is "C". */
if (config->coerce_c_locale != 0) {
/* The C locale enables the C locale coercion (PEP 538) */
if (_Py_LegacyLocaleDetected()) {
config->coerce_c_locale = 1;
}
else {
config->coerce_c_locale = 0;
}
}
#ifndef MS_WINDOWS
if (config->utf8_mode < 0) {
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
if (ctype_loc != NULL
&& (strcmp(ctype_loc, "C") == 0
|| strcmp(ctype_loc, "POSIX") == 0))
{
config->utf8_mode = 1;
}
}
#endif
}
static const char *
get_stdio_errors(const _PyCoreConfig *config)
{
@ -1365,7 +1204,7 @@ config_init_stdio_encoding(_PyCoreConfig *config)
}
/* UTF-8 Mode uses UTF-8/surrogateescape */
if (config->utf8_mode) {
if (config->preconfig.utf8_mode) {
if (config->stdio_encoding == NULL) {
config->stdio_encoding = _PyMem_RawStrdup("utf-8");
if (config->stdio_encoding == NULL) {
@ -1403,7 +1242,7 @@ static _PyInitError
config_init_fs_encoding(_PyCoreConfig *config)
{
#ifdef MS_WINDOWS
if (config->legacy_windows_fs_encoding) {
if (config->preconfig.legacy_windows_fs_encoding) {
/* Legacy Windows filesystem encoding: mbcs/replace */
if (config->filesystem_encoding == NULL) {
config->filesystem_encoding = _PyMem_RawStrdup("mbcs");
@ -1438,7 +1277,7 @@ config_init_fs_encoding(_PyCoreConfig *config)
}
#else
if (config->filesystem_encoding == NULL) {
if (config->utf8_mode) {
if (config->preconfig.utf8_mode) {
/* UTF-8 Mode use: utf-8/surrogateescape */
config->filesystem_encoding = _PyMem_RawStrdup("utf-8");
/* errors defaults to surrogateescape above */
@ -1539,12 +1378,6 @@ _PyCoreConfig_Read(_PyCoreConfig *config, const _PyPreConfig *preconfig)
config->user_site_directory = 0;
}
#ifdef MS_WINDOWS
if (config->legacy_windows_fs_encoding) {
config->utf8_mode = 0;
}
#endif
if (config->preconfig.use_environment) {
err = config_read_env_vars(config);
if (_Py_INIT_FAILED(err)) {
@ -1565,13 +1398,6 @@ _PyCoreConfig_Read(_PyCoreConfig *config, const _PyPreConfig *preconfig)
return err;
}
if (config->utf8_mode < 0) {
err = config_init_utf8_mode(config);
if (_Py_INIT_FAILED(err)) {
return err;
}
}
if (config->home == NULL) {
err = config_init_home(config);
if (_Py_INIT_FAILED(err)) {
@ -1593,10 +1419,6 @@ _PyCoreConfig_Read(_PyCoreConfig *config, const _PyPreConfig *preconfig)
}
}
if (config->coerce_c_locale != 0 || config->utf8_mode < 0) {
config_init_locale(config);
}
if (config->_install_importlib) {
err = _PyCoreConfig_InitPathConfig(config);
if (_Py_INIT_FAILED(err)) {
@ -1623,12 +1445,6 @@ _PyCoreConfig_Read(_PyCoreConfig *config, const _PyPreConfig *preconfig)
if (config->tracemalloc < 0) {
config->tracemalloc = 0;
}
if (config->coerce_c_locale < 0) {
config->coerce_c_locale = 0;
}
if (config->utf8_mode < 0) {
config->utf8_mode = 0;
}
if (config->argc < 0) {
config->argc = 0;
}
@ -1645,7 +1461,6 @@ _PyCoreConfig_Read(_PyCoreConfig *config, const _PyPreConfig *preconfig)
return err;
}
assert(config->coerce_c_locale >= 0);
assert(config->preconfig.use_environment >= 0);
assert(config->filesystem_encoding != NULL);
assert(config->filesystem_errors != NULL);
@ -1703,9 +1518,6 @@ config_init_stdio(const _PyCoreConfig *config)
void
_PyCoreConfig_Write(const _PyCoreConfig *config)
{
if (config->coerce_c_locale) {
_Py_CoerceLegacyLocale(config->coerce_c_locale_warn);
}
_PyCoreConfig_SetGlobalConfig(config);
config_init_stdio(config);
}
@ -1769,11 +1581,8 @@ _PyCoreConfig_AsDict(const _PyCoreConfig *config)
SET_ITEM_INT(show_alloc_count);
SET_ITEM_INT(dump_refs);
SET_ITEM_INT(malloc_stats);
SET_ITEM_INT(coerce_c_locale);
SET_ITEM_INT(coerce_c_locale_warn);
SET_ITEM_STR(filesystem_encoding);
SET_ITEM_STR(filesystem_errors);
SET_ITEM_INT(utf8_mode);
SET_ITEM_WSTR(pycache_prefix);
SET_ITEM_WSTR(program_name);
SET_ITEM_WSTRLIST(argc, argv);
@ -1805,7 +1614,6 @@ _PyCoreConfig_AsDict(const _PyCoreConfig *config)
SET_ITEM_STR(stdio_encoding);
SET_ITEM_STR(stdio_errors);
#ifdef MS_WINDOWS
SET_ITEM_INT(legacy_windows_fs_encoding);
SET_ITEM_INT(legacy_windows_stdio);
#endif
SET_ITEM_INT(skip_source_first_line);
@ -2318,8 +2126,16 @@ config_from_cmdline(_PyCoreConfig *config, _PyCmdline *cmdline,
}
static _PyInitError
config_read_from_argv_impl(_PyCoreConfig *config, const _PyArgv *args,
/* Read the configuration into _PyCoreConfig and initialize the LC_CTYPE
locale: enable UTF-8 mode (PEP 540) and/or coerce the C locale (PEP 538).
Read the configuration from:
* Command line arguments
* Environment variables
* Py_xxx global configuration variables */
_PyInitError
_PyCoreConfig_ReadFromArgv(_PyCoreConfig *config, const _PyArgv *args,
const _PyPreConfig *preconfig)
{
_PyInitError err;
@ -2343,133 +2159,3 @@ config_read_from_argv_impl(_PyCoreConfig *config, const _PyArgv *args,
cmdline_clear(&cmdline);
return err;
}
/* Read the configuration into _PyCoreConfig and initialize the LC_CTYPE
locale: enable UTF-8 mode (PEP 540) and/or coerce the C locale (PEP 538).
Read the configuration from:
* Command line arguments
* Environment variables
* Py_xxx global configuration variables */
_PyInitError
_PyCoreConfig_ReadFromArgv(_PyCoreConfig *config, const _PyArgv *args,
const _PyPreConfig *preconfig)
{
_PyInitError err;
int init_utf8_mode = Py_UTF8Mode;
#ifdef MS_WINDOWS
int init_legacy_encoding = Py_LegacyWindowsFSEncodingFlag;
#endif
_PyCoreConfig save_config = _PyCoreConfig_INIT;
int locale_coerced = 0;
int loops = 0;
char *init_ctype_locale = NULL;
/* copy LC_CTYPE locale */
const char *loc = setlocale(LC_CTYPE, NULL);
if (loc == NULL) {
err = _Py_INIT_ERR("failed to LC_CTYPE locale");
goto done;
}
init_ctype_locale = _PyMem_RawStrdup(loc);
if (init_ctype_locale == NULL) {
err = _Py_INIT_NO_MEMORY();
goto done;
}
if (_PyCoreConfig_Copy(&save_config, config) < 0) {
err = _Py_INIT_NO_MEMORY();
goto done;
}
/* Set LC_CTYPE to the user preferred locale */
_Py_SetLocaleFromEnv(LC_CTYPE);
while (1) {
int utf8_mode = config->utf8_mode;
int encoding_changed = 0;
/* Watchdog to prevent an infinite loop */
loops++;
if (loops == 3) {
err = _Py_INIT_ERR("Encoding changed twice while "
"reading the configuration");
goto done;
}
/* bpo-34207: Py_DecodeLocale() and Py_EncodeLocale() depend
on Py_UTF8Mode and Py_LegacyWindowsFSEncodingFlag. */
Py_UTF8Mode = config->utf8_mode;
#ifdef MS_WINDOWS
Py_LegacyWindowsFSEncodingFlag = config->legacy_windows_fs_encoding;
#endif
err = config_read_from_argv_impl(config, args, preconfig);
if (_Py_INIT_FAILED(err)) {
goto done;
}
if (locale_coerced) {
config->coerce_c_locale = 1;
}
/* The legacy C locale assumes ASCII as the default text encoding, which
* causes problems not only for the CPython runtime, but also other
* components like GNU readline.
*
* Accordingly, when the CLI detects it, it attempts to coerce it to a
* more capable UTF-8 based alternative.
*
* See the documentation of the PYTHONCOERCECLOCALE setting for more
* details.
*/
if (config->coerce_c_locale && !locale_coerced) {
locale_coerced = 1;
_Py_CoerceLegacyLocale(0);
encoding_changed = 1;
}
if (utf8_mode == -1) {
if (config->utf8_mode == 1) {
/* UTF-8 Mode enabled */
encoding_changed = 1;
}
}
else {
if (config->utf8_mode != utf8_mode) {
encoding_changed = 1;
}
}
if (!encoding_changed) {
break;
}
/* Reset the configuration before reading again the configuration,
just keep UTF-8 Mode value. */
int new_utf8_mode = config->utf8_mode;
int new_coerce_c_locale = config->coerce_c_locale;
if (_PyCoreConfig_Copy(config, &save_config) < 0) {
err = _Py_INIT_NO_MEMORY();
goto done;
}
config->utf8_mode = new_utf8_mode;
config->coerce_c_locale = new_coerce_c_locale;
/* The encoding changed: read again the configuration
with the new encoding */
}
err = _Py_INIT_OK();
done:
if (init_ctype_locale != NULL) {
setlocale(LC_CTYPE, init_ctype_locale);
}
_PyCoreConfig_Clear(&save_config);
Py_UTF8Mode = init_utf8_mode ;
#ifdef MS_WINDOWS
Py_LegacyWindowsFSEncodingFlag = init_legacy_encoding;
#endif
return err;
}

View File

@ -1,6 +1,8 @@
#include "Python.h"
#include "pycore_coreconfig.h"
#include "pycore_getopt.h"
#include "pycore_pystate.h" /* _PyRuntime_Initialize() */
#include <locale.h> /* setlocale() */
#define DECODE_LOCALE_ERR(NAME, LEN) \
@ -99,6 +101,8 @@ typedef struct {
const _PyArgv *args;
int argc;
wchar_t **argv;
int nxoption; /* Number of -X options */
wchar_t **xoptions; /* -X options */
} _PyPreCmdline;
@ -109,6 +113,10 @@ precmdline_clear(_PyPreCmdline *cmdline)
_Py_wstrlist_clear(cmdline->args->argc, cmdline->argv);
}
cmdline->argv = NULL;
_Py_wstrlist_clear(cmdline->nxoption, cmdline->xoptions);
cmdline->nxoption = 0;
cmdline->xoptions = NULL;
}
@ -129,6 +137,12 @@ _PyPreConfig_Copy(_PyPreConfig *config, const _PyPreConfig *config2)
COPY_ATTR(isolated);
COPY_ATTR(use_environment);
COPY_ATTR(coerce_c_locale);
COPY_ATTR(coerce_c_locale_warn);
#ifdef MS_WINDOWS
COPY_ATTR(legacy_windows_fs_encoding);
#endif
COPY_ATTR(utf8_mode);
#undef COPY_ATTR
return 0;
@ -149,6 +163,10 @@ _PyPreConfig_GetGlobalConfig(_PyPreConfig *config)
COPY_FLAG(isolated, Py_IsolatedFlag);
COPY_NOT_FLAG(use_environment, Py_IgnoreEnvironmentFlag);
#ifdef MS_WINDOWS
COPY_FLAG(legacy_windows_fs_encoding, Py_LegacyWindowsFSEncodingFlag);
#endif
COPY_FLAG(utf8_mode, Py_UTF8Mode);
#undef COPY_FLAG
#undef COPY_NOT_FLAG
@ -169,14 +187,161 @@ _PyPreConfig_SetGlobalConfig(const _PyPreConfig *config)
COPY_FLAG(isolated, Py_IsolatedFlag);
COPY_NOT_FLAG(use_environment, Py_IgnoreEnvironmentFlag);
#ifdef MS_WINDOWS
COPY_FLAG(legacy_windows_fs_encoding, Py_LegacyWindowsFSEncodingFlag);
#endif
COPY_FLAG(utf8_mode, Py_UTF8Mode);
#undef COPY_FLAG
#undef COPY_NOT_FLAG
}
_PyInitError
_PyPreConfig_Read(_PyPreConfig *config)
/* Look up the environment variable `name`.

   Return NULL when config->use_environment is zero, when the variable is not
   set, or when it is set to an empty string. Otherwise return the pointer
   obtained from getenv().

   config->use_environment must already be resolved (>= 0). */
const char*
_PyPreConfig_GetEnv(const _PyPreConfig *config, const char *name)
{
    assert(config->use_environment >= 0);

    if (config->use_environment) {
        const char *value = getenv(name);
        /* An empty value is handled like an unset variable */
        if (value != NULL && value[0] != '\0') {
            return value;
        }
    }
    return NULL;
}
/* Parse `str` as a base-10 integer and store it in *result.

   Leading whitespace and an optional sign are accepted, as done by strtol().

   Return 0 on success. Return -1 and leave *result unchanged if `str`
   contains no number at all (for example the empty string), if characters
   follow the number, or if the value does not fit into an int. */
int
_Py_str_to_int(const char *str, int *result)
{
    char *endptr = NULL;

    errno = 0;
    long value = strtol(str, &endptr, 10);
    /* endptr == str means that strtol() did not consume any character:
       without this check, an empty string would be accepted as 0. */
    if (endptr == str || *endptr != '\0' || errno == ERANGE) {
        return -1;
    }
    if (value < INT_MIN || value > INT_MAX) {
        return -1;
    }

    *result = (int)value;
    return 0;
}
/* Raise *flag to the value of the environment variable `name`.

   Nothing is done when _PyPreConfig_GetEnv() returns NULL for `name`.
   A value which is not a valid integer, or which is negative, counts as 1.
   *flag is only ever increased, never lowered. */
void
_Py_get_env_flag(_PyPreConfig *config, int *flag, const char *name)
{
    const char *text = _PyPreConfig_GetEnv(config, name);
    if (text == NULL) {
        return;
    }

    int parsed;
    int level;
    if (_Py_str_to_int(text, &parsed) >= 0 && parsed >= 0) {
        level = parsed;
    }
    else {
        /* PYTHONDEBUG=text and PYTHONDEBUG=-2 behave as PYTHONDEBUG=1 */
        level = 1;
    }

    if (level > *flag) {
        *flag = level;
    }
}
/* Search the -X options for the option called `name`.

   Each entry of `xoptions` is either "name" or "name=value": only the part
   before the first '=' is compared with `name`.

   Return the first matching entry (the whole string, including the optional
   "=value" suffix), or NULL if none of the `nxoption` entries matches. */
const wchar_t*
_Py_get_xoption(int nxoption, wchar_t * const *xoptions, const wchar_t *name)
{
    int index = 0;
    while (index < nxoption) {
        const wchar_t *entry = xoptions[index];
        const wchar_t *equal = wcschr(entry, L'=');
        size_t key_len = (equal != NULL) ? (size_t)(equal - entry)
                                         : wcslen(entry);

        /* wcsncmp() must be evaluated first: it guarantees that `name` has
           at least key_len characters before name[key_len] is read. */
        if (wcsncmp(entry, name, key_len) == 0 && name[key_len] == L'\0') {
            return entry;
        }
        index++;
    }
    return NULL;
}
/* Set config->utf8_mode from the "-X utf8" command line option or, if the
   option is not used, from the PYTHONUTF8 environment variable.

   "-X utf8" and "-X utf8=1" enable the UTF-8 Mode, "-X utf8=0" disables it.
   PYTHONUTF8=1 enables it and PYTHONUTF8=0 disables it. Any other value is
   reported as a user error. The command line is ignored if `cmdline` is
   NULL. config->utf8_mode is left unchanged if neither source is set. */
static _PyInitError
preconfig_init_utf8_mode(_PyPreConfig *config, const _PyPreCmdline *cmdline)
{
    const wchar_t *xoption = NULL;
    if (cmdline != NULL) {
        xoption = _Py_get_xoption(cmdline->nxoption, cmdline->xoptions,
                                  L"utf8");
    }

    if (xoption != NULL) {
        /* The -X option has the priority over the environment variable */
        const wchar_t *equal = wcschr(xoption, L'=');
        if (equal == NULL) {
            /* "-X utf8" without value */
            config->utf8_mode = 1;
            return _Py_INIT_OK();
        }

        const wchar_t *value = equal + 1;
        if (wcscmp(value, L"1") == 0) {
            config->utf8_mode = 1;
        }
        else if (wcscmp(value, L"0") == 0) {
            config->utf8_mode = 0;
        }
        else {
            return _Py_INIT_USER_ERR("invalid -X utf8 option value");
        }
        return _Py_INIT_OK();
    }

    const char *env = _PyPreConfig_GetEnv(config, "PYTHONUTF8");
    if (env == NULL) {
        return _Py_INIT_OK();
    }

    if (strcmp(env, "1") == 0) {
        config->utf8_mode = 1;
    }
    else if (strcmp(env, "0") == 0) {
        config->utf8_mode = 0;
    }
    else {
        return _Py_INIT_USER_ERR("invalid PYTHONUTF8 environment "
                                 "variable value");
    }
    return _Py_INIT_OK();
}
/* Resolve config->coerce_c_locale to 0 or 1.

   A value of 0 is left unchanged. Any other value, including 1, is replaced
   with 1 if _Py_LegacyLocaleDetected() is true, or with 0 otherwise:
   PYTHONCOERCECLOCALE=1 doesn't imply that the C locale is always coerced,
   it is only coerced if the LC_CTYPE locale is "C" (PEP 538). */
static void
preconfig_init_locale(_PyPreConfig *config)
{
    if (config->coerce_c_locale == 0) {
        /* C locale coercion is explicitly disabled */
        return;
    }

    config->coerce_c_locale = _Py_LegacyLocaleDetected() ? 1 : 0;
}
static _PyInitError
preconfig_read(_PyPreConfig *config, const _PyPreCmdline *cmdline)
{
_PyPreConfig_GetGlobalConfig(config);
@ -189,6 +354,69 @@ _PyPreConfig_Read(_PyPreConfig *config)
config->use_environment = 0;
}
if (config->use_environment) {
#ifdef MS_WINDOWS
_Py_get_env_flag(config, &config->legacy_windows_fs_encoding,
"PYTHONLEGACYWINDOWSFSENCODING");
#endif
const char *env = _PyPreConfig_GetEnv(config, "PYTHONCOERCECLOCALE");
if (env) {
if (strcmp(env, "0") == 0) {
if (config->coerce_c_locale < 0) {
config->coerce_c_locale = 0;
}
}
else if (strcmp(env, "warn") == 0) {
config->coerce_c_locale_warn = 1;
}
else {
if (config->coerce_c_locale < 0) {
config->coerce_c_locale = 1;
}
}
}
}
#ifdef MS_WINDOWS
if (config->legacy_windows_fs_encoding) {
config->utf8_mode = 0;
}
#endif
if (config->utf8_mode < 0) {
_PyInitError err = preconfig_init_utf8_mode(config, cmdline);
if (_Py_INIT_FAILED(err)) {
return err;
}
}
if (config->coerce_c_locale != 0) {
preconfig_init_locale(config);
}
#ifndef MS_WINDOWS
if (config->utf8_mode < 0) {
/* The C locale and the POSIX locale enable the UTF-8 Mode (PEP 540) */
const char *ctype_loc = setlocale(LC_CTYPE, NULL);
if (ctype_loc != NULL
&& (strcmp(ctype_loc, "C") == 0
|| strcmp(ctype_loc, "POSIX") == 0))
{
config->utf8_mode = 1;
}
}
#endif
if (config->coerce_c_locale < 0) {
config->coerce_c_locale = 0;
}
if (config->utf8_mode < 0) {
config->utf8_mode = 0;
}
assert(config->coerce_c_locale >= 0);
assert(config->utf8_mode >= 0);
assert(config->isolated >= 0);
assert(config->use_environment >= 0);
@ -196,6 +424,13 @@ _PyPreConfig_Read(_PyPreConfig *config)
}
/* Read the preconfiguration without any command line: preconfig_read() is
   called with a NULL command line, so no -X option is taken into account. */
_PyInitError
_PyPreConfig_Read(_PyPreConfig *config)
{
return preconfig_read(config, NULL);
}
int
_PyPreConfig_AsDict(const _PyPreConfig *config, PyObject *dict)
{
@ -216,6 +451,12 @@ _PyPreConfig_AsDict(const _PyPreConfig *config, PyObject *dict)
SET_ITEM_INT(isolated);
SET_ITEM_INT(use_environment);
SET_ITEM_INT(coerce_c_locale);
SET_ITEM_INT(coerce_c_locale_warn);
SET_ITEM_INT(utf8_mode);
#ifdef MS_WINDOWS
SET_ITEM_INT(legacy_windows_fs_encoding);
#endif
return 0;
fail:
@ -251,6 +492,18 @@ preconfig_parse_cmdline(_PyPreConfig *config, _PyPreCmdline *cmdline)
config->isolated++;
break;
case 'X':
{
_PyInitError err;
err = _Py_wstrlist_append(&cmdline->nxoption,
&cmdline->xoptions,
_PyOS_optarg);
if (_Py_INIT_FAILED(err)) {
return err;
}
break;
}
default:
/* ignore other argument:
handled by _PyCoreConfig_ReadFromArgv() */
@ -262,8 +515,8 @@ preconfig_parse_cmdline(_PyPreConfig *config, _PyPreCmdline *cmdline)
}
_PyInitError
_PyPreConfig_ReadFromArgv(_PyPreConfig *config, const _PyArgv *args)
static _PyInitError
preconfig_from_argv(_PyPreConfig *config, const _PyArgv *args)
{
_PyInitError err;
@ -281,7 +534,7 @@ _PyPreConfig_ReadFromArgv(_PyPreConfig *config, const _PyArgv *args)
goto done;
}
err = _PyPreConfig_Read(config);
err = preconfig_read(config, &cmdline);
if (_Py_INIT_FAILED(err)) {
goto done;
}
@ -293,7 +546,144 @@ _PyPreConfig_ReadFromArgv(_PyPreConfig *config, const _PyArgv *args)
}
/* Read the preconfiguration from the command line arguments `args`.

   The configuration is read in a loop: if reading it enables the C locale
   coercion or changes the UTF-8 Mode, the encoding changed and the
   configuration is reset and read again with the new encoding. An error is
   returned if a third pass would be needed.

   While the configuration is read, the LC_CTYPE locale is set to the user
   preferred locale, and Py_UTF8Mode and Py_LegacyWindowsFSEncodingFlag are
   overridden with the values of `config`. The LC_CTYPE locale and these
   global variables are restored to their initial values before returning,
   on success and on error.

   Return _Py_INIT_OK() on success, or the error which stopped the read. */
_PyInitError
_PyPreConfig_ReadFromArgv(_PyPreConfig *config, const _PyArgv *args)
{
    _PyInitError err;

    err = _PyRuntime_Initialize();
    if (_Py_INIT_FAILED(err)) {
        return err;
    }

    char *init_ctype_locale = NULL;
    int init_utf8_mode = Py_UTF8Mode;
#ifdef MS_WINDOWS
    int init_legacy_encoding = Py_LegacyWindowsFSEncodingFlag;
#endif
    _PyPreConfig save_config = _PyPreConfig_INIT;
    int locale_coerced = 0;
    int loops = 0;

    /* Copy the LC_CTYPE locale: the string returned by setlocale() can be
       overwritten by the next setlocale() call */
    const char *loc = setlocale(LC_CTYPE, NULL);
    if (loc == NULL) {
        err = _Py_INIT_ERR("failed to get the LC_CTYPE locale");
        goto done;
    }
    init_ctype_locale = _PyMem_RawStrdup(loc);
    if (init_ctype_locale == NULL) {
        err = _Py_INIT_NO_MEMORY();
        goto done;
    }

    /* Keep a copy of the initial configuration to be able to reset it
       before reading the configuration again */
    if (_PyPreConfig_Copy(&save_config, config) < 0) {
        err = _Py_INIT_NO_MEMORY();
        goto done;
    }

    /* Set LC_CTYPE to the user preferred locale */
    _Py_SetLocaleFromEnv(LC_CTYPE);

    while (1) {
        int utf8_mode = config->utf8_mode;

        /* Watchdog to prevent an infinite loop */
        loops++;
        if (loops == 3) {
            err = _Py_INIT_ERR("Encoding changed twice while "
                               "reading the configuration");
            goto done;
        }

        /* bpo-34207: Py_DecodeLocale() and Py_EncodeLocale() depend
           on Py_UTF8Mode and Py_LegacyWindowsFSEncodingFlag. */
        Py_UTF8Mode = config->utf8_mode;
#ifdef MS_WINDOWS
        Py_LegacyWindowsFSEncodingFlag = config->legacy_windows_fs_encoding;
#endif

        err = preconfig_from_argv(config, args);
        if (_Py_INIT_FAILED(err)) {
            goto done;
        }

        if (locale_coerced) {
            config->coerce_c_locale = 1;
        }

        /* The legacy C locale assumes ASCII as the default text encoding, which
         * causes problems not only for the CPython runtime, but also other
         * components like GNU readline.
         *
         * Accordingly, when the CLI detects it, it attempts to coerce it to a
         * more capable UTF-8 based alternative.
         *
         * See the documentation of the PYTHONCOERCECLOCALE setting for more
         * details.
         */
        int encoding_changed = 0;
        if (config->coerce_c_locale && !locale_coerced) {
            locale_coerced = 1;
            _Py_CoerceLegacyLocale(0);
            encoding_changed = 1;
        }

        if (utf8_mode == -1) {
            if (config->utf8_mode == 1) {
                /* UTF-8 Mode enabled */
                encoding_changed = 1;
            }
        }
        else {
            if (config->utf8_mode != utf8_mode) {
                encoding_changed = 1;
            }
        }

        if (!encoding_changed) {
            break;
        }

        /* Reset the configuration before reading again the configuration,
           just keep the UTF-8 Mode and the C locale coercion values. */
        int new_utf8_mode = config->utf8_mode;
        int new_coerce_c_locale = config->coerce_c_locale;
        if (_PyPreConfig_Copy(config, &save_config) < 0) {
            err = _Py_INIT_NO_MEMORY();
            goto done;
        }
        config->utf8_mode = new_utf8_mode;
        config->coerce_c_locale = new_coerce_c_locale;

        /* The encoding changed: read again the configuration
           with the new encoding */
    }
    err = _Py_INIT_OK();

done:
    if (init_ctype_locale != NULL) {
        setlocale(LC_CTYPE, init_ctype_locale);
        /* Release the copy allocated by _PyMem_RawStrdup(): it was
           previously leaked on every call */
        PyMem_RawFree(init_ctype_locale);
    }
    _PyPreConfig_Clear(&save_config);
    Py_UTF8Mode = init_utf8_mode ;
#ifdef MS_WINDOWS
    Py_LegacyWindowsFSEncodingFlag = init_legacy_encoding;
#endif
    return err;
}
/* Apply the preconfiguration: copy it into the global configuration
   variables, coerce the legacy C locale if config->coerce_c_locale is set
   (passing coerce_c_locale_warn to _Py_CoerceLegacyLocale()), then set the
   LC_CTYPE locale to the user preferred locale. */
void
_PyPreConfig_Write(const _PyPreConfig *config)
{
    _PyPreConfig_SetGlobalConfig(config);

    const int coerce = config->coerce_c_locale;
    if (coerce != 0) {
        _Py_CoerceLegacyLocale(config->coerce_c_locale_warn);
    }

    /* Set LC_CTYPE to the user preferred locale */
    _Py_SetLocaleFromEnv(LC_CTYPE);
}

View File

@ -287,7 +287,7 @@ static const char *_C_LOCALE_WARNING =
static void
_emit_stderr_warning_for_legacy_locale(const _PyCoreConfig *core_config)
{
if (core_config->coerce_c_locale_warn && _Py_LegacyLocaleDetected()) {
if (core_config->preconfig.coerce_c_locale_warn && _Py_LegacyLocaleDetected()) {
PySys_FormatStderr("%s", _C_LOCALE_WARNING);
}
}

View File

@ -2181,7 +2181,7 @@ make_flags(void)
SetFlag(config->use_hash_seed == 0 || config->hash_seed != 0);
SetFlag(config->preconfig.isolated);
PyStructSequence_SET_ITEM(seq, pos++, PyBool_FromLong(config->dev_mode));
SetFlag(config->utf8_mode);
SetFlag(config->preconfig.utf8_mode);
#undef SetFlag
if (PyErr_Occurred()) {