From 282012593510a285fec5b8b5e42b04fef3ffffe0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lemburg?=
Date: Fri, 16 May 2003 17:07:51 +0000
Subject: [PATCH] Remove usage of re module from encodings package search function.

---
 Lib/encodings/__init__.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index 66bea5c0684..666afad6b6e 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -27,12 +27,17 @@
 
 """#"
 
-import codecs, exceptions, re
+import codecs, exceptions, types
 
 _cache = {}
 _unknown = '--unknown--'
 _import_tail = ['*']
-_norm_encoding_RE = re.compile('[^a-zA-Z0-9.]')
+_norm_encoding_map = ('                                              . '
+                      '0123456789       ABCDEFGHIJKLMNOPQRSTUVWXYZ     '
+                      ' abcdefghijklmnopqrstuvwxyz                     '
+                      '                                                '
+                      '                                                '
+                      '                ')
 
 class CodecRegistryError(exceptions.LookupError,
                          exceptions.SystemError):
@@ -45,10 +50,20 @@ def normalize_encoding(encoding):
     Normalization works as follows: all non-alphanumeric
     characters except the dot used for Python package names are
     collapsed and replaced with a single underscore, e.g. ' -;#'
-    becomes '_'.
+    becomes '_'. Leading and trailing underscores are removed.
+
+    Note that encoding names should be ASCII only; if they do use
+    non-ASCII characters, these must be Latin-1 compatible.
 
     """
-    return '_'.join(_norm_encoding_RE.split(encoding))
+    # Make sure we have an 8-bit string, because .translate() works
+    # differently for Unicode strings.
+    if type(encoding) is types.UnicodeType:
+        # Note that .encode('latin-1') does *not* use the codec
+        # registry, so this call doesn't recurse. (See unicodeobject.c
+        # PyUnicode_AsEncodedString() for details)
+        encoding = encoding.encode('latin-1')
+    return '_'.join(encoding.translate(_norm_encoding_map).split())
 
 def search_function(encoding):
 