mirror of https://github.com/python/cpython.git
On 17-Mar-2000, Marc-Andre Lemburg said:
Attached you find an update of the Unicode implementation. The patch is against the current CVS version. I would appreciate it if someone with CVS checkin permissions could check the changes in. The patch contains all bug fixes and patches sent this week and also fixes a leak in the codecs code and a bug in the free list code for Unicode objects (which only shows up when compiling Python with Py_DEBUG; thanks to MarkH for spotting this one).
This commit is contained in:
parent
abc411bac8
commit
51ac58039f
|
@ -1,8 +1,5 @@
|
|||
#ifndef Py_UNICODEOBJECT_H
|
||||
#define Py_UNICODEOBJECT_H
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
||||
|
@ -109,8 +106,9 @@ typedef unsigned short Py_UNICODE;
|
|||
/* --- Internal Unicode Operations ---------------------------------------- */
|
||||
|
||||
/* If you want Python to use the compiler's wctype.h functions instead
|
||||
of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS.
|
||||
This reduces the interpreter's code size. */
|
||||
of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
|
||||
configure Python using --with-ctype-functions. This reduces the
|
||||
interpreter's code size. */
|
||||
|
||||
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
|
||||
|
||||
|
@ -169,6 +167,10 @@ typedef unsigned short Py_UNICODE;
|
|||
(!memcmp((string)->str + (offset), (substring)->str,\
|
||||
(substring)->length*sizeof(Py_UNICODE)))
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* --- Unicode Type ------------------------------------------------------- */
|
||||
|
||||
typedef struct {
|
||||
|
@ -647,7 +649,7 @@ extern DL_IMPORT(int) PyUnicode_Find(
|
|||
int direction /* Find direction: +1 forward, -1 backward */
|
||||
);
|
||||
|
||||
/* Count the number of occurances of substr in str[start:end]. */
|
||||
/* Count the number of occurrences of substr in str[start:end]. */
|
||||
|
||||
extern DL_IMPORT(int) PyUnicode_Count(
|
||||
PyObject *str, /* String */
|
||||
|
@ -656,7 +658,7 @@ extern DL_IMPORT(int) PyUnicode_Count(
|
|||
int end /* Stop index */
|
||||
);
|
||||
|
||||
/* Replace at most maxcount occurances of substr in str with replstr
|
||||
/* Replace at most maxcount occurrences of substr in str with replstr
|
||||
and return the resulting Unicode object. */
|
||||
|
||||
extern DL_IMPORT(PyObject *) PyUnicode_Replace(
|
||||
|
|
|
@ -30,13 +30,13 @@
|
|||
import string,codecs,aliases
|
||||
|
||||
_cache = {}
|
||||
_unkown = '--unkown--'
|
||||
_unknown = '--unknown--'
|
||||
|
||||
def search_function(encoding):
|
||||
|
||||
# Cache lookup
|
||||
entry = _cache.get(encoding,_unkown)
|
||||
if entry is not _unkown:
|
||||
entry = _cache.get(encoding,_unknown)
|
||||
if entry is not _unknown:
|
||||
return entry
|
||||
|
||||
# Import the module
|
||||
|
|
|
@ -143,6 +143,7 @@ def __init__(self): self.seq = [7, 'hello', 123L]
|
|||
test('translate', 'xyz', 'xyz', table)
|
||||
|
||||
test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1)
|
||||
test('replace', 'one!two!three!', 'onetwothree', '!', '')
|
||||
test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2)
|
||||
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3)
|
||||
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4)
|
||||
|
|
|
@ -108,6 +108,7 @@ def __init__(self): self.seq = [7, u'hello', 123L]
|
|||
test('translate', u'xyz', u'xyz', table)
|
||||
|
||||
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
|
||||
test('replace', u'one!two!three!', u'onetwothree', '!', '')
|
||||
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
|
||||
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
|
||||
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)
|
||||
|
|
|
@ -743,8 +743,9 @@ For explicit handling of files using Unicode, the standard
|
|||
stream codecs as available through the codecs module should
|
||||
be used.
|
||||
|
||||
XXX There should be a short-cut open(filename,mode,encoding) available which
|
||||
also assures that mode contains the 'b' character when needed.
|
||||
The codecs module should provide a short-cut open(filename,mode,encoding)
|
||||
available which also assures that mode contains the 'b' character when
|
||||
needed.
|
||||
|
||||
|
||||
File/Stream Input:
|
||||
|
@ -810,6 +811,10 @@ Unicode-Mappings:
|
|||
Introduction to Unicode (a little outdated but still nice to read):
|
||||
http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html
|
||||
|
||||
For comparison:
|
||||
Introducing Unicode to ECMAScript --
|
||||
http://www-4.ibm.com/software/developer/library/internationalization-support.html
|
||||
|
||||
Encodings:
|
||||
|
||||
Overview:
|
||||
|
@ -832,7 +837,7 @@ Encodings:
|
|||
|
||||
History of this Proposal:
|
||||
-------------------------
|
||||
1.2:
|
||||
1.2: Removed POD about codecs.open()
|
||||
1.1: Added note about comparisons and hash values. Added note about
|
||||
case mapping algorithms. Changed stream codecs .read() and
|
||||
.write() method to match the standard file-like object methods
|
||||
|
|
|
@ -1054,7 +1054,7 @@ strop_translate(self, args)
|
|||
|
||||
strstr replacement for arbitrary blocks of memory.
|
||||
|
||||
Locates the first occurance in the memory pointed to by MEM of the
|
||||
Locates the first occurrence in the memory pointed to by MEM of the
|
||||
contents of memory pointed to by PAT. Returns the index into MEM if
|
||||
found, or -1 if not found. If len of PAT is greater than length of
|
||||
MEM, the function returns -1.
|
||||
|
|
|
@ -1395,7 +1395,7 @@ string_translate(self, args)
|
|||
|
||||
strstr replacement for arbitrary blocks of memory.
|
||||
|
||||
Locates the first occurance in the memory pointed to by MEM of the
|
||||
Locates the first occurrence in the memory pointed to by MEM of the
|
||||
contents of memory pointed to by PAT. Returns the index into MEM if
|
||||
found, or -1 if not found. If len of PAT is greater than length of
|
||||
MEM, the function returns -1.
|
||||
|
@ -1578,7 +1578,7 @@ string_replace(self, args)
|
|||
return NULL;
|
||||
|
||||
if (sub_len <= 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty replacement string");
|
||||
PyErr_SetString(PyExc_ValueError, "empty pattern string");
|
||||
return NULL;
|
||||
}
|
||||
new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);
|
||||
|
|
|
@ -83,7 +83,7 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
|
|||
all objects on the free list having a size less than this
|
||||
limit. This reduces malloc() overhead for small Unicode objects.
|
||||
|
||||
At worse this will result in MAX_UNICODE_FREELIST_SIZE *
|
||||
At worst this will result in MAX_UNICODE_FREELIST_SIZE *
|
||||
(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
|
||||
malloc()-overhead) bytes of unused garbage.
|
||||
|
||||
|
@ -180,7 +180,7 @@ PyUnicodeObject *_PyUnicode_New(int length)
|
|||
unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
|
||||
unicode_freelist_size--;
|
||||
unicode->ob_type = &PyUnicode_Type;
|
||||
_Py_NewReference(unicode);
|
||||
_Py_NewReference((PyObject *)unicode);
|
||||
if (unicode->str) {
|
||||
if (unicode->length < length &&
|
||||
_PyUnicode_Resize(unicode, length)) {
|
||||
|
@ -199,16 +199,19 @@ PyUnicodeObject *_PyUnicode_New(int length)
|
|||
unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
|
||||
}
|
||||
|
||||
if (!unicode->str) {
|
||||
PyMem_DEL(unicode);
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
if (!unicode->str)
|
||||
goto onError;
|
||||
unicode->str[length] = 0;
|
||||
unicode->length = length;
|
||||
unicode->hash = -1;
|
||||
unicode->utf8str = NULL;
|
||||
return unicode;
|
||||
|
||||
onError:
|
||||
_Py_ForgetReference((PyObject *)unicode);
|
||||
PyMem_DEL(unicode);
|
||||
PyErr_NoMemory();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static
|
||||
|
@ -224,7 +227,6 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode)
|
|||
*(PyUnicodeObject **)unicode = unicode_freelist;
|
||||
unicode_freelist = unicode;
|
||||
unicode_freelist_size++;
|
||||
_Py_ForgetReference(unicode);
|
||||
}
|
||||
else {
|
||||
free(unicode->str);
|
||||
|
@ -489,7 +491,7 @@ int utf8_decoding_error(const char **source,
|
|||
}
|
||||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"UTF-8 decoding error; unkown error handling code: %s",
|
||||
"UTF-8 decoding error; unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -611,7 +613,7 @@ int utf8_encoding_error(const Py_UNICODE **source,
|
|||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"UTF-8 encoding error; "
|
||||
"unkown error handling code: %s",
|
||||
"unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -733,7 +735,7 @@ int utf16_decoding_error(const Py_UNICODE **source,
|
|||
}
|
||||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"UTF-16 decoding error; unkown error handling code: %s",
|
||||
"UTF-16 decoding error; unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -921,7 +923,7 @@ int unicodeescape_decoding_error(const char **source,
|
|||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"Unicode-Escape decoding error; "
|
||||
"unkown error handling code: %s",
|
||||
"unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -1051,6 +1053,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
|||
|
||||
*/
|
||||
|
||||
static const Py_UNICODE *findchar(const Py_UNICODE *s,
|
||||
int size,
|
||||
Py_UNICODE ch);
|
||||
|
||||
static
|
||||
PyObject *unicodeescape_string(const Py_UNICODE *s,
|
||||
int size,
|
||||
|
@ -1069,9 +1075,6 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
|
|||
p = q = PyString_AS_STRING(repr);
|
||||
|
||||
if (quotes) {
|
||||
static const Py_UNICODE *findchar(const Py_UNICODE *s,
|
||||
int size,
|
||||
Py_UNICODE ch);
|
||||
*p++ = 'u';
|
||||
*p++ = (findchar(s, size, '\'') &&
|
||||
!findchar(s, size, '"')) ? '"' : '\'';
|
||||
|
@ -1298,7 +1301,7 @@ int latin1_encoding_error(const Py_UNICODE **source,
|
|||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"Latin-1 encoding error; "
|
||||
"unkown error handling code: %s",
|
||||
"unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -1369,7 +1372,7 @@ int ascii_decoding_error(const char **source,
|
|||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"ASCII decoding error; "
|
||||
"unkown error handling code: %s",
|
||||
"unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -1431,7 +1434,7 @@ int ascii_encoding_error(const Py_UNICODE **source,
|
|||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"ASCII encoding error; "
|
||||
"unkown error handling code: %s",
|
||||
"unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -1502,7 +1505,7 @@ int charmap_decoding_error(const char **source,
|
|||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"charmap decoding error; "
|
||||
"unkown error handling code: %s",
|
||||
"unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -1618,7 +1621,7 @@ int charmap_encoding_error(const Py_UNICODE **source,
|
|||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"charmap encoding error; "
|
||||
"unkown error handling code: %s",
|
||||
"unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
@ -1750,7 +1753,7 @@ int translate_error(const Py_UNICODE **source,
|
|||
else {
|
||||
PyErr_Format(PyExc_ValueError,
|
||||
"translate error; "
|
||||
"unkown error handling code: %s",
|
||||
"unknown error handling code: %s",
|
||||
errors);
|
||||
return -1;
|
||||
}
|
||||
|
|
|
@ -93,9 +93,14 @@ PyObject *lowercasestring(const char *string)
|
|||
|
||||
PyObject *_PyCodec_Lookup(const char *encoding)
|
||||
{
|
||||
PyObject *result, *args = NULL, *v;
|
||||
PyObject *result, *args = NULL, *v = NULL;
|
||||
int i, len;
|
||||
|
||||
if (_PyCodec_SearchCache == NULL || _PyCodec_SearchPath == NULL) {
|
||||
PyErr_SetString(PyExc_SystemError,
|
||||
"codec module not properly initialized");
|
||||
goto onError;
|
||||
}
|
||||
if (!import_encodings_called)
|
||||
import_encodings();
|
||||
|
||||
|
@ -109,6 +114,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
|
|||
result = PyDict_GetItem(_PyCodec_SearchCache, v);
|
||||
if (result != NULL) {
|
||||
Py_INCREF(result);
|
||||
Py_DECREF(v);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -121,6 +127,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
|
|||
if (args == NULL)
|
||||
goto onError;
|
||||
PyTuple_SET_ITEM(args,0,v);
|
||||
v = NULL;
|
||||
|
||||
for (i = 0; i < len; i++) {
|
||||
PyObject *func;
|
||||
|
@ -146,7 +153,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
|
|||
if (i == len) {
|
||||
/* XXX Perhaps we should cache misses too ? */
|
||||
PyErr_SetString(PyExc_LookupError,
|
||||
"unkown encoding");
|
||||
"unknown encoding");
|
||||
goto onError;
|
||||
}
|
||||
|
||||
|
@ -156,6 +163,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
|
|||
return result;
|
||||
|
||||
onError:
|
||||
Py_XDECREF(v);
|
||||
Py_XDECREF(args);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -378,5 +386,7 @@ void _PyCodecRegistry_Init()
|
|||
void _PyCodecRegistry_Fini()
|
||||
{
|
||||
Py_XDECREF(_PyCodec_SearchPath);
|
||||
_PyCodec_SearchPath = NULL;
|
||||
Py_XDECREF(_PyCodec_SearchCache);
|
||||
_PyCodec_SearchCache = NULL;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue