On 17-Mar-2000, Marc-Andre Lemburg said:

Attached you will find an update of the Unicode implementation.

    The patch is against the current CVS version. I would appreciate it
    if someone with CVS checkin permissions could check the changes
    in.

    The patch incorporates all bug fixes and patches sent this week, and
    also fixes a leak in the codecs code and a bug in the free list code
    for Unicode objects (which only shows up when compiling Python
    with Py_DEBUG; thanks to MarkH for spotting this one).
Barry Warsaw 2000-03-20 16:36:48 +00:00
parent abc411bac8
commit 51ac58039f
9 changed files with 61 additions and 39 deletions
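The free list fix mentioned in the message shows up in the Unicode allocator hunks further down in this diff: _Py_NewReference() and _Py_ForgetReference() have to pair up exactly, which matters in a Py_DEBUG build, where they are real functions tracking the set of live objects. A minimal sketch of that invariant, using hypothetical names and simplified fields (an illustration under assumptions, not the patched code):

    #include "Python.h"

    /* Sketch only.  In a Py_DEBUG build _Py_NewReference() registers an
     * object as live and _Py_ForgetReference() unregisters it, so each
     * object must be registered and unregistered exactly once. */
    static PyUnicodeObject *freelist_sketch = NULL;

    static PyUnicodeObject *
    sketch_alloc_unicode(int length)
    {
        PyUnicodeObject *u;

        if (freelist_sketch != NULL) {
            /* Recycle a cached object: it was unregistered when it was
             * freed, so re-register it here.  In a debug build
             * _Py_NewReference() is a function taking PyObject *, hence
             * the cast. */
            u = freelist_sketch;
            freelist_sketch = *(PyUnicodeObject **)u;
            u->ob_type = &PyUnicode_Type;
            _Py_NewReference((PyObject *)u);
        }
        else {
            u = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
            if (u == NULL)
                return NULL;
        }
        u->str = PyMem_NEW(Py_UNICODE, length + 1);
        if (u->str == NULL) {
            /* Error path: unregister before releasing the raw memory so
             * the debug build's bookkeeping stays consistent. */
            _Py_ForgetReference((PyObject *)u);
            PyMem_DEL(u);
            PyErr_NoMemory();
            return NULL;
        }
        u->str[length] = 0;
        u->length = length;
        return u;
    }

    static void
    sketch_free_unicode(PyUnicodeObject *u)
    {
        /* A debug build unregisters the object before the destructor runs,
         * so calling _Py_ForgetReference() again here would unregister it
         * twice; the patch removes exactly such a call from the real free
         * function. */
        PyMem_DEL(u->str);
        *(PyUnicodeObject **)u = freelist_sketch;
        freelist_sketch = u;
    }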

View File

@@ -1,8 +1,5 @@
#ifndef Py_UNICODEOBJECT_H
#define Py_UNICODEOBJECT_H
#ifdef __cplusplus
extern "C" {
#endif
/*
@@ -109,8 +106,9 @@ typedef unsigned short Py_UNICODE;
/* --- Internal Unicode Operations ---------------------------------------- */
/* If you want Python to use the compiler's wctype.h functions instead
of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS.
This reduces the interpreter's code size. */
of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
configure Python using --with-ctype-functions. This reduces the
interpreter's code size. */
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
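The comment above documents the new --with-ctype-functions configure switch. As a rough sketch of what such a switch selects between (the macro and function names below are assumptions based on the rest of this header, not part of the patch):

    /* Sketch: route a character-class macro either to the compiler's
     * <wctype.h> (smaller interpreter, platform tables) or to Python's
     * own implementation. */
    #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
    #include <wctype.h>
    #define Py_UNICODE_ISSPACE(ch)  iswspace(ch)
    #define Py_UNICODE_TOLOWER(ch)  towlower(ch)
    #else
    #define Py_UNICODE_ISSPACE(ch)  _PyUnicode_IsWhitespace(ch)
    #define Py_UNICODE_TOLOWER(ch)  _PyUnicode_ToLowercase(ch)
    #endif

Either way the calling code is unchanged; the switch only trades Python's bundled character tables for the platform's wide-character functions to save code size.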
@@ -169,6 +167,10 @@ typedef unsigned short Py_UNICODE;
(!memcmp((string)->str + (offset), (substring)->str,\
(substring)->length*sizeof(Py_UNICODE)))
#ifdef __cplusplus
extern "C" {
#endif
/* --- Unicode Type ------------------------------------------------------- */
typedef struct {
@@ -647,7 +649,7 @@ extern DL_IMPORT(int) PyUnicode_Find(
int direction /* Find direction: +1 forward, -1 backward */
);
/* Count the number of occurances of substr in str[start:end]. */
/* Count the number of occurrences of substr in str[start:end]. */
extern DL_IMPORT(int) PyUnicode_Count(
PyObject *str, /* String */
@@ -656,7 +658,7 @@ extern DL_IMPORT(int) PyUnicode_Count(
int end /* Stop index */
);
/* Replace at most maxcount occurances of substr in str with replstr
/* Replace at most maxcount occurrences of substr in str with replstr
and return the resulting Unicode object. */
extern DL_IMPORT(PyObject *) PyUnicode_Replace(
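The declarations above use the int-based signatures of the time (later Python versions switch to Py_ssize_t). A short usage sketch built on what is declared here, with the maxcount convention marked as an assumption:

    #include "Python.h"

    /* Sketch: count occurrences of substr in str, then replace them all.
     * Assumes a negative PyUnicode_Count() result signals an error with an
     * exception set, and that maxcount == -1 means "replace everything"
     * (consistent with the .replace() method semantics). */
    static PyObject *
    count_then_replace(PyObject *str, PyObject *substr, PyObject *replstr)
    {
        int n = PyUnicode_Count(str, substr, 0, PyUnicode_GET_SIZE(str));

        if (n < 0)
            return NULL;
        if (n == 0) {
            Py_INCREF(str);             /* nothing to replace */
            return str;
        }
        return PyUnicode_Replace(str, substr, replstr, -1);
    }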

View File

@@ -30,13 +30,13 @@
import string,codecs,aliases
_cache = {}
_unkown = '--unkown--'
_unknown = '--unknown--'
def search_function(encoding):
# Cache lookup
entry = _cache.get(encoding,_unkown)
if entry is not _unkown:
entry = _cache.get(encoding,_unknown)
if entry is not _unknown:
return entry
# Import the module

View File

@@ -143,6 +143,7 @@ def __init__(self): self.seq = [7, 'hello', 123L]
test('translate', 'xyz', 'xyz', table)
test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1)
test('replace', 'one!two!three!', 'onetwothree', '!', '')
test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2)
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3)
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4)

View File

@@ -108,6 +108,7 @@ def __init__(self): self.seq = [7, u'hello', 123L]
test('translate', u'xyz', u'xyz', table)
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)

View File

@@ -743,8 +743,9 @@ For explicit handling of files using Unicode, the standard
stream codecs as available through the codecs module should
be used.
XXX There should be a short-cut open(filename,mode,encoding) available which
also assures that mode contains the 'b' character when needed.
The codecs module should provide a short-cut open(filename,mode,encoding)
available which also assures that mode contains the 'b' character when
needed.
File/Stream Input:
@@ -810,6 +811,10 @@ Unicode-Mappings:
Introduction to Unicode (a little outdated by still nice to read):
http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html
For comparison:
Introducing Unicode to ECMAScript --
http://www-4.ibm.com/software/developer/library/internationalization-support.html
Encodings:
Overview:
@@ -832,7 +837,7 @@ Encodings:
History of this Proposal:
-------------------------
1.2:
1.2: Removed POD about codecs.open()
1.1: Added note about comparisons and hash values. Added note about
case mapping algorithms. Changed stream codecs .read() and
.write() method to match the standard file-like object methods

View File

@@ -1054,7 +1054,7 @@ strop_translate(self, args)
strstr replacement for arbitrary blocks of memory.
Locates the first occurance in the memory pointed to by MEM of the
Locates the first occurrence in the memory pointed to by MEM of the
contents of memory pointed to by PAT. Returns the index into MEM if
found, or -1 if not found. If len of PAT is greater than length of
MEM, the function returns -1.
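The comment above is a compact specification; a direct implementation of that contract (a sketch, not the module's own helper) looks like this:

    #include <string.h>

    /* Return the index of the first occurrence of pat[0..pat_len) inside
     * mem[0..mem_len), or -1 if there is none or PAT is longer than MEM,
     * exactly as described above. */
    static int
    memfind_sketch(const char *mem, int mem_len, const char *pat, int pat_len)
    {
        int i;

        if (pat_len > mem_len)
            return -1;
        for (i = 0; i + pat_len <= mem_len; i++) {
            if (memcmp(mem + i, pat, pat_len) == 0)
                return i;
        }
        return -1;
    }

An empty pattern trivially matches at index 0 here; the string_replace() hunk further down rejects an empty pattern with a ValueError before any searching happens, so that case never reaches the search helper.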

View File

@@ -1395,7 +1395,7 @@ string_translate(self, args)
strstr replacement for arbitrary blocks of memory.
Locates the first occurance in the memory pointed to by MEM of the
Locates the first occurrence in the memory pointed to by MEM of the
contents of memory pointed to by PAT. Returns the index into MEM if
found, or -1 if not found. If len of PAT is greater than length of
MEM, the function returns -1.
@@ -1578,7 +1578,7 @@ string_replace(self, args)
return NULL;
if (sub_len <= 0) {
PyErr_SetString(PyExc_ValueError, "empty replacement string");
PyErr_SetString(PyExc_ValueError, "empty pattern string");
return NULL;
}
new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);

View File

@@ -83,7 +83,7 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
all objects on the free list having a size less than this
limit. This reduces malloc() overhead for small Unicode objects.
At worse this will result in MAX_UNICODE_FREELIST_SIZE *
At worst this will result in MAX_UNICODE_FREELIST_SIZE *
(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
malloc()-overhead) bytes of unused garbage.
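To put that bound in numbers, a back-of-the-envelope check (every constant below is an assumed illustration value, not taken from this patch):

    #include <stdio.h>

    #define SKETCH_FREELIST_LIMIT   1024  /* assumed free list size limit    */
    #define SKETCH_STAYALIVE_BYTES    16  /* assumed kept-buffer allowance   */
    #define SKETCH_MALLOC_OVERHEAD    16  /* assumed per-allocation overhead */

    int main(void)
    {
        /* A PyUnicodeObject of this vintage is roughly 24 bytes on a 32-bit
         * build (refcount, type, length, str, hash, utf8str). */
        unsigned long per_object =
            24 + SKETCH_STAYALIVE_BYTES + SKETCH_MALLOC_OVERHEAD;

        printf("worst-case cached garbage: ~%lu KB\n",
               SKETCH_FREELIST_LIMIT * per_object / 1024);
        return 0;
    }

With guesses in that range the cache tops out in the tens of kilobytes, a bounded price for avoiding malloc()/free() churn on short-lived Unicode objects.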
@@ -180,7 +180,7 @@ PyUnicodeObject *_PyUnicode_New(int length)
unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
unicode_freelist_size--;
unicode->ob_type = &PyUnicode_Type;
_Py_NewReference(unicode);
_Py_NewReference((PyObject *)unicode);
if (unicode->str) {
if (unicode->length < length &&
_PyUnicode_Resize(unicode, length)) {
@@ -199,16 +199,19 @@ PyUnicodeObject *_PyUnicode_New(int length)
unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
}
if (!unicode->str) {
PyMem_DEL(unicode);
PyErr_NoMemory();
return NULL;
}
if (!unicode->str)
goto onError;
unicode->str[length] = 0;
unicode->length = length;
unicode->hash = -1;
unicode->utf8str = NULL;
return unicode;
onError:
_Py_ForgetReference((PyObject *)unicode);
PyMem_DEL(unicode);
PyErr_NoMemory();
return NULL;
}
static
@@ -224,7 +227,6 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode)
*(PyUnicodeObject **)unicode = unicode_freelist;
unicode_freelist = unicode;
unicode_freelist_size++;
_Py_ForgetReference(unicode);
}
else {
free(unicode->str);
@@ -489,7 +491,7 @@ int utf8_decoding_error(const char **source,
}
else {
PyErr_Format(PyExc_ValueError,
"UTF-8 decoding error; unkown error handling code: %s",
"UTF-8 decoding error; unknown error handling code: %s",
errors);
return -1;
}
@@ -611,7 +613,7 @@ int utf8_encoding_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"UTF-8 encoding error; "
"unkown error handling code: %s",
"unknown error handling code: %s",
errors);
return -1;
}
@@ -733,7 +735,7 @@ int utf16_decoding_error(const Py_UNICODE **source,
}
else {
PyErr_Format(PyExc_ValueError,
"UTF-16 decoding error; unkown error handling code: %s",
"UTF-16 decoding error; unknown error handling code: %s",
errors);
return -1;
}
@@ -921,7 +923,7 @@ int unicodeescape_decoding_error(const char **source,
else {
PyErr_Format(PyExc_ValueError,
"Unicode-Escape decoding error; "
"unkown error handling code: %s",
"unknown error handling code: %s",
errors);
return -1;
}
@@ -1051,6 +1053,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*/
static const Py_UNICODE *findchar(const Py_UNICODE *s,
int size,
Py_UNICODE ch);
static
PyObject *unicodeescape_string(const Py_UNICODE *s,
int size,
@@ -1069,9 +1075,6 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
p = q = PyString_AS_STRING(repr);
if (quotes) {
static const Py_UNICODE *findchar(const Py_UNICODE *s,
int size,
Py_UNICODE ch);
*p++ = 'u';
*p++ = (findchar(s, size, '\'') &&
!findchar(s, size, '"')) ? '"' : '\'';
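The hunks above move the forward declaration of findchar() from inside unicodeescape_string() to file scope. For reference, a sketch of a function matching that declaration (the real definition lives elsewhere in the file and may differ in detail):

    /* Return a pointer to the first occurrence of ch in s[0..size), or
     * NULL when it does not occur (the contract implied by the boolean
     * use above). */
    static const Py_UNICODE *
    findchar_sketch(const Py_UNICODE *s, int size, Py_UNICODE ch)
    {
        while (size-- > 0) {
            if (*s == ch)
                return s;
            s++;
        }
        return NULL;
    }

With that contract, the quoting code above chooses a double quote only when the string contains a single quote and no double quote, mirroring repr() of 8-bit strings.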
@@ -1298,7 +1301,7 @@ int latin1_encoding_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"Latin-1 encoding error; "
"unkown error handling code: %s",
"unknown error handling code: %s",
errors);
return -1;
}
@@ -1369,7 +1372,7 @@ int ascii_decoding_error(const char **source,
else {
PyErr_Format(PyExc_ValueError,
"ASCII decoding error; "
"unkown error handling code: %s",
"unknown error handling code: %s",
errors);
return -1;
}
@@ -1431,7 +1434,7 @@ int ascii_encoding_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"ASCII encoding error; "
"unkown error handling code: %s",
"unknown error handling code: %s",
errors);
return -1;
}
@@ -1502,7 +1505,7 @@ int charmap_decoding_error(const char **source,
else {
PyErr_Format(PyExc_ValueError,
"charmap decoding error; "
"unkown error handling code: %s",
"unknown error handling code: %s",
errors);
return -1;
}
@@ -1618,7 +1621,7 @@ int charmap_encoding_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"charmap encoding error; "
"unkown error handling code: %s",
"unknown error handling code: %s",
errors);
return -1;
}
@@ -1750,7 +1753,7 @@ int translate_error(const Py_UNICODE **source,
else {
PyErr_Format(PyExc_ValueError,
"translate error; "
"unkown error handling code: %s",
"unknown error handling code: %s",
errors);
return -1;
}
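Every *_decoding_error/*_encoding_error helper patched above ends with the same fallback, and they all follow one dispatch convention. A simplified sketch of that convention (the real helpers also advance the source and destination pointers, and the "replace" behaviour is only summarised in a comment here):

    #include <string.h>
    #include "Python.h"

    static int
    codec_error_sketch(const char *errors, const char *reason)
    {
        if (errors == NULL || strcmp(errors, "strict") == 0) {
            PyErr_SetString(PyExc_ValueError, reason);  /* fail loudly */
            return -1;
        }
        if (strcmp(errors, "ignore") == 0)
            return 0;                /* drop the offending input */
        if (strcmp(errors, "replace") == 0)
            return 0;                /* the real helpers emit a replacement
                                        character here */
        PyErr_Format(PyExc_ValueError,
                     "unknown error handling code: %s", errors);
        return -1;
    }

The spelling fix is mechanical, but the message matters: an unsupported errors argument is reported as a ValueError naming the bad handler instead of being silently accepted.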

View File

@@ -93,9 +93,14 @@ PyObject *lowercasestring(const char *string)
PyObject *_PyCodec_Lookup(const char *encoding)
{
PyObject *result, *args = NULL, *v;
PyObject *result, *args = NULL, *v = NULL;
int i, len;
if (_PyCodec_SearchCache == NULL || _PyCodec_SearchPath == NULL) {
PyErr_SetString(PyExc_SystemError,
"codec module not properly initialized");
goto onError;
}
if (!import_encodings_called)
import_encodings();
@@ -109,6 +114,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
result = PyDict_GetItem(_PyCodec_SearchCache, v);
if (result != NULL) {
Py_INCREF(result);
Py_DECREF(v);
return result;
}
@@ -121,6 +127,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
if (args == NULL)
goto onError;
PyTuple_SET_ITEM(args,0,v);
v = NULL;
for (i = 0; i < len; i++) {
PyObject *func;
@@ -146,7 +153,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
if (i == len) {
/* XXX Perhaps we should cache misses too ? */
PyErr_SetString(PyExc_LookupError,
"unkown encoding");
"unknown encoding");
goto onError;
}
@@ -156,6 +163,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
return result;
onError:
Py_XDECREF(v);
Py_XDECREF(args);
return NULL;
}
@@ -378,5 +386,7 @@ void _PyCodecRegistry_Fini()
void _PyCodecRegistry_Fini()
{
Py_XDECREF(_PyCodec_SearchPath);
_PyCodec_SearchPath = NULL;
Py_XDECREF(_PyCodec_SearchCache);
_PyCodec_SearchCache = NULL;
}
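The codecs hunks above address the "leak in the codecs code" mentioned in the commit message: the encoding-name object v was never released on the cache-hit path, and once PyTuple_SET_ITEM() has stolen its reference the local pointer is cleared so the shared onError exit can Py_XDECREF() it safely. The _PyCodecRegistry_Fini() hunk also resets the global registries to NULL after releasing them, matching the new not-initialized guard at the top of the lookup. A generic sketch of the ownership pattern (illustrative names and simplified flow, not the real _PyCodec_Lookup()):

    #include "Python.h"

    static PyObject *
    lookup_sketch(PyObject *cache, const char *name)
    {
        PyObject *v = NULL, *args = NULL, *result;

        v = PyString_FromString(name);      /* new reference */
        if (v == NULL)
            goto onError;

        result = PyDict_GetItem(cache, v);  /* borrowed reference */
        if (result != NULL) {
            Py_INCREF(result);
            Py_DECREF(v);                   /* release v on the hit path too */
            return result;
        }

        args = PyTuple_New(1);
        if (args == NULL)
            goto onError;
        PyTuple_SET_ITEM(args, 0, v);       /* steals the reference to v */
        v = NULL;                           /* so onError must not DECREF it */

        /* ... call each registered search function with args, caching and
         * returning the first hit ... */
        PyErr_SetString(PyExc_LookupError, "unknown encoding");

     onError:
        Py_XDECREF(v);
        Py_XDECREF(args);
        return NULL;
    }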