Issue #5433: Excessive newline detection optimization in IncrementalNewlineDecoder

This commit is contained in:
Antoine Pitrou 2009-03-06 23:40:56 +00:00
parent 2db74c2412
commit 66913e2213
2 changed files with 43 additions and 12 deletions

View File

@ -1915,6 +1915,19 @@ def test_newline_decoder(self):
decoder = self.IncrementalNewlineDecoder(decoder, translate=True) decoder = self.IncrementalNewlineDecoder(decoder, translate=True)
self.check_newline_decoding_utf8(decoder) self.check_newline_decoding_utf8(decoder)
def test_newline_bytes(self):
    """Check that newline *bytes* inside a character are not taken for newlines.

    Regression test for issue 5433. The decoder is built with no wrapped
    decoder (``None``) and fed the single characters U+0D00 and U+0A00.
    Neither is a line ending, so each must come back unchanged and
    ``newlines`` must stay ``None``, in both the translating and the
    non-translating mode.
    """
    # Issue 5433: Excessive optimization in IncrementalNewlineDecoder
    def _check(dec):
        # A fresh decoder has not seen any newline yet.
        self.assertEqual(dec.newlines, None)
        # U+0D00 is not "\r": it must pass through and record nothing.
        self.assertEqual(dec.decode("\u0D00"), "\u0D00")
        self.assertEqual(dec.newlines, None)
        # U+0A00 is not "\n": same expectation.
        self.assertEqual(dec.decode("\u0A00"), "\u0A00")
        self.assertEqual(dec.newlines, None)

    # Exercise both decoding modes with a fresh decoder each time.
    for translate in (False, True):
        dec = self.IncrementalNewlineDecoder(None, translate=translate)
        _check(dec)
class CIncrementalNewlineDecoderTest(IncrementalNewlineDecoderTest): class CIncrementalNewlineDecoderTest(IncrementalNewlineDecoderTest):
pass pass

View File

@ -305,22 +305,40 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
for the \r *byte* with the libc's optimized memchr. for the \r *byte* with the libc's optimized memchr.
*/ */
if (seennl == SEEN_LF || seennl == 0) { if (seennl == SEEN_LF || seennl == 0) {
int has_cr, has_lf; only_lf = !(memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
has_lf = (seennl == SEEN_LF) ||
(memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL);
has_cr = (memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
if (has_lf && !has_cr) {
only_lf = 1;
seennl = SEEN_LF;
}
} }
if (!self->translate) { if (only_lf) {
/* If not already seen, quick scan for a possible "\n" character.
(there's nothing else to be done, even when in translation mode)
*/
if (seennl == 0 &&
memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL) {
Py_UNICODE *s, *end;
s = in_str;
end = in_str + len;
for (;;) {
Py_UNICODE c;
/* Fast loop for non-control characters */
while (*s > '\n')
s++;
c = *s++;
if (c == '\n') {
seennl |= SEEN_LF;
break;
}
if (s > end)
break;
}
}
/* Finished: we have scanned for newlines, and none of them
need translating */
}
else if (!self->translate) {
Py_UNICODE *s, *end; Py_UNICODE *s, *end;
/* We have already seen all newline types, no need to scan again */
if (seennl == SEEN_ALL) if (seennl == SEEN_ALL)
goto endscan; goto endscan;
if (only_lf)
goto endscan;
s = in_str; s = in_str;
end = in_str + len; end = in_str + len;
for (;;) { for (;;) {
@ -347,7 +365,7 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
endscan: endscan:
; ;
} }
else if (!only_lf) { else {
PyObject *translated = NULL; PyObject *translated = NULL;
Py_UNICODE *out_str; Py_UNICODE *out_str;
Py_UNICODE *in, *out, *end; Py_UNICODE *in, *out, *end;