Issue #5433: Excessive newline detection optimization in IncrementalNewlineDecoder

2009-03-06 23:40:56 +00:00 · 2009-03-06 23:40:56 +00:00 · 66913e2213
parent 2db74c2412
commit 66913e2213
2 changed files with 43 additions and 12 deletions
--- a/Lib/test/test_io.py
+++ b/Lib/test/test_io.py
@ -1915,6 +1915,19 @@ def test_newline_decoder(self):
        decoder = self.IncrementalNewlineDecoder(decoder, translate=True)
        self.check_newline_decoding_utf8(decoder)
    def test_newline_bytes(self):
        """Check that U+0D00 and U+0A00 are never reported as newlines.

        Regression test for issue 5433 (excessive optimization in
        IncrementalNewlineDecoder).  Neither character is a line ending, so
        decoding each one must return it unchanged and leave ``newlines`` at
        None, both with and without newline translation.
        """
        # NOTE(review): these code points were presumably chosen because their
        # in-memory representation contains a 0x0D / 0x0A byte, which a
        # byte-wise search for "\r" / "\n" would wrongly match -- confirm
        # against the issue report.

        def _check(dec):
            # A fresh decoder has seen no newline of any kind yet.
            self.assertEqual(dec.newlines, None)
            self.assertEqual(dec.decode("\u0D00"), "\u0D00")
            self.assertEqual(dec.newlines, None)
            self.assertEqual(dec.decode("\u0A00"), "\u0A00")
            self.assertEqual(dec.newlines, None)

        # No underlying decoder is wrapped (None), so str input is fed
        # straight to the newline-handling layer.
        for translate in (False, True):
            _check(self.IncrementalNewlineDecoder(None, translate=translate))
 # Adds no tests of its own: every test method is inherited unchanged from
 # IncrementalNewlineDecoderTest.
 # NOTE(review): the "C" prefix suggests this variant is meant to run the
 # shared tests against the C implementation of the decoder; how
 # self.IncrementalNewlineDecoder gets bound is not visible here -- confirm.
 class CIncrementalNewlineDecoderTest(IncrementalNewlineDecoderTest):
    pass
--- a/Modules/_textio.c
+++ b/Modules/_textio.c
@ -305,22 +305,40 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
           for the \r *byte* with the libc's optimized memchr.
           */
        if (seennl == SEEN_LF || seennl == 0) {
-            int has_cr, has_lf;
+            only_lf = !(memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
            has_lf = (seennl == SEEN_LF) ||
                    (memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL);
            has_cr = (memchr(in_str, '\r', len * sizeof(Py_UNICODE)) != NULL);
            if (has_lf && !has_cr) {
                only_lf = 1;
                seennl = SEEN_LF;
            }
        }
-        if (!self->translate) {
+        if (only_lf) {
            /* If not already seen, quick scan for a possible "\n" character.
               (there's nothing else to be done, even when in translation mode)
            */
            if (seennl == 0 &&
                memchr(in_str, '\n', len * sizeof(Py_UNICODE)) != NULL) {
                Py_UNICODE *s, *end;
                s = in_str;
                end = in_str + len;
                for (;;) {
                    Py_UNICODE c;
                    /* Fast loop for non-control characters */
                    while (*s > '\n')
                        s++;
                    c = *s++;
                    if (c == '\n') {
                        seennl |= SEEN_LF;
                        break;
                    }
                    if (s > end)
                        break;
                }
            }
            /* Finished: we have scanned for newlines, and none of them
               need translating */
        }
        else if (!self->translate) {
            Py_UNICODE *s, *end;
            /* We have already seen all newline types, no need to scan again */
            if (seennl == SEEN_ALL)
                goto endscan;
            if (only_lf)
                goto endscan;
            s = in_str;
            end = in_str + len;
            for (;;) {
@ -347,7 +365,7 @@ _PyIncrementalNewlineDecoder_decode(PyObject *_self,
        endscan:
            ;
        }
-        else if (!only_lf) {
+        else {
            PyObject *translated = NULL;
            Py_UNICODE *out_str;
            Py_UNICODE *in, *out, *end;