From 00e41defe8801ef37548fb60abacb3be13156d2a Mon Sep 17 00:00:00 2001 From: Thomas Wouters Date: Fri, 23 Feb 2007 19:56:57 +0000 Subject: [PATCH] Bytes literal. --- Include/Python-ast.h | 12 ++++++--- Include/opcode.h | 2 +- Lib/compiler/ast.py | 14 +++++++++++ Lib/compiler/pyassem.py | 1 + Lib/compiler/pycodegen.py | 4 +++ Lib/compiler/transformer.py | 6 +++-- Lib/opcode.py | 1 + Lib/test/test_bytes.py | 14 ++++++++++- Lib/test/test_compiler.py | 24 ++++++++++++++++++ Parser/Python.asdl | 1 + Parser/tokenizer.c | 8 ++++++ Python/Python-ast.c | 41 ++++++++++++++++++++++++++++-- Python/ast.c | 50 ++++++++++++++++++++++++++++++------- Python/ceval.c | 13 ++++++++++ Python/compile.c | 7 +++++- 15 files changed, 179 insertions(+), 19 deletions(-) diff --git a/Include/Python-ast.h b/Include/Python-ast.h index c2fabfb2c3e..66d7b52580e 100644 --- a/Include/Python-ast.h +++ b/Include/Python-ast.h @@ -176,9 +176,9 @@ struct _stmt { enum _expr_kind {BoolOp_kind=1, BinOp_kind=2, UnaryOp_kind=3, Lambda_kind=4, IfExp_kind=5, Dict_kind=6, Set_kind=7, ListComp_kind=8, GeneratorExp_kind=9, Yield_kind=10, Compare_kind=11, - Call_kind=12, Num_kind=13, Str_kind=14, Ellipsis_kind=15, - Attribute_kind=16, Subscript_kind=17, Name_kind=18, - List_kind=19, Tuple_kind=20}; + Call_kind=12, Num_kind=13, Str_kind=14, Bytes_kind=15, + Ellipsis_kind=16, Attribute_kind=17, Subscript_kind=18, + Name_kind=19, List_kind=20, Tuple_kind=21}; struct _expr { enum _expr_kind kind; union { @@ -254,6 +254,10 @@ struct _expr { string s; } Str; + struct { + string s; + } Bytes; + struct { expr_ty value; identifier attr; @@ -465,6 +469,8 @@ expr_ty _Py_Call(expr_ty func, asdl_seq * args, asdl_seq * keywords, expr_ty expr_ty _Py_Num(object n, int lineno, int col_offset, PyArena *arena); #define Str(a0, a1, a2, a3) _Py_Str(a0, a1, a2, a3) expr_ty _Py_Str(string s, int lineno, int col_offset, PyArena *arena); +#define Bytes(a0, a1, a2, a3) _Py_Bytes(a0, a1, a2, a3) +expr_ty _Py_Bytes(string s, int lineno, int 
col_offset, PyArena *arena); #define Ellipsis(a0, a1, a2) _Py_Ellipsis(a0, a1, a2) expr_ty _Py_Ellipsis(int lineno, int col_offset, PyArena *arena); #define Attribute(a0, a1, a2, a3, a4, a5) _Py_Attribute(a0, a1, a2, a3, a4, a5) diff --git a/Include/opcode.h b/Include/opcode.h index 007816d8b31..316ba4f715b 100644 --- a/Include/opcode.h +++ b/Include/opcode.h @@ -72,7 +72,7 @@ extern "C" { #define LOAD_LOCALS 82 #define RETURN_VALUE 83 #define IMPORT_STAR 84 - +#define MAKE_BYTES 85 #define YIELD_VALUE 86 #define POP_BLOCK 87 #define END_FINALLY 88 diff --git a/Lib/compiler/ast.py b/Lib/compiler/ast.py index bc283c08bd3..4794d66da6b 100644 --- a/Lib/compiler/ast.py +++ b/Lib/compiler/ast.py @@ -267,6 +267,20 @@ def getChildNodes(self): def __repr__(self): return "Break()" +class Bytes(Node): + def __init__(self, value, lineno=None): + self.value = value + self.lineno = lineno + + def getChildren(self): + return self.value, + + def getChildNodes(self): + return () + + def __repr__(self): + return "Bytes(%s)" % (repr(self.value),) + class CallFunc(Node): def __init__(self, node, args, star_args = None, dstar_args = None, lineno=None): self.node = node diff --git a/Lib/compiler/pyassem.py b/Lib/compiler/pyassem.py index cac899d2396..f665c543b03 100644 --- a/Lib/compiler/pyassem.py +++ b/Lib/compiler/pyassem.py @@ -792,6 +792,7 @@ def findDepth(self, insts, debug=0): 'DELETE_ATTR': -1, 'STORE_GLOBAL': -1, 'BUILD_MAP': 1, + 'MAKE_BYTES': 0, 'COMPARE_OP': -1, 'STORE_FAST': -1, 'IMPORT_STAR': -1, diff --git a/Lib/compiler/pycodegen.py b/Lib/compiler/pycodegen.py index 8db4e0de725..83fbc173ca6 100644 --- a/Lib/compiler/pycodegen.py +++ b/Lib/compiler/pycodegen.py @@ -930,6 +930,10 @@ def visitDiscard(self, node): def visitConst(self, node): self.emit('LOAD_CONST', node.value) + + def visitBytes(self, node): + self.emit('LOAD_CONST', node.value) + self.emit('MAKE_BYTES') def visitKeyword(self, node): self.emit('LOAD_CONST', node.name) diff --git 
a/Lib/compiler/transformer.py b/Lib/compiler/transformer.py index 5f2face4abd..79b702ce30e 100644 --- a/Lib/compiler/transformer.py +++ b/Lib/compiler/transformer.py @@ -745,9 +745,11 @@ def decode_literal(self, lit): return eval(lit) def atom_string(self, nodelist): - k = '' - for node in nodelist: + k = self.decode_literal(nodelist[0][1]) + for node in nodelist[1:]: k += self.decode_literal(node[1]) + if isinstance(k, bytes): + return Bytes(str(k), lineno=nodelist[0][2]) return Const(k, lineno=nodelist[0][2]) def atom_ellipsis(self, nodelist): diff --git a/Lib/opcode.py b/Lib/opcode.py index 1e15582bfd6..69982f2b503 100644 --- a/Lib/opcode.py +++ b/Lib/opcode.py @@ -111,6 +111,7 @@ def jabs_op(name, op): def_op('LOAD_LOCALS', 82) def_op('RETURN_VALUE', 83) def_op('IMPORT_STAR', 84) +def_op('MAKE_BYTES', 85) def_op('YIELD_VALUE', 86) def_op('POP_BLOCK', 87) def_op('END_FINALLY', 88) diff --git a/Lib/test/test_bytes.py b/Lib/test/test_bytes.py index 997122b41fe..4dee01b7f20 100644 --- a/Lib/test/test_bytes.py +++ b/Lib/test/test_bytes.py @@ -403,7 +403,19 @@ def test_join(self): self.assertEqual(bytes.join(tuple(lst)), bytes("abc")) self.assertEqual(bytes.join(iter(lst)), bytes("abc")) # XXX more... - + + def test_literal(self): + tests = [ + (b"Wonderful spam", u"Wonderful spam"), + (br"Wonderful spam too", u"Wonderful spam too"), + (b"\xaa\x00\000\200", u"\xaa\x00\000\200"), + (br"\xaa\x00\000\200", ur"\xaa\x00\000\200"), + ] + for b, s in tests: + self.assertEqual(b, bytes(s, 'latin-1')) + for c in range(128, 256): + self.assertRaises(SyntaxError, eval, + 'b"%s"' % chr(c)) # Optimizations: # __iter__? 
(optimization) diff --git a/Lib/test/test_compiler.py b/Lib/test/test_compiler.py index ab9a66045af..bbd75119006 100644 --- a/Lib/test/test_compiler.py +++ b/Lib/test/test_compiler.py @@ -187,6 +187,30 @@ def testWithAss(self): exec(c, dct) self.assertEquals(dct.get('result'), 1) + def testBytesLiteral(self): + c = compiler.compile("b'foo'", '', 'eval') + b = eval(c) + + c = compiler.compile('def f(b=b"foo"):\n' + ' b[0] += 1\n' + ' return b\n' + 'f(); f(); result = f()\n', + '', + 'exec') + dct = {} + exec(c, dct) + self.assertEquals(dct.get('result'), b"ioo") + + c = compiler.compile('def f():\n' + ' b = b"foo"\n' + ' b[0] += 1\n' + ' return b\n' + 'f(); f(); result = f()\n', + '', + 'exec') + dct = {} + exec(c, dct) + self.assertEquals(dct.get('result'), b"goo") NOLINENO = (compiler.ast.Module, compiler.ast.Stmt, compiler.ast.Discard) diff --git a/Parser/Python.asdl b/Parser/Python.asdl index ea11349bc96..fd47aa0bc51 100644 --- a/Parser/Python.asdl +++ b/Parser/Python.asdl @@ -60,6 +60,7 @@ module Python version "$Revision$" expr? starargs, expr? kwargs) | Num(object n) -- a number as a PyObject. | Str(string s) -- need to specify raw, unicode, etc? + | Bytes(string s) | Ellipsis -- other literals? bools? diff --git a/Parser/tokenizer.c b/Parser/tokenizer.c index 84b7232cf97..84bd60eace1 100644 --- a/Parser/tokenizer.c +++ b/Parser/tokenizer.c @@ -1244,6 +1244,14 @@ tok_get(register struct tok_state *tok, char **p_start, char **p_end) if (c == '"' || c == '\'') goto letter_quote; break; + case 'b': + case 'B': + c = tok_nextc(tok); + if (c == 'r' || c == 'R') + c = tok_nextc(tok); + if (c == '"' || c == '\'') + goto letter_quote; + break; } while (isalnum(c) || c == '_') { c = tok_nextc(tok); diff --git a/Python/Python-ast.c b/Python/Python-ast.c index ae3a3960487..390ba157e83 100644 --- a/Python/Python-ast.c +++ b/Python/Python-ast.c @@ -2,7 +2,7 @@ /* - __version__ 53731. + __version__ 53866. 
This module must be committed separately after each AST grammar change; The __version__ number is set to the revision number of the commit @@ -216,6 +216,10 @@ static PyTypeObject *Str_type; static char *Str_fields[]={ "s", }; +static PyTypeObject *Bytes_type; +static char *Bytes_fields[]={ + "s", +}; static PyTypeObject *Ellipsis_type; static PyTypeObject *Attribute_type; static char *Attribute_fields[]={ @@ -547,6 +551,8 @@ static int init_types(void) if (!Num_type) return 0; Str_type = make_type("Str", expr_type, Str_fields, 1); if (!Str_type) return 0; + Bytes_type = make_type("Bytes", expr_type, Bytes_fields, 1); + if (!Bytes_type) return 0; Ellipsis_type = make_type("Ellipsis", expr_type, NULL, 0); if (!Ellipsis_type) return 0; Attribute_type = make_type("Attribute", expr_type, Attribute_fields, 3); @@ -1586,6 +1592,27 @@ Str(string s, int lineno, int col_offset, PyArena *arena) return p; } +expr_ty +Bytes(string s, int lineno, int col_offset, PyArena *arena) +{ + expr_ty p; + if (!s) { + PyErr_SetString(PyExc_ValueError, + "field s is required for Bytes"); + return NULL; + } + p = (expr_ty)PyArena_Malloc(arena, sizeof(*p)); + if (!p) { + PyErr_NoMemory(); + return NULL; + } + p->kind = Bytes_kind; + p->v.Bytes.s = s; + p->lineno = lineno; + p->col_offset = col_offset; + return p; +} + expr_ty Ellipsis(int lineno, int col_offset, PyArena *arena) { @@ -2550,6 +2577,15 @@ ast2obj_expr(void* _o) goto failed; Py_DECREF(value); break; + case Bytes_kind: + result = PyType_GenericNew(Bytes_type, NULL, NULL); + if (!result) goto failed; + value = ast2obj_string(o->v.Bytes.s); + if (!value) goto failed; + if (PyObject_SetAttrString(result, "s", value) == -1) + goto failed; + Py_DECREF(value); + break; case Ellipsis_kind: result = PyType_GenericNew(Ellipsis_type, NULL, NULL); if (!result) goto failed; @@ -3089,7 +3125,7 @@ init_ast(void) if (PyDict_SetItemString(d, "AST", (PyObject*)AST_type) < 0) return; if (PyModule_AddIntConstant(m, "PyCF_ONLY_AST", PyCF_ONLY_AST) < 
0) return; - if (PyModule_AddStringConstant(m, "__version__", "53731") < 0) + if (PyModule_AddStringConstant(m, "__version__", "53866") < 0) return; if (PyDict_SetItemString(d, "mod", (PyObject*)mod_type) < 0) return; if (PyDict_SetItemString(d, "Module", (PyObject*)Module_type) < 0) @@ -3155,6 +3191,7 @@ init_ast(void) if (PyDict_SetItemString(d, "Call", (PyObject*)Call_type) < 0) return; if (PyDict_SetItemString(d, "Num", (PyObject*)Num_type) < 0) return; if (PyDict_SetItemString(d, "Str", (PyObject*)Str_type) < 0) return; + if (PyDict_SetItemString(d, "Bytes", (PyObject*)Bytes_type) < 0) return; if (PyDict_SetItemString(d, "Ellipsis", (PyObject*)Ellipsis_type) < 0) return; if (PyDict_SetItemString(d, "Attribute", (PyObject*)Attribute_type) < diff --git a/Python/ast.c b/Python/ast.c index a7d5713169d..9d5caf87e4d 100644 --- a/Python/ast.c +++ b/Python/ast.c @@ -33,8 +33,9 @@ static expr_ty ast_for_testlist_gexp(struct compiling *, const node *); static expr_ty ast_for_call(struct compiling *, const node *, expr_ty); static PyObject *parsenumber(const char *); -static PyObject *parsestr(const char *s, const char *encoding); -static PyObject *parsestrplus(struct compiling *, const node *n); +static PyObject *parsestr(const node *n, const char *encoding, int *bytesmode); +static PyObject *parsestrplus(struct compiling *, const node *n, + int *bytesmode); #ifndef LINENO #define LINENO(n) ((n)->n_lineno) @@ -1383,6 +1384,7 @@ ast_for_atom(struct compiling *c, const node *n) | '{' [dictsetmaker] '}' | NAME | NUMBER | STRING+ */ node *ch = CHILD(n, 0); + int bytesmode = 0; switch (TYPE(ch)) { case NAME: @@ -1390,12 +1392,15 @@ ast_for_atom(struct compiling *c, const node *n) changed. 
*/ return Name(NEW_IDENTIFIER(ch), Load, LINENO(n), n->n_col_offset, c->c_arena); case STRING: { - PyObject *str = parsestrplus(c, n); + PyObject *str = parsestrplus(c, n, &bytesmode); if (!str) return NULL; PyArena_AddPyObject(c->c_arena, str); - return Str(str, LINENO(n), n->n_col_offset, c->c_arena); + if (bytesmode) + return Bytes(str, LINENO(n), n->n_col_offset, c->c_arena); + else + return Str(str, LINENO(n), n->n_col_offset, c->c_arena); } case NUMBER: { PyObject *pynum = parsenumber(STR(ch)); @@ -3254,9 +3259,10 @@ decode_unicode(const char *s, size_t len, int rawmode, const char *encoding) * parsestr parses it, and returns the decoded Python string object. */ static PyObject * -parsestr(const char *s, const char *encoding) +parsestr(const node *n, const char *encoding, int *bytesmode) { size_t len; + const char *s = STR(n); int quote = Py_CHARMASK(*s); int rawmode = 0; int need_encoding; @@ -3267,6 +3273,10 @@ parsestr(const char *s, const char *encoding) quote = *++s; unicode = 1; } + if (quote == 'b' || quote == 'B') { + quote = *++s; + *bytesmode = 1; + } if (quote == 'r' || quote == 'R') { quote = *++s; rawmode = 1; @@ -3276,6 +3286,10 @@ parsestr(const char *s, const char *encoding) PyErr_BadInternalCall(); return NULL; } + if (unicode && *bytesmode) { + ast_error(n, "string cannot be both bytes and unicode"); + return NULL; + } s++; len = strlen(s); if (len > INT_MAX) { @@ -3300,7 +3314,18 @@ parsestr(const char *s, const char *encoding) return decode_unicode(s, len, rawmode, encoding); } #endif - need_encoding = (encoding != NULL && + if (*bytesmode) { + /* Disallow non-ascii characters (but not escapes) */ + const char *c; + for (c = s; *c; c++) { + if (Py_CHARMASK(*c) >= 0x80) { + ast_error(n, "bytes can only contain ASCII " + "literal characters."); + return NULL; + } + } + } + need_encoding = (!*bytesmode && encoding != NULL && strcmp(encoding, "utf-8") != 0 && strcmp(encoding, "iso-8859-1") != 0); if (rawmode || strchr(s, '\\') == NULL) { @@ 
-3332,18 +3357,25 @@ parsestr(const char *s, const char *encoding) * pasting the intermediate results together. */ static PyObject * -parsestrplus(struct compiling *c, const node *n) +parsestrplus(struct compiling *c, const node *n, int *bytesmode) { PyObject *v; int i; REQ(CHILD(n, 0), STRING); - if ((v = parsestr(STR(CHILD(n, 0)), c->c_encoding)) != NULL) { + v = parsestr(CHILD(n, 0), c->c_encoding, bytesmode); + if (v != NULL) { /* String literal concatenation */ for (i = 1; i < NCH(n); i++) { PyObject *s; - s = parsestr(STR(CHILD(n, i)), c->c_encoding); + int subbm = 0; + s = parsestr(CHILD(n, i), c->c_encoding, &subbm); if (s == NULL) goto onError; + if (*bytesmode != subbm) { + ast_error(n, "cannot mix bytes and nonbytes " + "literals"); + goto onError; + } if (PyString_Check(v) && PyString_Check(s)) { PyString_ConcatAndDel(&v, s); if (v == NULL) diff --git a/Python/ceval.c b/Python/ceval.c index 0194687e22e..5ceb743a80d 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1885,6 +1885,19 @@ PyEval_EvalFrameEx(PyFrameObject *f, int throwflag) PUSH(x); if (x != NULL) continue; break; + + case MAKE_BYTES: + w = POP(); + if (PyString_Check(w)) + x = PyBytes_FromStringAndSize( + PyString_AS_STRING(w), + PyString_GET_SIZE(w)); + else + x = NULL; + Py_DECREF(w); + PUSH(x); + if (x != NULL) continue; + break; case LOAD_ATTR: w = GETITEM(names, oparg); diff --git a/Python/compile.c b/Python/compile.c index 927569a9287..9655765f4c6 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -789,6 +789,8 @@ opcode_stack_effect(int opcode, int oparg) return 1-oparg; case BUILD_MAP: return 1; + case MAKE_BYTES: + return 0; case LOAD_ATTR: return 0; case COMPARE_OP: @@ -3077,6 +3079,10 @@ compiler_visit_expr(struct compiler *c, expr_ty e) case Str_kind: ADDOP_O(c, LOAD_CONST, e->v.Str.s, consts); break; + case Bytes_kind: + ADDOP_O(c, LOAD_CONST, e->v.Bytes.s, consts); + ADDOP(c, MAKE_BYTES); + break; case Ellipsis_kind: ADDOP_O(c, LOAD_CONST, Py_Ellipsis, consts); break; @@ 
-3426,7 +3432,6 @@ compiler_visit_slice(struct compiler *c, slice_ty s, expr_context_ty ctx) return compiler_handle_subscr(c, kindname, ctx); } - /* End of the compiler section, beginning of the assembler section */ /* do depth-first search of basic block graph, starting with block.