From f606f87b31186e7614cbae8e5b8ef05700f6267e Mon Sep 17 00:00:00 2001
From: Raymond Hettinger <python@rcn.com>
Date: Sun, 16 Mar 2003 03:11:04 +0000
Subject: [PATCH] Introduced macros for a simple opcode prediction protocol.

Applied to common cases:
    COMPARE_OP is often followed by a JUMP_IF.
    JUMP_IF is usually followed by POP_TOP.

Shows improved timings on PyStone, PyBench, and specific tests
using timeit.py:
    python timeit.py -s "x=1" "if x==1: pass"
    python timeit.py -s "x=1" "if x==2: pass"
    python timeit.py -s "x=1" "if x: pass"
    python timeit.py -s "x=100" "while x!=1: x-=1"

Potential future candidates:
    GET_ITER predicts FOR_ITER
    FOR_ITER predicts STORE_FAST or UNPACK_SEQUENCE

Also, applied missing goto fast_next_opcode to DUP_TOPX.
---
 Python/ceval.c | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/Python/ceval.c b/Python/ceval.c
index 324008df0e6..384bfbe48ed 100644
--- a/Python/ceval.c
+++ b/Python/ceval.c
@@ -602,6 +602,26 @@ eval_frame(PyFrameObject *f)
 #define JUMPTO(x)	(next_instr = first_instr + (x))
 #define JUMPBY(x)	(next_instr += (x))
 
+/* OpCode prediction macros
+	Some opcodes tend to come in pairs, thus making it possible to predict
+	the second opcode when the first is run.  For example, COMPARE_OP is often
+	followed by JUMP_IF_FALSE or JUMP_IF_TRUE.  And, those opcodes are often
+	followed by a POP_TOP.
+
+	Verifying the prediction costs a single high-speed test of register
+	variable against a constant.  If the pairing was good, then the
+	processor has a high likelihood of making its own successful branch
+	prediction which results in a nearly zero overhead transition to the
+	next opcode.
+
+	A successful prediction saves a trip through the eval-loop including
+	its two unpredictable branches, the HASARG test and the switch-case.
+*/
+
+#define PREDICT(op)	if (*next_instr == op) goto PRED_##op
+#define PREDICTED(op)		PRED_##op: next_instr++
+#define PREDICTED_WITH_ARG(op)	PRED_##op: oparg = (next_instr += 3, (next_instr[-1]<<8) + next_instr[-2])
+
 /* Stack manipulation macros */
 
 #define STACK_LEVEL()	(stack_pointer - f->f_valuestack)
@@ -873,6 +893,7 @@ eval_frame(PyFrameObject *f)
 			SETLOCAL(oparg, v);
 			goto fast_next_opcode;
 
+		PREDICTED(POP_TOP);
 		case POP_TOP:
 			v = POP();
 			Py_DECREF(v);
@@ -920,7 +941,7 @@ eval_frame(PyFrameObject *f)
 				STACKADJ(2);
 				SET_TOP(x);
 				SET_SECOND(w);
-				continue;
+				goto fast_next_opcode;
 			} else if (oparg == 3) {
 				x = TOP();
 				Py_INCREF(x);
@@ -932,7 +953,7 @@ eval_frame(PyFrameObject *f)
 				SET_TOP(x);
 				SET_SECOND(w);
 				SET_THIRD(v);
-				continue;
+				goto fast_next_opcode;
 			}
 			Py_FatalError("invalid argument to DUP_TOPX"
 				      " (bytecode corruption?)");
@@ -1918,8 +1939,10 @@ eval_frame(PyFrameObject *f)
 			Py_DECREF(v);
 			Py_DECREF(w);
 			SET_TOP(x);
-			if (x != NULL) continue;
-			break;
+			if (x == NULL) break;
+			PREDICT(JUMP_IF_FALSE);
+			PREDICT(JUMP_IF_TRUE);
+			continue;
 
 		case IMPORT_NAME:
 			w = GETITEM(names, oparg);
@@ -1974,10 +1997,13 @@ eval_frame(PyFrameObject *f)
 			JUMPBY(oparg);
 			goto fast_next_opcode;
 
+		PREDICTED_WITH_ARG(JUMP_IF_FALSE);
 		case JUMP_IF_FALSE:
 			w = TOP();
-			if (w == Py_True)
+			if (w == Py_True) {
+				PREDICT(POP_TOP);
 				goto fast_next_opcode;
+			}
 			if (w == Py_False) {
 				JUMPBY(oparg);
 				goto fast_next_opcode;
@@ -1991,10 +2017,13 @@ eval_frame(PyFrameObject *f)
 				break;
 			continue;
 
+		PREDICTED_WITH_ARG(JUMP_IF_TRUE);
 		case JUMP_IF_TRUE:
 			w = TOP();
-			if (w == Py_False)
+			if (w == Py_False) {
+				PREDICT(POP_TOP);
 				goto fast_next_opcode;
+			}
 			if (w == Py_True) {
 				JUMPBY(oparg);
 				goto fast_next_opcode;