Uh oh!
There was an error while loading. Please reload this page.
- Notifications
You must be signed in to change notification settings - Fork 34k
gh-112354: Add executor for less-taken branch#112902
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Changes from all commits
36feeb1d12533bf21f2d884639656403752329deadf1998c0b0944e626b5f89649581c835bf13256b15675c7c32a94c7f1747a3f0682cf5a359c6fcca6ed3ae2a26b538c7aab0f6423183297dfd065a940f71a03075ab91c54daef934a11552c49eb8f5e62310b98f11450ca6dcde4d315df63fc786418ee0734b655a84132e36faf5b317a4804a3c46c7d26b991279
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading. Please reload this page.
Jump to
Uh oh!
There was an error while loading. Please reload this page.
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| In the Tier 2 interpreter, add side exits to sub-executors for certain | ||
| micro-opcodes (currently only conditional branches). |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -755,6 +755,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | ||
| next_instr = frame->instr_ptr; | ||
| gvanrossum marked this conversation as resolved. Show resolved · Hide resolved. Uh oh! There was an error while loading. Please reload this page. | ||
| resume_frame: | ||
| stack_pointer = _PyFrame_GetStackPointer(frame); | ||
| resume_frame_using_stack_pointer: | ||
| #ifdef LLTRACE | ||
| lltrace = maybe_lltrace_resume_frame(frame, &entry_frame, GLOBALS()); | ||
| @@ -1063,17 +1064,123 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | ||
| // Jump here from DEOPT_IF() | ||
| deoptimize: | ||
| next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); | ||
| frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); | ||
| DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d @ %d -> %s]\n", | ||
| uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, next_uop[-1].target, | ||
| (int)(next_uop - current_executor->trace - 1), | ||
| _PyOpcode_OpName[frame->instr_ptr->op.code]); | ||
| OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); | ||
| UOP_STAT_INC(uopcode, miss); | ||
| Py_DECREF(current_executor); | ||
| DISPATCH(); | ||
| frame->return_offset = 0; // Don't leave this random | ||
| // Check if there is a side-exit executor here already. | ||
| int pc = (int)(next_uop - 1 - current_executor->trace); | ||
| _PyExecutorObject **pexecutor = current_executor->executors + pc; | ||
| if (*pexecutor != NULL){ | ||
| #ifdef Py_DEBUG | ||
| PyCodeObject *code = _PyFrame_GetCode(frame); | ||
| DPRINTF(2, "Jumping to new executor for %s (%s:%d) at byte offset %d\n", | ||
| PyUnicode_AsUTF8(code->co_qualname), | ||
| PyUnicode_AsUTF8(code->co_filename), | ||
| code->co_firstlineno, | ||
| 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); | ||
| #endif | ||
| _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(*pexecutor); | ||
| Py_DECREF(current_executor); | ||
| current_executor = new_executor; | ||
| goto enter_tier_two; | ||
| } | ||
| // Increment and check side exit counter. | ||
| // (Even though we only need it for certain opcodes.) | ||
| next_instr = frame->instr_ptr; | ||
| uint16_t *pcounter = current_executor->counters + pc; | ||
| *pcounter += 1 << OPTIMIZER_BITS_IN_COUNTER; | ||
| /* We are using unsigned values, but we really want signed values, so | ||
| * do the 2s complement comparison manually */ | ||
| uint16_t ucounter = *pcounter + (1 << 15); | ||
| uint16_t threshold = tstate->interp->optimizer_resume_threshold + (1 << 15); | ||
| if (ucounter <= threshold) | ||
| { | ||
| Py_DECREF(current_executor); | ||
| goto resume_frame_using_stack_pointer; | ||
| } | ||
| // Decode instruction to look past EXTENDED_ARG. | ||
| opcode = next_instr[0].op.code; | ||
| if (opcode == EXTENDED_ARG){ | ||
| opcode = next_instr[1].op.code; | ||
| } | ||
| // For selected opcodes build a new executor and enter it now. | ||
Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why "selected opcodes", why not everywhere? MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In an earlier version that somehow didn't work. Right now the check whether the new trace isn't going to immediately deopt again relies on these opcodes. I figured once we have the side exit machinery working we could gradually increase the scope to other deoptimizations. Also, not all deoptimizations are worthy of the effort (e.g. the PEP 523 test). Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No special cases, please, it just makes the code more complicated and slower. MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are several reasons. First, as I explain below, for bytecodes other than branches, I can't promise an exact check for whether the newly created sub-executor doesn't just repeat the same deoptimizing uop that triggered its creation (in which case the sub-executor would always deopt immediately if it is entered at all). Second, for most bytecodes other than branches, deoptimization paths are relatively rare (IIRC this is apparent from the pystats data -- with the exception of some). For branches, we expect many cases where the "common" path is not much more common than the "uncommon" path (e.g. 60/40 or 70/30). Now, it might make sense to have a different special case here, where if e.g. […]. I propose this PR as a starting point for further iterations, not as the ultimate design for side-exits. Let's discuss this Monday. | ||
| if (opcode == POP_JUMP_IF_FALSE || | ||
| opcode == POP_JUMP_IF_TRUE || | ||
| opcode == POP_JUMP_IF_NONE || | ||
| opcode == POP_JUMP_IF_NOT_NONE) | ||
| { | ||
| DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", | ||
| _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); | ||
| DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]); | ||
| _PyExecutorObject *tmp_executor = NULL; | ||
| int optimized = _PyOptimizer_Unanchored(frame, next_instr, &tmp_executor, stack_pointer); | ||
| if (optimized < 0){ | ||
| goto error_tier_two; | ||
| } | ||
| if (!optimized){ | ||
| DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", | ||
| _PyUOpName(uopcode), pc, current_executor); | ||
| } | ||
| else{ | ||
| #ifdef Py_DEBUG | ||
| DPRINTF(1, "--> Optimized %s @ %d in %p\n", | ||
| _PyUOpName(uopcode), pc, current_executor); | ||
| PyCodeObject *code = _PyFrame_GetCode(frame); | ||
| DPRINTF(2, "Jumping to fresh executor for %s (%s:%d) at byte offset %d\n", | ||
| PyUnicode_AsUTF8(code->co_qualname), | ||
| PyUnicode_AsUTF8(code->co_filename), | ||
| code->co_firstlineno, | ||
| 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); | ||
| #endif | ||
| _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(tmp_executor); | ||
| // Reject trace if it repeats the uop that just deoptimized. | ||
Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why? MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test may be a bit imprecise(*), but it tries to discard the case where, even though the counter in the executor indicated that this side exit is "hot", the Tier 1 bytecode hasn't been re-specialized yet. In that case the trace projection will just repeat the uop that just took a deopt side exit, causing it to immediately deopt again. This seems a waste of time and executors -- eventually the sub-executor's deopt counter will also indicate it is hot, and then we'll try again, but it seems better (if we can catch it) to avoid creating the sub-executor in the first place, relying on exponential backoff for the side-exit counter instead (implemented below at L1180 and ff.). For various reasons, the side-exit counters and the Tier 1 deopt counters don't run in sync, so it's possible that the side-exit counter triggers before the Tier 1 counter has re-specialized. This check gives that another chance. The test that I would like to use here would be to check if the Tier 1 opcode is still unchanged (i.e., not re-specialized), but the executor doesn't record that information (and it would take up a lot of space, we'd need an extra byte for each uop that can deoptimize at least). (*) The test I wrote is exact for the conditional branches I special-cased above (that's why there's a further special case here for `_IS_NONE`). | ||
| int jump_opcode = new_executor->trace[0].opcode; | ||
| if (jump_opcode == _IS_NONE){ | ||
| jump_opcode = new_executor->trace[1].opcode; | ||
| } | ||
| if (jump_opcode != uopcode){ | ||
| *pexecutor = tmp_executor; | ||
| *pcounter &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); | ||
| Py_DECREF(current_executor); | ||
| current_executor = new_executor; | ||
| goto enter_tier_two; // All systems go! | ||
| } | ||
| // The trace is guaranteed to deopt again; forget about it. | ||
Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it? Why? MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See explanation above. | ||
| DPRINTF(2, "Alas, it's the same uop again (%s) -- discarding trace\n", | ||
| _PyUOpName(jump_opcode)); | ||
| Py_DECREF(tmp_executor); | ||
| Py_DECREF(new_executor); | ||
| } | ||
| } | ||
| // Exponential backoff if we didn't optimize. | ||
| int backoff = *pcounter & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); | ||
| if (backoff < MINIMUM_TIER2_BACKOFF){ | ||
| backoff = MINIMUM_TIER2_BACKOFF; | ||
| } | ||
| else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER){ | ||
| backoff++; | ||
| } | ||
| assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER); | ||
| *pcounter = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff; | ||
| Py_DECREF(current_executor); | ||
| goto resume_frame_using_stack_pointer; | ||
| } | ||
| #if defined(__GNUC__) | ||
| # pragma GCC diagnostic pop | ||
| #elif defined(_MSC_VER) /* MS_WINDOWS */ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -2353,6 +2353,7 @@ int | ||
| void | ||
| _Py_Specialize_ForIter(PyObject *iter, _Py_CODEUNIT *instr, int oparg) | ||
| { | ||
| assert(_PyOpcode_Deopt[instr->op.code] == FOR_ITER); | ||
MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should really add such asserts to many specialization functions; I ran into this one during an intense debugging session. Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The assert can be […]. MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I tried that and I get a Bus error. And of course it's not supposed to be called with something else! But a logic error in my early prototype caused that to happen, and it took me quite a while to track it down. | ||
| assert(ENABLE_SPECIALIZATION); | ||
| assert(_PyOpcode_Caches[FOR_ITER] == INLINE_CACHE_ENTRIES_FOR_ITER); | ||
| _PyForIterCache *cache = (_PyForIterCache *)(instr + 1); | ||
Uh oh!
There was an error while loading. Please reload this page.