Uh oh!
There was an error while loading. Please reload this page.
- Notifications
You must be signed in to change notification settings - Fork 34k
gh-112354: Add executor for less-taken branch#112902
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Changes from all commits
36feeb1d12533bf21f2d884639656403752329deadf1998c0b0944e626b5f89649581c835bf13256b15675c7c32a94c7f1747a3f0682cf5a359c6fcca6ed3ae2a26b538c7aab0f6423183297dfd065a940f71a03075ab91c54daef934a11552c49eb8f5e62310b98f11450ca6dcde4d315df63fc786418ee0734b655a84132e36faf5b317a4804a3c46c7d26b991279
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading. Please reload this page.
Jump to
Uh oh!
There was an error while loading. Please reload this page.
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| In the Tier 2 interpreter, add side exits to sub-executors for certain | ||
| micro-opcodes (currently only conditional branches). |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -755,6 +755,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | ||
| next_instr = frame->instr_ptr; | ||
| gvanrossum marked this conversation as resolved. Show resolved · Hide resolved. Uh oh! There was an error while loading. Please reload this page. | ||
| resume_frame: | ||
| stack_pointer = _PyFrame_GetStackPointer(frame); | ||
| resume_frame_using_stack_pointer: | ||
| #ifdef LLTRACE | ||
| lltrace = maybe_lltrace_resume_frame(frame, &entry_frame, GLOBALS()); | ||
| @@ -1063,17 +1064,123 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int | ||
| // Jump here from DEOPT_IF() | ||
| deoptimize: | ||
| next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); | ||
| frame->instr_ptr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); | ||
| DPRINTF(2, "DEOPT: [UOp %d (%s), oparg %d, operand %" PRIu64 ", target %d @ %d -> %s]\n", | ||
| uopcode, _PyUOpName(uopcode), next_uop[-1].oparg, next_uop[-1].operand, next_uop[-1].target, | ||
| (int)(next_uop - current_executor->trace - 1), | ||
| _PyOpcode_OpName[frame->instr_ptr->op.code]); | ||
| OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); | ||
| UOP_STAT_INC(uopcode, miss); | ||
| Py_DECREF(current_executor); | ||
| DISPATCH(); | ||
| frame->return_offset = 0; // Don't leave this random | ||
| // Check if there is a side-exit executor here already. | ||
| int pc = (int)(next_uop - 1 - current_executor->trace); | ||
| _PyExecutorObject **pexecutor = current_executor->executors + pc; | ||
| if (*pexecutor != NULL){ | ||
| #ifdef Py_DEBUG | ||
| PyCodeObject *code = _PyFrame_GetCode(frame); | ||
| DPRINTF(2, "Jumping to new executor for %s (%s:%d) at byte offset %d\n", | ||
| PyUnicode_AsUTF8(code->co_qualname), | ||
| PyUnicode_AsUTF8(code->co_filename), | ||
| code->co_firstlineno, | ||
| 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); | ||
| #endif | ||
| _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(*pexecutor); | ||
| Py_DECREF(current_executor); | ||
| current_executor = new_executor; | ||
| goto enter_tier_two; | ||
| } | ||
| // Increment and check side exit counter. | ||
| // (Even though we only need it for certain opcodes.) | ||
| next_instr = frame->instr_ptr; | ||
| uint16_t *pcounter = current_executor->counters + pc; | ||
| *pcounter += 1 << OPTIMIZER_BITS_IN_COUNTER; | ||
| /* We are using unsigned values, but we really want signed values, so | ||
| * do the 2s complement comparison manually */ | ||
| uint16_t ucounter = *pcounter + (1 << 15); | ||
| uint16_t threshold = tstate->interp->optimizer_resume_threshold + (1 << 15); | ||
| if (ucounter <= threshold) | ||
| { | ||
| Py_DECREF(current_executor); | ||
| goto resume_frame_using_stack_pointer; | ||
| } | ||
| // Decode instruction to look past EXTENDED_ARG. | ||
| opcode = next_instr[0].op.code; | ||
| if (opcode == EXTENDED_ARG){ | ||
| opcode = next_instr[1].op.code; | ||
| } | ||
| // For selected opcodes build a new executor and enter it now. | ||
Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why "selected opcodes", why not everywhere? MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In an earlier version that somehow didn't work. Right now the check whether the new trace isn't going to immediately deopt again relies on these opcodes. I figured once we have the side exit machinery working we could gradually increase the scope to other deoptimizations. Also, not all deoptimizations are worthy of the effort (e.g. the PEP 523 test). Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No special cases, please, it just makes the code more complicated and slower. MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are several reasons. First, as I explain below, for bytecodes other than branches, I can't promise an exact check for whether the newly created sub-executor doesn't just repeat the same deoptimizing uop that triggered its creation (in which case the sub-executor would always deopt immediately if it is entered at all). Second, for most bytecodes other than branches, deoptimization paths are relatively rare (IIRC this is apparent from the pystats data -- with the exception of some). For branches, we expect many cases where the "common" path is not much more common than the "uncommon" path (e.g. 60/40 or 70/30). Now, it might make sense to have a different special case here, where if e.g. […]. I propose this PR as a starting point for further iterations, not as the ultimate design for side-exits. Let's discuss this Monday. | ||
| if (opcode == POP_JUMP_IF_FALSE || | ||
| opcode == POP_JUMP_IF_TRUE || | ||
| opcode == POP_JUMP_IF_NONE || | ||
| opcode == POP_JUMP_IF_NOT_NONE) | ||
| { | ||
| DPRINTF(2, "--> %s @ %d in %p has %d side exits\n", | ||
| _PyUOpName(uopcode), pc, current_executor, (int)(*pcounter)); | ||
| DPRINTF(2, " T1: %s\n", _PyOpcode_OpName[opcode]); | ||
| _PyExecutorObject *tmp_executor = NULL; | ||
| int optimized = _PyOptimizer_Unanchored(frame, next_instr, &tmp_executor, stack_pointer); | ||
| if (optimized < 0){ | ||
| goto error_tier_two; | ||
| } | ||
| if (!optimized){ | ||
| DPRINTF(2, "--> Failed to optimize %s @ %d in %p\n", | ||
| _PyUOpName(uopcode), pc, current_executor); | ||
| } | ||
| else{ | ||
| #ifdef Py_DEBUG | ||
| DPRINTF(1, "--> Optimized %s @ %d in %p\n", | ||
| _PyUOpName(uopcode), pc, current_executor); | ||
| PyCodeObject *code = _PyFrame_GetCode(frame); | ||
| DPRINTF(2, "Jumping to fresh executor for %s (%s:%d) at byte offset %d\n", | ||
| PyUnicode_AsUTF8(code->co_qualname), | ||
| PyUnicode_AsUTF8(code->co_filename), | ||
| code->co_firstlineno, | ||
| 2 * (int)(frame->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(frame)))); | ||
| #endif | ||
| _PyUOpExecutorObject *new_executor = (_PyUOpExecutorObject *)Py_NewRef(tmp_executor); | ||
| // Reject trace if it repeats the uop that just deoptimized. | ||
Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why? MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test may be a bit imprecise(*), but it tries to discard the case where, even though the counter in the executor indicated that this side exit is "hot", the Tier 1 bytecode hasn't been re-specialized yet. In that case the trace projection will just repeat the uop that just took a deopt side exit, causing it to immediately deopt again. This seems a waste of time and executors -- eventually the sub-executor's deopt counter will also indicate it is hot, and then we'll try again, but it seems better (if we can catch it) to avoid creating the sub-executor in the first place, relying on exponential backoff for the side-exit counter instead (implemented below at L1180 and ff.). For various reasons, the side-exit counters and the Tier 1 deopt counters don't run in sync, so it's possible that the side-exit counter triggers before the Tier 1 counter has re-specialized. This check gives that another chance. The test that I would like to use here would be to check if the Tier 1 opcode is still unchanged (i.e., not re-specialized), but the executor doesn't record that information (and it would take up a lot of space, we'd need an extra byte for each uop that can deoptimize at least). (*) The test I wrote is exact for the conditional branches I special-cased above (that's why there's a further special case here for `_IS_NONE`). | ||
| int jump_opcode = new_executor->trace[0].opcode; | ||
| if (jump_opcode == _IS_NONE){ | ||
| jump_opcode = new_executor->trace[1].opcode; | ||
| } | ||
| if (jump_opcode != uopcode){ | ||
| *pexecutor = tmp_executor; | ||
| *pcounter &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); | ||
| Py_DECREF(current_executor); | ||
| current_executor = new_executor; | ||
| goto enter_tier_two; // All systems go! | ||
| } | ||
| // The trace is guaranteed to deopt again; forget about it. | ||
Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it? Why? MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. See explanation above. | ||
| DPRINTF(2, "Alas, it's the same uop again (%s) -- discarding trace\n", | ||
| _PyUOpName(jump_opcode)); | ||
| Py_DECREF(tmp_executor); | ||
| Py_DECREF(new_executor); | ||
| } | ||
| } | ||
| // Exponential backoff if we didn't optimize. | ||
| int backoff = *pcounter & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); | ||
| if (backoff < MINIMUM_TIER2_BACKOFF){ | ||
| backoff = MINIMUM_TIER2_BACKOFF; | ||
| } | ||
| else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER){ | ||
| backoff++; | ||
| } | ||
| assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER); | ||
| *pcounter = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff; | ||
| Py_DECREF(current_executor); | ||
| goto resume_frame_using_stack_pointer; | ||
| } | ||
| #if defined(__GNUC__) | ||
| # pragma GCC diagnostic pop | ||
| #elif defined(_MSC_VER) /* MS_WINDOWS */ | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -2353,6 +2353,7 @@ int | ||
| void | ||
| _Py_Specialize_ForIter(PyObject *iter, _Py_CODEUNIT *instr, int oparg) | ||
| { | ||
| assert(_PyOpcode_Deopt[instr->op.code] == FOR_ITER); | ||
MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should really add such asserts to many specialization functions; I ran into this one during an intense debugging session. Member There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The assert can be […]. MemberAuthor There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I tried that and I get a Bus error. And of course it's not supposed to be called with something else! But a logic error in my early prototype caused that to happen, and it took me quite a while to track it down. | ||
| assert(ENABLE_SPECIALIZATION); | ||
| assert(_PyOpcode_Caches[FOR_ITER] == INLINE_CACHE_ENTRIES_FOR_ITER); | ||
| _PyForIterCache *cache = (_PyForIterCache *)(instr + 1); | ||
Uh oh!
There was an error while loading. Please reload this page.