diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index 0592221f15226e..9835585cd1a60b 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -16,12 +16,102 @@ extern "C" {

 #include

+typedef struct _PyJitUopBuffer {
+    _PyUOpInstruction *start;
+    _PyUOpInstruction *next;
+    _PyUOpInstruction *end;
+} _PyJitUopBuffer;
+
+
+typedef struct _JitOptContext {
+    char done;
+    char out_of_space;
+    bool contradiction;
+    // Has the builtins dict been watched?
+    bool builtins_watched;
+    // The current "executing" frame.
+    _Py_UOpsAbstractFrame *frame;
+    _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH];
+    int curr_frame_depth;
+
+    // Arena for the symbolic types.
+    ty_arena t_arena;
+
+    JitOptRef *n_consumed;
+    JitOptRef *limit;
+    JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE];
+    _PyJitUopBuffer out_buffer;
+} JitOptContext;
+
+
+static inline void
+uop_buffer_init(_PyJitUopBuffer *trace, _PyUOpInstruction *start, uint32_t size)
+{
+    trace->next = trace->start = start;
+    trace->end = start + size;
+}
+
+static inline _PyUOpInstruction *
+uop_buffer_last(_PyJitUopBuffer *trace)
+{
+    assert(trace->next > trace->start);
+    return trace->next-1;
+}
+
+static inline int
+uop_buffer_length(_PyJitUopBuffer *trace)
+{
+    return (int)(trace->next - trace->start);
+}
+
+static inline int
+uop_buffer_remaining_space(_PyJitUopBuffer *trace)
+{
+    return (int)(trace->end - trace->next);
+}
+
+typedef struct _PyJitTracerInitialState {
+    int stack_depth;
+    int chain_depth;
+    struct _PyExitData *exit;
+    PyCodeObject *code;  // Strong
+    PyFunctionObject *func;  // Strong
+    struct _PyExecutorObject *executor;  // Strong
+    _Py_CODEUNIT *start_instr;
+    _Py_CODEUNIT *close_loop_instr;
+    _Py_CODEUNIT *jump_backward_instr;
+} _PyJitTracerInitialState;
+
+typedef struct _PyJitTracerPreviousState {
+    bool dependencies_still_valid;
+    int instr_oparg;
+    int instr_stacklevel;
+    _Py_CODEUNIT *instr;
+    PyCodeObject *instr_code;  // Strong
+    struct _PyInterpreterFrame *instr_frame;
+    _PyBloomFilter dependencies;
+} _PyJitTracerPreviousState;
+
+typedef struct _PyJitTracerTranslatorState {
+    int jump_backward_seen;
+} _PyJitTracerTranslatorState;
+
+typedef struct _PyJitTracerState {
+    bool is_tracing;
+    _PyJitTracerInitialState initial_state;
+    _PyJitTracerPreviousState prev_state;
+    _PyJitTracerTranslatorState translator_state;
+    JitOptContext opt_context;
+    _PyJitUopBuffer code_buffer;
+    _PyJitUopBuffer out_buffer;
+    _PyUOpInstruction uop_array[2 * UOP_MAX_TRACE_LENGTH];
+} _PyJitTracerState;
+
 typedef struct _PyExecutorLinkListNode {
     struct _PyExecutorObject *next;
     struct _PyExecutorObject *previous;
 } _PyExecutorLinkListNode;

-
 typedef struct {
     uint8_t opcode;
     uint8_t oparg;
@@ -86,8 +176,8 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);

 int _Py_uop_analyze_and_optimize(
     _PyThreadStateImpl *tstate,
-    _PyUOpInstruction *trace, int trace_len, int curr_stackentries,
-    _PyBloomFilter *dependencies);
+    _PyUOpInstruction *input, int trace_len, int curr_stackentries,
+    _PyUOpInstruction *output, _PyBloomFilter *dependencies);

 extern PyTypeObject _PyUOpExecutor_Type;

diff --git a/Include/internal/pycore_optimizer_types.h b/Include/internal/pycore_optimizer_types.h
index 6501ce869c1425..076c70c20eb712 100644
--- a/Include/internal/pycore_optimizer_types.h
+++ b/Include/internal/pycore_optimizer_types.h
@@ -112,27 +112,6 @@ typedef struct ty_arena {
     JitOptSymbol arena[TY_ARENA_SIZE];
 } ty_arena;

-typedef struct _JitOptContext {
-    char done;
-    char out_of_space;
-    bool contradiction;
-    // Has the builtins dict been watched?
-    bool builtins_watched;
-    // The current "executing" frame.
-    _Py_UOpsAbstractFrame *frame;
-    _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH];
-    int curr_frame_depth;
-
-    // Arena for the symbolic types.
-    ty_arena t_arena;
-
-    JitOptRef *n_consumed;
-    JitOptRef *limit;
-    JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE];
-    _PyUOpInstruction *out_buffer;
-    int out_len;
-} JitOptContext;
-
 #ifdef __cplusplus
 }

diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 24a40416c2191b..64b90710b8e664 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -12,7 +12,6 @@ extern "C" {
 #include "pycore_freelist_state.h"      // struct _Py_freelists
 #include "pycore_interpframe_structs.h" // _PyInterpreterFrame
 #include "pycore_mimalloc.h"            // struct _mimalloc_thread_state
-#include "pycore_optimizer_types.h"     // JitOptContext
 #include "pycore_qsbr.h"                // struct qsbr
 #include "pycore_uop.h"                 // struct _PyUOpInstruction
 #include "pycore_structs.h"
@@ -24,46 +23,6 @@ struct _gc_thread_state {
 };
 #endif

-#if _Py_TIER2
-typedef struct _PyJitTracerInitialState {
-    int stack_depth;
-    int chain_depth;
-    struct _PyExitData *exit;
-    PyCodeObject *code;  // Strong
-    PyFunctionObject *func;  // Strong
-    struct _PyExecutorObject *executor;  // Strong
-    _Py_CODEUNIT *start_instr;
-    _Py_CODEUNIT *close_loop_instr;
-    _Py_CODEUNIT *jump_backward_instr;
-} _PyJitTracerInitialState;
-
-typedef struct _PyJitTracerPreviousState {
-    bool dependencies_still_valid;
-    int code_max_size;
-    int code_curr_size;
-    int instr_oparg;
-    int instr_stacklevel;
-    _Py_CODEUNIT *instr;
-    PyCodeObject *instr_code;  // Strong
-    struct _PyInterpreterFrame *instr_frame;
-    _PyBloomFilter dependencies;
-} _PyJitTracerPreviousState;
-
-typedef struct _PyJitTracerTranslatorState {
-    int jump_backward_seen;
-} _PyJitTracerTranslatorState;
-
-typedef struct _PyJitTracerState {
-    bool is_tracing;
-    _PyJitTracerInitialState initial_state;
-    _PyJitTracerPreviousState prev_state;
-    _PyJitTracerTranslatorState translator_state;
-    JitOptContext opt_context;
-    _PyUOpInstruction code_buffer[UOP_MAX_TRACE_LENGTH];
-    _PyUOpInstruction out_buffer[UOP_MAX_TRACE_LENGTH];
-} _PyJitTracerState;
-
-#endif

 // Every PyThreadState is actually allocated as a _PyThreadStateImpl. The
 // PyThreadState fields are exposed as part of the C API, although most fields
@@ -141,7 +100,7 @@ typedef struct _PyThreadStateImpl {
     Py_ssize_t reftotal;  // this thread's total refcount operations
 #endif
 #if _Py_TIER2
-    _PyJitTracerState *jit_tracer_state;
+    struct _PyJitTracerState *jit_tracer_state;
 #endif
 } _PyThreadStateImpl;

diff --git a/Include/internal/pycore_uop.h b/Include/internal/pycore_uop.h
index e828a1cc5a5722..f9be01acb57197 100644
--- a/Include/internal/pycore_uop.h
+++ b/Include/internal/pycore_uop.h
@@ -38,11 +38,10 @@ typedef struct _PyUOpInstruction{

 // This is the length of the trace we translate initially.
 #ifdef Py_DEBUG
 // With asserts, the stencils are a lot larger
-#define UOP_MAX_TRACE_LENGTH 2000
+#define UOP_MAX_TRACE_LENGTH 1000
 #else
-#define UOP_MAX_TRACE_LENGTH 5000
+#define UOP_MAX_TRACE_LENGTH 2500
 #endif
-#define UOP_BUFFER_SIZE (UOP_MAX_TRACE_LENGTH * sizeof(_PyUOpInstruction))

 /* Bloom filter with m = 256
  * https://en.wikipedia.org/wiki/Bloom_filter */

diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h
index 3b4b3253b3638c..d791ba0e8eca97 100644
--- a/Python/ceval_macros.h
+++ b/Python/ceval_macros.h
@@ -433,7 +433,7 @@ do { \
         JUMP_TO_LABEL(error); \
     } \
     if (keep_tracing_bit) { \
-        assert(((_PyThreadStateImpl *)tstate)->jit_tracer_state->prev_state.code_curr_size == 2); \
+        assert(uop_buffer_length(&((_PyThreadStateImpl *)tstate)->jit_tracer_state->code_buffer)); \
         ENTER_TRACING(); \
         DISPATCH_NON_TRACING(); \
     } \

diff --git a/Python/optimizer.c b/Python/optimizer.c
index ab0ef3db4e4882..a9409e72e757dd 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -188,9 +188,6 @@ _PyOptimizer_Optimize(
         }
         insert_executor(code, start, index, executor);
     }
-    else {
-        executor->vm_data.code = NULL;
-    }
     executor->vm_data.chain_depth = chain_depth;
     assert(executor->vm_data.valid);
     _PyExitData *exit = _tstate->jit_tracer_state->initial_state.exit;
@@ -547,52 +544,43 @@ guard_ip_uop[MAX_UOP_ID + 1] = {

 #endif

-static inline int
+static inline void
 add_to_trace(
-    _PyUOpInstruction *trace,
-    int trace_length,
+    _PyJitUopBuffer *trace,
     uint16_t opcode,
     uint16_t oparg,
     uint64_t operand,
     uint32_t target)
 {
-    trace[trace_length].opcode = opcode;
-    trace[trace_length].format = UOP_FORMAT_TARGET;
-    trace[trace_length].target = target;
-    trace[trace_length].oparg = oparg;
-    trace[trace_length].operand0 = operand;
+    _PyUOpInstruction *inst = trace->next;
+    inst->opcode = opcode;
+    inst->format = UOP_FORMAT_TARGET;
+    inst->target = target;
+    inst->oparg = oparg;
+    inst->operand0 = operand;
 #ifdef Py_STATS
-    trace[trace_length].execution_count = 0;
+    inst->execution_count = 0;
 #endif
-    return trace_length + 1;
+    trace->next++;
 }
+
 #ifdef Py_DEBUG
 #define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
-    assert(trace_length < max_length); \
-    trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
+    add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
    if (lltrace >= 2) { \
-        printf("%4d ADD_TO_TRACE: ", trace_length); \
-        _PyUOpPrint(&trace[trace_length-1]); \
+        printf("%4d ADD_TO_TRACE: ", uop_buffer_length(trace)); \
+        _PyUOpPrint(uop_buffer_last(trace)); \
         printf("\n"); \
     }
 #else
 #define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
-    assert(trace_length < max_length); \
-    trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET));
+    add_to_trace(trace, (OPCODE), (OPARG), (OPERAND), (TARGET))
 #endif

 #define INSTR_IP(INSTR, CODE) \
     ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))

-// Reserve space for n uops
-#define RESERVE_RAW(n, opname) \
-    if (trace_length + (n) > max_length) { \
-        DPRINTF(2, "No room for %s (need %d, got %d)\n", \
-                (opname), (n), max_length - trace_length); \
-        OPT_STAT_INC(trace_too_long); \
-        goto full; \
-    }

 static int
 is_terminator(const _PyUOpInstruction *uop)
@@ -629,9 +617,7 @@ _PyJit_translate_single_bytecode_to_trace(
     PyCodeObject *old_code = tracer->prev_state.instr_code;
     bool progress_needed = (tracer->initial_state.chain_depth % MAX_CHAIN_DEPTH) == 0;
     _PyBloomFilter *dependencies = &tracer->prev_state.dependencies;
-    int trace_length = tracer->prev_state.code_curr_size;
-    _PyUOpInstruction *trace = tracer->code_buffer;
-    int max_length = tracer->prev_state.code_max_size;
+    _PyJitUopBuffer *trace = &tracer->code_buffer;

     _Py_CODEUNIT *this_instr = tracer->prev_state.instr;
     _Py_CODEUNIT *target_instr = this_instr;
@@ -670,15 +656,13 @@ _PyJit_translate_single_bytecode_to_trace(
         }
     }

-    int old_stack_level = tracer->prev_state.instr_stacklevel;
-
     // Strange control-flow
     bool has_dynamic_jump_taken = OPCODE_HAS_UNPREDICTABLE_JUMP(opcode) &&
         (next_instr != this_instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]]);

     /* Special case the first instruction,
      * so that we can guarantee forward progress */
-    if (progress_needed && tracer->prev_state.code_curr_size < CODE_SIZE_NO_PROGRESS) {
+    if (progress_needed && uop_buffer_length(&tracer->code_buffer) < CODE_SIZE_NO_PROGRESS) {
         if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
             opcode = _PyOpcode_Deopt[opcode];
         }
@@ -694,7 +678,7 @@ _PyJit_translate_single_bytecode_to_trace(
     int is_sys_tracing = (tstate->c_tracefunc != NULL) || (tstate->c_profilefunc != NULL);

     if (is_sys_tracing) {
-        goto full;
+        goto done;
     }

     if (stop_tracing_opcode == _DEOPT) {
@@ -710,7 +694,7 @@ _PyJit_translate_single_bytecode_to_trace(
         goto done;
     }

-    DPRINTF(2, "%p %d: %s(%d) %d %d\n", old_code, target, _PyOpcode_OpName[opcode], oparg, needs_guard_ip, old_stack_level);
+    DPRINTF(2, "%p %d: %s(%d) %d\n", old_code, target, _PyOpcode_OpName[opcode], oparg, needs_guard_ip);

 #ifdef Py_DEBUG
     if (oparg > 255) {
@@ -719,7 +703,7 @@ _PyJit_translate_single_bytecode_to_trace(
 #endif

     if (!tracer->prev_state.dependencies_still_valid) {
-        goto full;
+        goto done;
     }

     // This happens when a recursive call happens that we can't trace. Such as Python -> C -> Python calls
@@ -734,16 +718,14 @@ _PyJit_translate_single_bytecode_to_trace(
 unsupported:
     {
         // Rewind to previous instruction and replace with _EXIT_TRACE.
-        _PyUOpInstruction *curr = &trace[trace_length-1];
-        while (curr->opcode != _SET_IP && trace_length > 2) {
-            trace_length--;
-            curr = &trace[trace_length-1];
+        _PyUOpInstruction *curr = uop_buffer_last(trace);
+        while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) {
+            trace->next--;
+            curr = uop_buffer_last(trace);
         }
-        assert(curr->opcode == _SET_IP || trace_length == 2);
+        assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2);
         if (curr->opcode == _SET_IP) {
             int32_t old_target = (int32_t)uop_get_target(curr);
-            curr++;
-            trace_length++;
             curr->opcode = _DEOPT;
             curr->format = UOP_FORMAT_TARGET;
             curr->target = old_target;
@@ -752,7 +734,6 @@ _PyJit_translate_single_bytecode_to_trace(
         }
     }

-
     if (opcode == NOP) {
         return 1;
     }
@@ -766,7 +747,7 @@ _PyJit_translate_single_bytecode_to_trace(
     }

     // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT
-    max_length -= 2;
+    trace->end -= 2;

     const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
@@ -775,18 +756,28 @@ _PyJit_translate_single_bytecode_to_trace(

     if (OPCODE_HAS_EXIT(opcode)) {
-        // Make space for side exit and final _EXIT_TRACE:
-        max_length--;
+        // Make space for side exit
+        trace->end--;
     }
     if (OPCODE_HAS_ERROR(opcode)) {
-        // Make space for error stub and final _EXIT_TRACE:
-        max_length--;
+        // Make space for error stub
+        trace->end--;
+    }
+    if (OPCODE_HAS_DEOPT(opcode)) {
+        // Make space for side exit
+        trace->end--;
     }

     // _GUARD_IP leads to an exit.
-    max_length -= needs_guard_ip;
+    trace->end -= needs_guard_ip;

-    RESERVE_RAW(expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode)), "uop and various checks");
+    int space_needed = expansion->nuops + needs_guard_ip + 2 + (!OPCODE_HAS_NO_SAVE_IP(opcode));
+    if (uop_buffer_remaining_space(trace) < space_needed) {
+        DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n",
+                space_needed, uop_buffer_remaining_space(trace));
+        OPT_STAT_INC(trace_too_long);
+        goto done;
+    }

     ADD_TO_TRACE(_CHECK_VALIDITY, 0, 0, target);
@@ -825,7 +816,7 @@ _PyJit_translate_single_bytecode_to_trace(
         {
             if ((next_instr != tracer->initial_state.close_loop_instr) &&
                 (next_instr != tracer->initial_state.start_instr) &&
-                tracer->prev_state.code_curr_size > CODE_SIZE_NO_PROGRESS &&
+                uop_buffer_length(&tracer->code_buffer) > CODE_SIZE_NO_PROGRESS &&
                 // For side exits, we don't want to terminate them early.
                 tracer->initial_state.exit == NULL &&
                 // These are coroutines, and we want to unroll those usually.
@@ -836,7 +827,7 @@ _PyJit_translate_single_bytecode_to_trace(
                 // inner loop might start and let the traces rejoin.
                 OPT_STAT_INC(inner_loop);
                 ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
-                trace[trace_length-1].operand1 = true; // is_control_flow
+                uop_buffer_last(trace)->operand1 = true; // is_control_flow
                 DPRINTF(2, "JUMP_BACKWARD not to top ends trace %p %p %p\n",
                         next_instr, tracer->initial_state.close_loop_instr, tracer->initial_state.start_instr);
                 goto done;
@@ -913,19 +904,19 @@ _PyJit_translate_single_bytecode_to_trace(
                     }
                     break;
                 case OPERAND1_1:
-                    assert(trace[trace_length-1].opcode == uop);
+                    assert(uop_buffer_last(trace)->opcode == uop);
                     operand = read_u16(&this_instr[offset].cache);
-                    trace[trace_length-1].operand1 = operand;
+                    uop_buffer_last(trace)->operand1 = operand;
                     continue;
                 case OPERAND1_2:
-                    assert(trace[trace_length-1].opcode == uop);
+                    assert(uop_buffer_last(trace)->opcode == uop);
                     operand = read_u32(&this_instr[offset].cache);
-                    trace[trace_length-1].operand1 = operand;
+                    uop_buffer_last(trace)->operand1 = operand;
                     continue;
                 case OPERAND1_4:
-                    assert(trace[trace_length-1].opcode == uop);
+                    assert(uop_buffer_last(trace)->opcode == uop);
                     operand = read_u64(&this_instr[offset].cache);
-                    trace[trace_length-1].operand1 = operand;
+                    uop_buffer_last(trace)->operand1 = operand;
                     continue;
                 default:
                     fprintf(stderr,
@@ -955,7 +946,7 @@ _PyJit_translate_single_bytecode_to_trace(
                     }
                 }
                 ADD_TO_TRACE(uop, oparg, operand, target);
-                trace[trace_length - 1].operand1 = PyStackRef_IsNone(frame->f_executable) ? 2 : ((int)(frame->stackpointer - _PyFrame_Stackbase(frame)));
+                uop_buffer_last(trace)->operand1 = PyStackRef_IsNone(frame->f_executable) ? 2 : ((int)(frame->stackpointer - _PyFrame_Stackbase(frame)));
                 break;
             }
             if (uop == _BINARY_OP_INPLACE_ADD_UNICODE) {
@@ -973,9 +964,9 @@ _PyJit_translate_single_bytecode_to_trace(
     }  // End switch (opcode)

     if (needs_guard_ip) {
-        uint16_t guard_ip = guard_ip_uop[trace[trace_length-1].opcode];
+        uint16_t guard_ip = guard_ip_uop[uop_buffer_last(trace)->opcode];
         if (guard_ip == 0) {
-            DPRINTF(1, "Unknown uop needing guard ip %s\n", _PyOpcode_uop_name[trace[trace_length-1].opcode]);
+            DPRINTF(1, "Unknown uop needing guard ip %s\n", _PyOpcode_uop_name[uop_buffer_last(trace)->opcode]);
             Py_UNREACHABLE();
         }
         ADD_TO_TRACE(guard_ip, 0, (uintptr_t)next_instr, 0);
@@ -983,7 +974,7 @@ _PyJit_translate_single_bytecode_to_trace(

     // Loop back to the start
     int is_first_instr = tracer->initial_state.close_loop_instr == next_instr ||
         tracer->initial_state.start_instr == next_instr;
-    if (is_first_instr && tracer->prev_state.code_curr_size > CODE_SIZE_NO_PROGRESS) {
+    if (is_first_instr && uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) {
         if (needs_guard_ip) {
             ADD_TO_TRACE(_SET_IP, 0, (uintptr_t)next_instr, 0);
         }
@@ -991,27 +982,13 @@ _PyJit_translate_single_bytecode_to_trace(
         goto done;
     }
     DPRINTF(2, "Trace continuing\n");
-    tracer->prev_state.code_curr_size = trace_length;
-    tracer->prev_state.code_max_size = max_length;
     return 1;
 done:
     DPRINTF(2, "Trace done\n");
-    tracer->prev_state.code_curr_size = trace_length;
-    tracer->prev_state.code_max_size = max_length;
-    return 0;
-full:
-    DPRINTF(2, "Trace full\n");
-    if (!is_terminator(&tracer->code_buffer[trace_length-1])) {
-        // Undo the last few instructions.
-        trace_length = tracer->prev_state.code_curr_size;
-        max_length = tracer->prev_state.code_max_size;
-        // We previously reversed one.
-        max_length += 1;
+    if (!is_terminator(uop_buffer_last(trace))) {
         ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
-        trace[trace_length-1].operand1 = true; // is_control_flow
+        uop_buffer_last(trace)->operand1 = true; // is_control_flow
     }
-    tracer->prev_state.code_curr_size = trace_length;
-    tracer->prev_state.code_max_size = max_length;
     return 0;
 }

@@ -1059,11 +1036,12 @@ _PyJit_TryInitializeTracing(
             2 * INSTR_IP(close_loop_instr, code),
             chain_depth);
 #endif
-    add_to_trace(tracer->code_buffer, 0, _START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
-    add_to_trace(tracer->code_buffer, 1, _MAKE_WARM, 0, 0, 0);
-    tracer->prev_state.code_curr_size = CODE_SIZE_EMPTY;
+    /* Set up tracing buffer */
+    _PyJitUopBuffer *trace = &tracer->code_buffer;
+    uop_buffer_init(trace, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
+    ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)start_instr, INSTR_IP(start_instr, code));
+    ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);

-    tracer->prev_state.code_max_size = UOP_MAX_TRACE_LENGTH/2;
     tracer->initial_state.start_instr = start_instr;
     tracer->initial_state.close_loop_instr = close_loop_instr;
     tracer->initial_state.code = (PyCodeObject *)Py_NewRef(code);
@@ -1122,8 +1100,7 @@ _PyJit_FinalizeTracing(PyThreadState *tstate, int err)
     Py_CLEAR(tracer->initial_state.func);
     Py_CLEAR(tracer->initial_state.executor);
     Py_CLEAR(tracer->prev_state.instr_code);
-    tracer->prev_state.code_curr_size = CODE_SIZE_EMPTY;
-    tracer->prev_state.code_max_size = UOP_MAX_TRACE_LENGTH/2 - 1;
+    uop_buffer_init(&tracer->code_buffer, &tracer->uop_array[0], UOP_MAX_TRACE_LENGTH);
     tracer->is_tracing = false;
 }

@@ -1137,7 +1114,6 @@ _PyJit_TracerFree(_PyThreadStateImpl *_tstate)
 }

 #undef RESERVE
-#undef RESERVE_RAW
 #undef INSTR_IP
 #undef ADD_TO_TRACE
 #undef DPRINTF
@@ -1467,39 +1443,47 @@ int effective_trace_length(_PyUOpInstruction *buffer, int length)

 static int
-stack_allocate(_PyUOpInstruction *buffer, int length)
+stack_allocate(_PyUOpInstruction *buffer, _PyUOpInstruction *output, int length)
 {
     assert(buffer[0].opcode == _START_EXECUTOR);
-    for (int i = length-1; i >= 0; i--) {
-        buffer[i*2+1] = buffer[i];
-        buffer[i*2].format = UOP_FORMAT_TARGET;
-        buffer[i*2].oparg = 0;
-        buffer[i*2].target = 0;
+    /* The input buffer and output buffers will overlap.
+       Make sure that we can move instructions to the output
+       without overwriting the input. */
+    if (buffer == output) {
+        // This can only happen if optimizer has not been run
+        for (int i = 0; i < length; i++) {
+            buffer[i + UOP_MAX_TRACE_LENGTH] = buffer[i];
+        }
+        buffer += UOP_MAX_TRACE_LENGTH;
+    }
+    else {
+        assert(output + UOP_MAX_TRACE_LENGTH == buffer);
     }
     int depth = 0;
+    _PyUOpInstruction *write = output;
     for (int i = 0; i < length; i++) {
-        _PyUOpInstruction *spill_or_reload = &buffer[i*2];
-        int uop = buffer[i*2+1].opcode;
+        int uop = buffer[i].opcode;
         if (uop == _NOP) {
-            // leave _NOPs to be cleaned up later
-            spill_or_reload->opcode = _NOP;
             continue;
         }
         int new_depth = _PyUop_Caching[uop].best[depth];
-        if (new_depth == depth) {
-            spill_or_reload->opcode = _NOP;
-        }
-        else {
-            spill_or_reload->opcode = _PyUop_SpillsAndReloads[depth][new_depth];
+        if (new_depth != depth) {
+            write->opcode = _PyUop_SpillsAndReloads[depth][new_depth];
+            assert(write->opcode != 0);
+            write->format = UOP_FORMAT_TARGET;
+            write->oparg = 0;
+            write->target = 0;
+            write++;
             depth = new_depth;
         }
+        *write = buffer[i];
         uint16_t new_opcode = _PyUop_Caching[uop].entries[depth].opcode;
         assert(new_opcode != 0);
-        assert(spill_or_reload->opcode != 0);
-        buffer[i*2+1].opcode = new_opcode;
+        write->opcode = new_opcode;
+        write++;
         depth = _PyUop_Caching[uop].entries[depth].output;
     }
-    return length*2;
+    return (int)(write - output);
 }

 static int
@@ -1512,7 +1496,7 @@ uop_optimize(
     _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
     assert(_tstate->jit_tracer_state != NULL);
     _PyBloomFilter *dependencies = &_tstate->jit_tracer_state->prev_state.dependencies;
-    _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer;
+    _PyUOpInstruction *buffer = _tstate->jit_tracer_state->code_buffer.start;
     OPT_STAT_INC(attempts);
     char *env_var = Py_GETENV("PYTHON_UOPS_OPTIMIZE");
     bool is_noopt = true;
@@ -1520,24 +1504,24 @@ uop_optimize(
         is_noopt = false;
     }
     int curr_stackentries = _tstate->jit_tracer_state->initial_state.stack_depth;
-    int length = _tstate->jit_tracer_state->prev_state.code_curr_size;
+    int length = uop_buffer_length(&_tstate->jit_tracer_state->code_buffer);
     if (length <= CODE_SIZE_NO_PROGRESS) {
         return 0;
     }
     assert(length > 0);
-    assert(length < UOP_MAX_TRACE_LENGTH/2);
+    assert(length < UOP_MAX_TRACE_LENGTH);
     OPT_STAT_INC(traces_created);
     if (!is_noopt) {
+        _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[UOP_MAX_TRACE_LENGTH];
         length = _Py_uop_analyze_and_optimize(
-            _tstate,
-            buffer, length,
-            curr_stackentries, dependencies);
+            _tstate, buffer, length, curr_stackentries,
+            output, dependencies);
         if (length <= 0) {
             return length;
         }
-        buffer = _tstate->jit_tracer_state->out_buffer;
+        buffer = output;
     }
-    assert(length < UOP_MAX_TRACE_LENGTH/2);
+    assert(length < UOP_MAX_TRACE_LENGTH);
     assert(length >= 1);
     /* Fix up */
     for (int pc = 0; pc < length; pc++) {
@@ -1553,7 +1537,9 @@ uop_optimize(
         assert(_PyOpcode_uop_name[buffer[pc].opcode]);
     }
     OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist);
-    length = stack_allocate(buffer, length);
+    _PyUOpInstruction *output = &_tstate->jit_tracer_state->uop_array[0];
+    length = stack_allocate(buffer, output, length);
+    buffer = output;
     length = prepare_for_execution(buffer, length);
     assert(length <= UOP_MAX_TRACE_LENGTH);
     _PyExecutorObject *executor = make_executor_from_uops(
@@ -1711,6 +1697,7 @@ _Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_s
 {
     executor->vm_data.valid = true;
     executor->vm_data.pending_deletion = 0;
+    executor->vm_data.code = NULL;
     for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
         executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
     }
diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c
index d635ebabf9007a..41e52da0a4d8c1 100644
--- a/Python/optimizer_analysis.c
+++ b/Python/optimizer_analysis.c
@@ -203,14 +203,14 @@ static inline void
 add_op(JitOptContext *ctx, _PyUOpInstruction *this_instr, uint16_t opcode, uint16_t oparg, uintptr_t operand0)
 {
-    _PyUOpInstruction *out = &ctx->out_buffer[ctx->out_len];
+    _PyUOpInstruction *out = ctx->out_buffer.next;
     out->opcode = (opcode);
     out->format = this_instr->format;
     out->oparg = (oparg);
     out->target = this_instr->target;
     out->operand0 = (operand0);
     out->operand1 = this_instr->operand1;
-    ctx->out_len++;
+    ctx->out_buffer.next++;
 }

 /* Shortened forms for convenience, used in optimizer_bytecodes.c */
@@ -400,6 +400,7 @@ optimize_uops(
     _PyUOpInstruction *trace,
     int trace_len,
     int curr_stacklen,
+    _PyUOpInstruction *output,
     _PyBloomFilter *dependencies
 )
 {
@@ -410,7 +411,7 @@ optimize_uops(
     JitOptContext *ctx = &tstate->jit_tracer_state->opt_context;
     uint32_t opcode = UINT16_MAX;

-    ctx->out_buffer = tstate->jit_tracer_state->out_buffer;
+    uop_buffer_init(&ctx->out_buffer, output, UOP_MAX_TRACE_LENGTH);

     // Make sure that watchers are set up
     PyInterpreterState *interp = _PyInterpreterState_GET();
@@ -428,14 +429,20 @@ optimize_uops(
     ctx->curr_frame_depth++;
     ctx->frame = frame;

-    ctx->out_len = 0;
-
     _PyUOpInstruction *this_instr = NULL;
     JitOptRef *stack_pointer = ctx->frame->stack_pointer;

-    for (int i = 0; !ctx->done; i++) {
-        assert(i < trace_len);
+    for (int i = 0; i < trace_len; i++) {
         this_instr = &trace[i];
+        if (ctx->done) {
+            // Don't do any more optimization, but
+            // we still need to reach a terminator for correctness.
+            *(ctx->out_buffer.next++) = *this_instr;
+            if (is_terminator_uop(this_instr)) {
+                break;
+            }
+            continue;
+        }

         int oparg = this_instr->oparg;
         opcode = this_instr->opcode;
@@ -455,6 +462,8 @@ optimize_uops(
         }
 #endif

+        _PyUOpInstruction *out_ptr = ctx->out_buffer.next;
+
         switch (opcode) {

 #include "optimizer_cases.c.h"

@@ -464,8 +473,8 @@ optimize_uops(
             Py_UNREACHABLE();
         }
         // If no ADD_OP was called during this iteration, copy the original instruction
-        if (ctx->out_len == i) {
-            ctx->out_buffer[ctx->out_len++] = *this_instr;
+        if (ctx->out_buffer.next == out_ptr) {
+            *(ctx->out_buffer.next++) = *this_instr;
         }
         assert(ctx->frame != NULL);
         if (!CURRENT_FRAME_IS_INIT_SHIM()) {
@@ -496,20 +505,11 @@ optimize_uops(
      * would be no benefit in retrying later */
     _Py_uop_abstractcontext_fini(ctx);
     // Check that the trace ends with a proper terminator
-    if (ctx->out_len > 0) {
-        _PyUOpInstruction *last_uop = &ctx->out_buffer[ctx->out_len - 1];
-        if (!is_terminator_uop(last_uop)) {
-            // Copy remaining uops from original trace until we find a terminator
-            for (int i = ctx->out_len; i < trace_len; i++) {
-                ctx->out_buffer[ctx->out_len++] = trace[i];
-                if (is_terminator_uop(&trace[i])) {
-                    break;
-                }
-            }
-        }
+    if (uop_buffer_length(&ctx->out_buffer) > 0) {
+        assert(is_terminator_uop(uop_buffer_last(&ctx->out_buffer)));
     }
-    return ctx->out_len;
+    return uop_buffer_length(&ctx->out_buffer);

 error:
     DPRINTF(3, "\n");
@@ -666,14 +666,15 @@ _Py_uop_analyze_and_optimize(
     _PyUOpInstruction *buffer,
     int length,
     int curr_stacklen,
+    _PyUOpInstruction *output,
     _PyBloomFilter *dependencies
 )
 {
     OPT_STAT_INC(optimizer_attempts);

     length = optimize_uops(
-        tstate, buffer,
-        length, curr_stacklen, dependencies);
+        tstate, buffer, length, curr_stacklen,
+        output, dependencies);

     if (length == 0) {
         return length;
@@ -681,7 +682,7 @@ _Py_uop_analyze_and_optimize(

     assert(length > 0);

-    length = remove_unneeded_uops(tstate->jit_tracer_state->out_buffer, length);
+    length = remove_unneeded_uops(output, length);
     assert(length > 0);

     OPT_STAT_INC(optimizer_successes);
diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c
index 876ba7c6de7482..505090daea443f 100644
--- a/Python/optimizer_bytecodes.c
+++ b/Python/optimizer_bytecodes.c
@@ -192,7 +192,6 @@ dummy_func(void) {
                 _Py_BloomFilter_Add(dependencies, type);
             }
         }
-    }
 }
@@ -796,7 +795,7 @@ dummy_func(void) {
         if (sym_is_const(ctx, callable) && sym_matches_type(callable, &PyFunction_Type)) {
             assert(PyFunction_Check(sym_get_const(ctx, callable)));
             ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
-            ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)sym_get_const(ctx, callable);
+            uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)sym_get_const(ctx, callable);
         }
         sym_set_type(callable, &PyFunction_Type);
     }
@@ -806,7 +805,7 @@ dummy_func(void) {
             PyMethodObject *method = (PyMethodObject *)sym_get_const(ctx, callable);
             assert(PyMethod_Check(method));
             ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
-            ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)method->im_func;
+            uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)method->im_func;
         }
         sym_set_type(callable, &PyMethod_Type);
     }
@@ -1546,7 +1545,7 @@ dummy_func(void) {
             ctx->frame->globals_watched = true;
         }
         if (ctx->frame->globals_checked_version != version && this_instr[-1].opcode == _NOP) {
-            REPLACE_OP(&ctx->out_buffer[ctx->out_len - 1], _GUARD_GLOBALS_VERSION, 0, version);
+            REPLACE_OP(uop_buffer_last(&ctx->out_buffer), _GUARD_GLOBALS_VERSION, 0, version);
             ctx->frame->globals_checked_version = version;
         }
         if (ctx->frame->globals_checked_version == version) {
diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h
index 012fe16bfd9096..2bc567350df929 100644
--- a/Python/optimizer_cases.c.h
+++ b/Python/optimizer_cases.c.h
@@ -1557,7 +1557,7 @@
                 ctx->frame->globals_watched = true;
             }
             if (ctx->frame->globals_checked_version != version && this_instr[-1].opcode == _NOP) {
-                REPLACE_OP(&ctx->out_buffer[ctx->out_len - 1], _GUARD_GLOBALS_VERSION, 0, version);
+                REPLACE_OP(uop_buffer_last(&ctx->out_buffer), _GUARD_GLOBALS_VERSION, 0, version);
                 ctx->frame->globals_checked_version = version;
             }
             if (ctx->frame->globals_checked_version == version) {
@@ -2861,7 +2861,7 @@
             if (sym_is_const(ctx, callable) && sym_matches_type(callable, &PyFunction_Type)) {
                 assert(PyFunction_Check(sym_get_const(ctx, callable)));
                 ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
-                ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)sym_get_const(ctx, callable);
+                uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)sym_get_const(ctx, callable);
             }
             sym_set_type(callable, &PyFunction_Type);
             break;
@@ -2879,7 +2879,7 @@
                 PyMethodObject *method = (PyMethodObject *)sym_get_const(ctx, callable);
                 assert(PyMethod_Check(method));
                 ADD_OP(_CHECK_FUNCTION_VERSION_INLINE, 0, func_version);
-                ctx->out_buffer[ctx->out_len - 1].operand1 = (uintptr_t)method->im_func;
+                uop_buffer_last(&ctx->out_buffer)->operand1 = (uintptr_t)method->im_func;
             }
             sym_set_type(callable, &PyMethod_Type);
             break;
diff --git a/Python/pystate.c b/Python/pystate.c
index 86dee70734a097..4d1c73aad17942 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -24,7 +24,6 @@
 #include "pycore_stackref.h"   // Py_STACKREF_DEBUG
 #include "pycore_stats.h"      // FT_STAT_WORLD_STOP_INC()
 #include "pycore_time.h"       // _PyTime_Init()
-#include "pycore_uop.h"        // UOP_BUFFER_SIZE
 #include "pycore_uniqueid.h"   // _PyObject_FinalizePerThreadRefcounts()
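
The heart of this change is replacing the old (trace_length, max_length) integer pair with the three-pointer _PyJitUopBuffer cursor: writes advance `next`, reserving space for exit/error stubs shrinks `end`, and rewinding simply moves `next` back. A minimal standalone sketch of those invariants, for review purposes only; the `Inst` and `Buf` stand-ins and the `demo` harness are illustrative and not part of this patch:

    #include <assert.h>
    #include <stdint.h>

    typedef struct { uint16_t opcode; } Inst;  // stand-in for _PyUOpInstruction

    typedef struct {
        Inst *start;  // first slot of the backing array
        Inst *next;   // next free slot; [start, next) holds written uops
        Inst *end;    // first slot past the usable region
    } Buf;            // stand-in for _PyJitUopBuffer

    static void demo(void)
    {
        Inst arr[8];
        Buf b = { arr, arr, arr + 8 };  // what uop_buffer_init() does

        *b.next++ = (Inst){ 1 };        // what add_to_trace() does
        assert(b.next - b.start == 1);  // uop_buffer_length() == 1
        assert(b.end - b.next == 7);    // uop_buffer_remaining_space() == 7

        b.end -= 2;                     // reserving stub space shrinks `end`,
                                        // replacing the old `max_length -= 2`
        assert(b.end - b.next == 5);

        b.next--;                       // rewinding (the `unsupported:` path)
        assert(b.next == b.start);      // just moves `next`; no index math
    }

    int main(void) { demo(); return 0; }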