cpython/Python/optimizer.c
mpage 09c240f20c
gh-115999: Specialize LOAD_GLOBAL in free-threaded builds (#126607)
Enable specialization of LOAD_GLOBAL in free-threaded builds.

Thread-safety of specialization in free-threaded builds is provided by the following:

A critical section is held on both the globals and builtins objects during specialization. This ensures we get an atomic view of both builtins and globals during specialization.
Generation of new keys versions is made atomic in free-threaded builds.
Existing helpers are used to atomically modify the opcode.
Thread-safety of specialized instructions in free-threaded builds is provided by the following:

Relaxed atomics are used when loading and storing dict keys versions. This avoids potential data races as the dict keys versions are read without holding the dictionary's per-object lock in version guards.
Dicts keys objects are passed from keys version guards to the downstream uops. This ensures that we are loading from the correct offset in the keys object. Once a unicode key has been stored in a keys object for a combined dictionary in free-threaded builds, the offset that it is stored in will never be reused for a different key. Once the version guard passes, we know that we are reading from the correct offset.
The dictionary read fast-path is used to read values from the dictionary once we know the correct offset.
2024-11-21 11:22:21 -08:00

1713 lines
59 KiB
C

#ifdef _Py_TIER2
#include "Python.h"
#include "opcode.h"
#include "pycore_interp.h"
#include "pycore_backoff.h"
#include "pycore_bitutils.h" // _Py_popcount32()
#include "pycore_object.h" // _PyObject_GC_UNTRACK()
#include "pycore_opcode_metadata.h" // _PyOpcode_OpName[]
#include "pycore_opcode_utils.h" // MAX_REAL_OPCODE
#include "pycore_optimizer.h" // _Py_uop_analyze_and_optimize()
#include "pycore_pystate.h" // _PyInterpreterState_GET()
#include "pycore_uop_ids.h"
#include "pycore_jit.h"
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#define NEED_OPCODE_METADATA
#include "pycore_uop_metadata.h" // Uop tables
#undef NEED_OPCODE_METADATA
#define MAX_EXECUTORS_SIZE 256
static bool
has_space_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
if (instr->op.code == ENTER_EXECUTOR) {
return true;
}
if (code->co_executors == NULL) {
return true;
}
return code->co_executors->size < MAX_EXECUTORS_SIZE;
}
static int32_t
get_index_for_executor(PyCodeObject *code, _Py_CODEUNIT *instr)
{
if (instr->op.code == ENTER_EXECUTOR) {
return instr->op.arg;
}
_PyExecutorArray *old = code->co_executors;
int size = 0;
int capacity = 0;
if (old != NULL) {
size = old->size;
capacity = old->capacity;
assert(size < MAX_EXECUTORS_SIZE);
}
assert(size <= capacity);
if (size == capacity) {
/* Array is full. Grow array */
int new_capacity = capacity ? capacity * 2 : 4;
_PyExecutorArray *new = PyMem_Realloc(
old,
offsetof(_PyExecutorArray, executors) +
new_capacity * sizeof(_PyExecutorObject *));
if (new == NULL) {
return -1;
}
new->capacity = new_capacity;
new->size = size;
code->co_executors = new;
}
assert(size < code->co_executors->capacity);
return size;
}
static void
insert_executor(PyCodeObject *code, _Py_CODEUNIT *instr, int index, _PyExecutorObject *executor)
{
Py_INCREF(executor);
if (instr->op.code == ENTER_EXECUTOR) {
assert(index == instr->op.arg);
_Py_ExecutorDetach(code->co_executors->executors[index]);
}
else {
assert(code->co_executors->size == index);
assert(code->co_executors->capacity > index);
code->co_executors->size++;
}
executor->vm_data.opcode = instr->op.code;
executor->vm_data.oparg = instr->op.arg;
executor->vm_data.code = code;
executor->vm_data.index = (int)(instr - _PyCode_CODE(code));
code->co_executors->executors[index] = executor;
assert(index < MAX_EXECUTORS_SIZE);
instr->op.code = ENTER_EXECUTOR;
instr->op.arg = index;
}
static int
never_optimize(
_PyOptimizerObject* self,
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec,
int Py_UNUSED(stack_entries),
bool Py_UNUSED(progress_needed))
{
// This may be called if the optimizer is reset
return 0;
}
PyTypeObject _PyDefaultOptimizer_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "noop_optimizer",
.tp_basicsize = sizeof(_PyOptimizerObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
};
static _PyOptimizerObject _PyOptimizer_Default = {
PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type)
.optimize = never_optimize,
};
_PyOptimizerObject *
_Py_GetOptimizer(void)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
if (interp->optimizer == &_PyOptimizer_Default) {
return NULL;
}
Py_INCREF(interp->optimizer);
return interp->optimizer;
}
static _PyExecutorObject *
make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies);
static const _PyBloomFilter EMPTY_FILTER = { 0 };
_PyOptimizerObject *
_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer)
{
if (optimizer == NULL) {
optimizer = &_PyOptimizer_Default;
}
_PyOptimizerObject *old = interp->optimizer;
if (old == NULL) {
old = &_PyOptimizer_Default;
}
Py_INCREF(optimizer);
interp->optimizer = optimizer;
return old;
}
int
_Py_SetTier2Optimizer(_PyOptimizerObject *optimizer)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
_PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer);
Py_XDECREF(old);
return old == NULL ? -1 : 0;
}
/* Returns 1 if optimized, 0 if not optimized, and -1 for an error.
* If optimized, *executor_ptr contains a new reference to the executor
*/
int
_PyOptimizer_Optimize(
_PyInterpreterFrame *frame, _Py_CODEUNIT *start,
_PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, int chain_depth)
{
// The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
// make progress in order to avoid infinite loops or excessively-long
// side-exit chains. We can only insert the executor into the bytecode if
// this is true, since a deopt won't infinitely re-enter the executor:
chain_depth %= MAX_CHAIN_DEPTH;
bool progress_needed = chain_depth == 0;
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(PyCode_Check(code));
PyInterpreterState *interp = _PyInterpreterState_GET();
if (progress_needed && !has_space_for_executor(code, start)) {
return 0;
}
_PyOptimizerObject *opt = interp->optimizer;
int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)), progress_needed);
if (err <= 0) {
return err;
}
assert(*executor_ptr != NULL);
if (progress_needed) {
int index = get_index_for_executor(code, start);
if (index < 0) {
/* Out of memory. Don't raise and assume that the
* error will show up elsewhere.
*
* If an optimizer has already produced an executor,
* it might get confused by the executor disappearing,
* but there is not much we can do about that here. */
Py_DECREF(*executor_ptr);
return 0;
}
insert_executor(code, start, index, *executor_ptr);
}
else {
(*executor_ptr)->vm_data.code = NULL;
}
(*executor_ptr)->vm_data.chain_depth = chain_depth;
assert((*executor_ptr)->vm_data.valid);
return 1;
}
static _PyExecutorObject *
get_executor_lock_held(PyCodeObject *code, int offset)
{
int code_len = (int)Py_SIZE(code);
for (int i = 0 ; i < code_len;) {
if (_PyCode_CODE(code)[i].op.code == ENTER_EXECUTOR && i*2 == offset) {
int oparg = _PyCode_CODE(code)[i].op.arg;
_PyExecutorObject *res = code->co_executors->executors[oparg];
Py_INCREF(res);
return res;
}
i += _PyInstruction_GetLength(code, i);
}
PyErr_SetString(PyExc_ValueError, "no executor at given byte offset");
return NULL;
}
_PyExecutorObject *
_Py_GetExecutor(PyCodeObject *code, int offset)
{
_PyExecutorObject *executor;
Py_BEGIN_CRITICAL_SECTION(code);
executor = get_executor_lock_held(code, offset);
Py_END_CRITICAL_SECTION();
return executor;
}
static PyObject *
is_valid(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return PyBool_FromLong(((_PyExecutorObject *)self)->vm_data.valid);
}
static PyObject *
get_opcode(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return PyLong_FromUnsignedLong(((_PyExecutorObject *)self)->vm_data.opcode);
}
static PyObject *
get_oparg(PyObject *self, PyObject *Py_UNUSED(ignored))
{
return PyLong_FromUnsignedLong(((_PyExecutorObject *)self)->vm_data.oparg);
}
static PyMethodDef executor_methods[] = {
{ "is_valid", is_valid, METH_NOARGS, NULL },
{ "get_opcode", get_opcode, METH_NOARGS, NULL },
{ "get_oparg", get_oparg, METH_NOARGS, NULL },
{ NULL, NULL },
};
///////////////////// Experimental UOp Optimizer /////////////////////
static int executor_clear(_PyExecutorObject *executor);
static void unlink_executor(_PyExecutorObject *executor);
static void
uop_dealloc(_PyExecutorObject *self) {
_PyObject_GC_UNTRACK(self);
assert(self->vm_data.code == NULL);
unlink_executor(self);
#ifdef _Py_JIT
_PyJIT_Free(self);
#endif
PyObject_GC_Del(self);
}
const char *
_PyUOpName(int index)
{
if (index < 0 || index > MAX_UOP_ID) {
return NULL;
}
return _PyOpcode_uop_name[index];
}
#ifdef Py_DEBUG
void
_PyUOpPrint(const _PyUOpInstruction *uop)
{
const char *name = _PyUOpName(uop->opcode);
if (name == NULL) {
printf("<uop %d>", uop->opcode);
}
else {
printf("%s", name);
}
switch(uop->format) {
case UOP_FORMAT_TARGET:
printf(" (%d, target=%d, operand=%#" PRIx64,
uop->oparg,
uop->target,
(uint64_t)uop->operand0);
break;
case UOP_FORMAT_JUMP:
printf(" (%d, jump_target=%d, operand=%#" PRIx64,
uop->oparg,
uop->jump_target,
(uint64_t)uop->operand0);
break;
default:
printf(" (%d, Unknown format)", uop->oparg);
}
if (_PyUop_Flags[uop->opcode] & HAS_ERROR_FLAG) {
printf(", error_target=%d", uop->error_target);
}
printf(")");
}
#endif
static Py_ssize_t
uop_len(_PyExecutorObject *self)
{
return self->code_size;
}
static PyObject *
uop_item(_PyExecutorObject *self, Py_ssize_t index)
{
Py_ssize_t len = uop_len(self);
if (index < 0 || index >= len) {
PyErr_SetNone(PyExc_IndexError);
return NULL;
}
const char *name = _PyUOpName(self->trace[index].opcode);
if (name == NULL) {
name = "<nil>";
}
PyObject *oname = _PyUnicode_FromASCII(name, strlen(name));
if (oname == NULL) {
return NULL;
}
PyObject *oparg = PyLong_FromUnsignedLong(self->trace[index].oparg);
if (oparg == NULL) {
Py_DECREF(oname);
return NULL;
}
PyObject *target = PyLong_FromUnsignedLong(self->trace[index].target);
if (oparg == NULL) {
Py_DECREF(oparg);
Py_DECREF(oname);
return NULL;
}
PyObject *operand = PyLong_FromUnsignedLongLong(self->trace[index].operand0);
if (operand == NULL) {
Py_DECREF(target);
Py_DECREF(oparg);
Py_DECREF(oname);
return NULL;
}
PyObject *args[4] = { oname, oparg, target, operand };
return _PyTuple_FromArraySteal(args, 4);
}
PySequenceMethods uop_as_sequence = {
.sq_length = (lenfunc)uop_len,
.sq_item = (ssizeargfunc)uop_item,
};
static int
executor_traverse(PyObject *o, visitproc visit, void *arg)
{
_PyExecutorObject *executor = (_PyExecutorObject *)o;
for (uint32_t i = 0; i < executor->exit_count; i++) {
Py_VISIT(executor->exits[i].executor);
}
return 0;
}
static PyObject *
get_jit_code(PyObject *self, PyObject *Py_UNUSED(ignored))
{
#ifndef _Py_JIT
PyErr_SetString(PyExc_RuntimeError, "JIT support not enabled.");
return NULL;
#else
_PyExecutorObject *executor = (_PyExecutorObject *)self;
if (executor->jit_code == NULL || executor->jit_size == 0) {
Py_RETURN_NONE;
}
return PyBytes_FromStringAndSize(executor->jit_code, executor->jit_size);
#endif
}
static PyMethodDef uop_executor_methods[] = {
{ "is_valid", is_valid, METH_NOARGS, NULL },
{ "get_jit_code", get_jit_code, METH_NOARGS, NULL},
{ "get_opcode", get_opcode, METH_NOARGS, NULL },
{ "get_oparg", get_oparg, METH_NOARGS, NULL },
{ NULL, NULL },
};
static int
executor_is_gc(PyObject *o)
{
return !_Py_IsImmortal(o);
}
PyTypeObject _PyUOpExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_executor",
.tp_basicsize = offsetof(_PyExecutorObject, exits),
.tp_itemsize = 1,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
.tp_dealloc = (destructor)uop_dealloc,
.tp_as_sequence = &uop_as_sequence,
.tp_methods = uop_executor_methods,
.tp_traverse = executor_traverse,
.tp_clear = (inquiry)executor_clear,
.tp_is_gc = executor_is_gc,
};
/* TO DO -- Generate these tables */
static const uint16_t
_PyUOp_Replacements[MAX_UOP_ID + 1] = {
[_ITER_JUMP_RANGE] = _GUARD_NOT_EXHAUSTED_RANGE,
[_ITER_JUMP_LIST] = _GUARD_NOT_EXHAUSTED_LIST,
[_ITER_JUMP_TUPLE] = _GUARD_NOT_EXHAUSTED_TUPLE,
[_FOR_ITER] = _FOR_ITER_TIER_TWO,
};
static const uint8_t
is_for_iter_test[MAX_UOP_ID + 1] = {
[_GUARD_NOT_EXHAUSTED_RANGE] = 1,
[_GUARD_NOT_EXHAUSTED_LIST] = 1,
[_GUARD_NOT_EXHAUSTED_TUPLE] = 1,
[_FOR_ITER_TIER_TWO] = 1,
};
static const uint16_t
BRANCH_TO_GUARD[4][2] = {
[POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_TRUE_POP,
[POP_JUMP_IF_FALSE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_FALSE_POP,
[POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_FALSE_POP,
[POP_JUMP_IF_TRUE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_TRUE_POP,
[POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NOT_NONE_POP,
[POP_JUMP_IF_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NONE_POP,
[POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][0] = _GUARD_IS_NONE_POP,
[POP_JUMP_IF_NOT_NONE - POP_JUMP_IF_FALSE][1] = _GUARD_IS_NOT_NONE_POP,
};
#define CONFIDENCE_RANGE 1000
#define CONFIDENCE_CUTOFF 333
#ifdef Py_DEBUG
#define DPRINTF(level, ...) \
if (lltrace >= (level)) { printf(__VA_ARGS__); }
#else
#define DPRINTF(level, ...)
#endif
static inline int
add_to_trace(
_PyUOpInstruction *trace,
int trace_length,
uint16_t opcode,
uint16_t oparg,
uint64_t operand,
uint32_t target)
{
trace[trace_length].opcode = opcode;
trace[trace_length].format = UOP_FORMAT_TARGET;
trace[trace_length].target = target;
trace[trace_length].oparg = oparg;
trace[trace_length].operand0 = operand;
return trace_length + 1;
}
#ifdef Py_DEBUG
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
assert(trace_length < max_length); \
trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET)); \
if (lltrace >= 2) { \
printf("%4d ADD_TO_TRACE: ", trace_length); \
_PyUOpPrint(&trace[trace_length-1]); \
printf("\n"); \
}
#else
#define ADD_TO_TRACE(OPCODE, OPARG, OPERAND, TARGET) \
assert(trace_length < max_length); \
trace_length = add_to_trace(trace, trace_length, (OPCODE), (OPARG), (OPERAND), (TARGET));
#endif
#define INSTR_IP(INSTR, CODE) \
((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive)))
// Reserve space for n uops
#define RESERVE_RAW(n, opname) \
if (trace_length + (n) > max_length) { \
DPRINTF(2, "No room for %s (need %d, got %d)\n", \
(opname), (n), max_length - trace_length); \
OPT_STAT_INC(trace_too_long); \
goto done; \
}
// Reserve space for N uops, plus 3 for _SET_IP, _CHECK_VALIDITY and _EXIT_TRACE
#define RESERVE(needed) RESERVE_RAW((needed) + 3, _PyUOpName(opcode))
// Trace stack operations (used by _PUSH_FRAME, _RETURN_VALUE)
#define TRACE_STACK_PUSH() \
if (trace_stack_depth >= TRACE_STACK_SIZE) { \
DPRINTF(2, "Trace stack overflow\n"); \
OPT_STAT_INC(trace_stack_overflow); \
return 0; \
} \
assert(func == NULL || func->func_code == (PyObject *)code); \
trace_stack[trace_stack_depth].func = func; \
trace_stack[trace_stack_depth].code = code; \
trace_stack[trace_stack_depth].instr = instr; \
trace_stack_depth++;
#define TRACE_STACK_POP() \
if (trace_stack_depth <= 0) { \
Py_FatalError("Trace stack underflow\n"); \
} \
trace_stack_depth--; \
func = trace_stack[trace_stack_depth].func; \
code = trace_stack[trace_stack_depth].code; \
assert(func == NULL || func->func_code == (PyObject *)code); \
instr = trace_stack[trace_stack_depth].instr;
/* Returns the length of the trace on success,
* 0 if it failed to produce a worthwhile trace,
* and -1 on an error.
*/
static int
translate_bytecode_to_trace(
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyUOpInstruction *trace,
int buffer_size,
_PyBloomFilter *dependencies, bool progress_needed)
{
bool first = true;
PyCodeObject *code = _PyFrame_GetCode(frame);
PyFunctionObject *func = _PyFrame_GetFunction(frame);
assert(PyFunction_Check(func));
PyCodeObject *initial_code = code;
_Py_BloomFilter_Add(dependencies, initial_code);
_Py_CODEUNIT *initial_instr = instr;
int trace_length = 0;
// Leave space for possible trailing _EXIT_TRACE
int max_length = buffer_size-2;
struct {
PyFunctionObject *func;
PyCodeObject *code;
_Py_CODEUNIT *instr;
} trace_stack[TRACE_STACK_SIZE];
int trace_stack_depth = 0;
int confidence = CONFIDENCE_RANGE; // Adjusted by branch instructions
bool jump_seen = false;
#ifdef Py_DEBUG
char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
int lltrace = 0;
if (python_lltrace != NULL && *python_lltrace >= '0') {
lltrace = *python_lltrace - '0'; // TODO: Parse an int and all that
}
#endif
DPRINTF(2,
"Optimizing %s (%s:%d) at byte offset %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
2 * INSTR_IP(initial_instr, code));
ADD_TO_TRACE(_START_EXECUTOR, 0, (uintptr_t)instr, INSTR_IP(instr, code));
ADD_TO_TRACE(_MAKE_WARM, 0, 0, 0);
uint32_t target = 0;
for (;;) {
target = INSTR_IP(instr, code);
// Need space for _DEOPT
max_length--;
uint32_t opcode = instr->op.code;
uint32_t oparg = instr->op.arg;
if (!first && instr == initial_instr) {
// We have looped around to the start:
RESERVE(1);
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
goto done;
}
DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg);
if (opcode == EXTENDED_ARG) {
instr++;
opcode = instr->op.code;
oparg = (oparg << 8) | instr->op.arg;
if (opcode == EXTENDED_ARG) {
instr--;
goto done;
}
}
if (opcode == ENTER_EXECUTOR) {
// We have a couple of options here. We *could* peek "underneath"
// this executor and continue tracing, which could give us a longer,
// more optimizeable trace (at the expense of lots of duplicated
// tier two code). Instead, we choose to just end here and stitch to
// the other trace, which allows a side-exit traces to rejoin the
// "main" trace periodically (and also helps protect us against
// pathological behavior where the amount of tier two code explodes
// for a medium-length, branchy code path). This seems to work
// better in practice, but in the future we could be smarter about
// what we do here:
goto done;
}
assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP");
ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target);
/* Special case the first instruction,
* so that we can guarantee forward progress */
if (first && progress_needed) {
assert(first);
if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
opcode = _PyOpcode_Deopt[opcode];
}
assert(!OPCODE_HAS_EXIT(opcode));
assert(!OPCODE_HAS_DEOPT(opcode));
}
if (OPCODE_HAS_EXIT(opcode)) {
// Make space for side exit and final _EXIT_TRACE:
RESERVE_RAW(2, "_EXIT_TRACE");
max_length--;
}
if (OPCODE_HAS_ERROR(opcode)) {
// Make space for error stub and final _EXIT_TRACE:
RESERVE_RAW(2, "_ERROR_POP_N");
max_length--;
}
switch (opcode) {
case POP_JUMP_IF_NONE:
case POP_JUMP_IF_NOT_NONE:
case POP_JUMP_IF_FALSE:
case POP_JUMP_IF_TRUE:
{
RESERVE(1);
int counter = instr[1].cache;
int bitcount = _Py_popcount32(counter);
int jump_likely = bitcount > 8;
/* If bitcount is 8 (half the jumps were taken), adjust confidence by 50%.
For values in between, adjust proportionally. */
if (jump_likely) {
confidence = confidence * bitcount / 16;
}
else {
confidence = confidence * (16 - bitcount) / 16;
}
uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_likely];
DPRINTF(2, "%d: %s(%d): counter=%04x, bitcount=%d, likely=%d, confidence=%d, uopcode=%s\n",
target, _PyOpcode_OpName[opcode], oparg,
counter, bitcount, jump_likely, confidence, _PyUOpName(uopcode));
if (confidence < CONFIDENCE_CUTOFF) {
DPRINTF(2, "Confidence too low (%d < %d)\n", confidence, CONFIDENCE_CUTOFF);
OPT_STAT_INC(low_confidence);
goto done;
}
_Py_CODEUNIT *next_instr = instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
_Py_CODEUNIT *target_instr = next_instr + oparg;
if (jump_likely) {
DPRINTF(2, "Jump likely (%04x = %d bits), continue at byte offset %d\n",
instr[1].cache, bitcount, 2 * INSTR_IP(target_instr, code));
instr = target_instr;
ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(next_instr, code));
goto top;
}
ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(target_instr, code));
break;
}
case JUMP_BACKWARD:
ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target);
_Py_FALLTHROUGH;
case JUMP_BACKWARD_NO_INTERRUPT:
{
instr += 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] - (int)oparg;
if (jump_seen) {
OPT_STAT_INC(inner_loop);
DPRINTF(2, "JUMP_BACKWARD not to top ends trace\n");
goto done;
}
jump_seen = true;
goto top;
}
case JUMP_FORWARD:
{
RESERVE(0);
// This will emit two _SET_IP instructions; leave it to the optimizer
instr += oparg;
break;
}
case RESUME:
/* Use a special tier 2 version of RESUME_CHECK to allow traces to
* start with RESUME_CHECK */
ADD_TO_TRACE(_TIER2_RESUME_CHECK, 0, 0, target);
break;
default:
{
const struct opcode_macro_expansion *expansion = &_PyOpcode_macro_expansion[opcode];
if (expansion->nuops > 0) {
// Reserve space for nuops (+ _SET_IP + _EXIT_TRACE)
int nuops = expansion->nuops;
RESERVE(nuops + 1); /* One extra for exit */
int16_t last_op = expansion->uops[nuops-1].uop;
if (last_op == _RETURN_VALUE || last_op == _RETURN_GENERATOR || last_op == _YIELD_VALUE) {
// Check for trace stack underflow now:
// We can't bail e.g. in the middle of
// LOAD_CONST + _RETURN_VALUE.
if (trace_stack_depth == 0) {
DPRINTF(2, "Trace stack underflow\n");
OPT_STAT_INC(trace_stack_underflow);
goto done;
}
}
uint32_t orig_oparg = oparg; // For OPARG_TOP/BOTTOM
for (int i = 0; i < nuops; i++) {
oparg = orig_oparg;
uint32_t uop = expansion->uops[i].uop;
uint64_t operand = 0;
// Add one to account for the actual opcode/oparg pair:
int offset = expansion->uops[i].offset + 1;
switch (expansion->uops[i].size) {
case OPARG_FULL:
assert(opcode != JUMP_BACKWARD_NO_INTERRUPT && opcode != JUMP_BACKWARD);
break;
case OPARG_CACHE_1:
operand = read_u16(&instr[offset].cache);
break;
case OPARG_CACHE_2:
operand = read_u32(&instr[offset].cache);
break;
case OPARG_CACHE_4:
operand = read_u64(&instr[offset].cache);
break;
case OPARG_TOP: // First half of super-instr
oparg = orig_oparg >> 4;
break;
case OPARG_BOTTOM: // Second half of super-instr
oparg = orig_oparg & 0xF;
break;
case OPARG_SAVE_RETURN_OFFSET: // op=_SAVE_RETURN_OFFSET; oparg=return_offset
oparg = offset;
assert(uop == _SAVE_RETURN_OFFSET);
break;
case OPARG_REPLACED:
uop = _PyUOp_Replacements[uop];
assert(uop != 0);
#ifdef Py_DEBUG
{
uint32_t next_inst = target + 1 + INLINE_CACHE_ENTRIES_FOR_ITER + (oparg > 255);
uint32_t jump_target = next_inst + oparg;
assert(_Py_GetBaseCodeUnit(code, jump_target).op.code == END_FOR);
assert(_Py_GetBaseCodeUnit(code, jump_target+1).op.code == POP_TOP);
}
#endif
break;
default:
fprintf(stderr,
"opcode=%d, oparg=%d; nuops=%d, i=%d; size=%d, offset=%d\n",
opcode, oparg, nuops, i,
expansion->uops[i].size,
expansion->uops[i].offset);
Py_FatalError("garbled expansion");
}
if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) {
TRACE_STACK_POP();
/* Set the operand to the function or code object returned to,
* to assist optimization passes. (See _PUSH_FRAME below.)
*/
if (func != NULL) {
operand = (uintptr_t)func;
}
else if (code != NULL) {
operand = (uintptr_t)code | 1;
}
else {
operand = 0;
}
ADD_TO_TRACE(uop, oparg, operand, target);
DPRINTF(2,
"Returning to %s (%s:%d) at byte offset %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
2 * INSTR_IP(instr, code));
goto top;
}
if (uop == _PUSH_FRAME) {
assert(i + 1 == nuops);
if (opcode == FOR_ITER_GEN ||
opcode == LOAD_ATTR_PROPERTY ||
opcode == BINARY_SUBSCR_GETITEM ||
opcode == SEND_GEN)
{
DPRINTF(2, "Bailing due to dynamic target\n");
ADD_TO_TRACE(uop, oparg, 0, target);
ADD_TO_TRACE(_DYNAMIC_EXIT, 0, 0, 0);
goto done;
}
assert(_PyOpcode_Deopt[opcode] == CALL || _PyOpcode_Deopt[opcode] == CALL_KW);
int func_version_offset =
offsetof(_PyCallCache, func_version)/sizeof(_Py_CODEUNIT)
// Add one to account for the actual opcode/oparg pair:
+ 1;
uint32_t func_version = read_u32(&instr[func_version_offset].cache);
PyCodeObject *new_code = NULL;
PyFunctionObject *new_func =
_PyFunction_LookupByVersion(func_version, (PyObject **) &new_code);
DPRINTF(2, "Function: version=%#x; new_func=%p, new_code=%p\n",
(int)func_version, new_func, new_code);
if (new_code != NULL) {
if (new_code == code) {
// Recursive call, bail (we could be here forever).
DPRINTF(2, "Bailing on recursive call to %s (%s:%d)\n",
PyUnicode_AsUTF8(new_code->co_qualname),
PyUnicode_AsUTF8(new_code->co_filename),
new_code->co_firstlineno);
OPT_STAT_INC(recursive_call);
ADD_TO_TRACE(uop, oparg, 0, target);
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0);
goto done;
}
if (new_code->co_version != func_version) {
// func.__code__ was updated.
// Perhaps it may happen again, so don't bother tracing.
// TODO: Reason about this -- is it better to bail or not?
DPRINTF(2, "Bailing because co_version != func_version\n");
ADD_TO_TRACE(uop, oparg, 0, target);
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, 0);
goto done;
}
// Increment IP to the return address
instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]] + 1;
TRACE_STACK_PUSH();
_Py_BloomFilter_Add(dependencies, new_code);
/* Set the operand to the callee's function or code object,
* to assist optimization passes.
* We prefer setting it to the function (for remove_globals())
* but if that's not available but the code is available,
* use the code, setting the low bit so the optimizer knows.
*/
if (new_func != NULL) {
operand = (uintptr_t)new_func;
}
else if (new_code != NULL) {
operand = (uintptr_t)new_code | 1;
}
else {
operand = 0;
}
ADD_TO_TRACE(uop, oparg, operand, target);
code = new_code;
func = new_func;
instr = _PyCode_CODE(code);
DPRINTF(2,
"Continuing in %s (%s:%d) at byte offset %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
2 * INSTR_IP(instr, code));
goto top;
}
DPRINTF(2, "Bail, new_code == NULL\n");
ADD_TO_TRACE(uop, oparg, 0, target);
ADD_TO_TRACE(_DYNAMIC_EXIT, 0, 0, 0);
goto done;
}
if (uop == _BINARY_OP_INPLACE_ADD_UNICODE) {
assert(i + 1 == nuops);
_Py_CODEUNIT *next_instr = instr + 1 + _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
assert(next_instr->op.code == STORE_FAST);
operand = next_instr->op.arg;
// Skip the STORE_FAST:
instr++;
}
// All other instructions
ADD_TO_TRACE(uop, oparg, operand, target);
}
break;
}
DPRINTF(2, "Unsupported opcode %s\n", _PyOpcode_OpName[opcode]);
OPT_UNSUPPORTED_OPCODE(opcode);
goto done; // Break out of loop
} // End default
} // End switch (opcode)
instr++;
// Add cache size for opcode
instr += _PyOpcode_Caches[_PyOpcode_Deopt[opcode]];
if (opcode == CALL_LIST_APPEND) {
assert(instr->op.code == POP_TOP);
instr++;
}
top:
// Jump here after _PUSH_FRAME or likely branches.
first = false;
} // End for (;;)
done:
while (trace_stack_depth > 0) {
TRACE_STACK_POP();
}
assert(code == initial_code);
// Skip short traces where we can't even translate a single instruction:
if (first) {
OPT_STAT_INC(trace_too_short);
DPRINTF(2,
"No trace for %s (%s:%d) at byte offset %d (no progress)\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
2 * INSTR_IP(initial_instr, code));
return 0;
}
if (!is_terminator(&trace[trace_length-1])) {
/* Allow space for _EXIT_TRACE */
max_length += 2;
ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target);
}
DPRINTF(1,
"Created a proto-trace for %s (%s:%d) at byte offset %d -- length %d\n",
PyUnicode_AsUTF8(code->co_qualname),
PyUnicode_AsUTF8(code->co_filename),
code->co_firstlineno,
2 * INSTR_IP(initial_instr, code),
trace_length);
OPT_HIST(trace_length, trace_length_hist);
return trace_length;
}
#undef RESERVE
#undef RESERVE_RAW
#undef INSTR_IP
#undef ADD_TO_TRACE
#undef DPRINTF
#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
/* Count the number of unused uops and exits
*/
static int
count_exits(_PyUOpInstruction *buffer, int length)
{
int exit_count = 0;
for (int i = 0; i < length; i++) {
int opcode = buffer[i].opcode;
if (opcode == _EXIT_TRACE || opcode == _DYNAMIC_EXIT) {
exit_count++;
}
}
return exit_count;
}
static void make_exit(_PyUOpInstruction *inst, int opcode, int target)
{
inst->opcode = opcode;
inst->oparg = 0;
inst->operand0 = 0;
inst->format = UOP_FORMAT_TARGET;
inst->target = target;
}
/* Convert implicit exits, errors and deopts
* into explicit ones. */
static int
prepare_for_execution(_PyUOpInstruction *buffer, int length)
{
int32_t current_jump = -1;
int32_t current_jump_target = -1;
int32_t current_error = -1;
int32_t current_error_target = -1;
int32_t current_popped = -1;
int32_t current_exit_op = -1;
/* Leaving in NOPs slows down the interpreter and messes up the stats */
_PyUOpInstruction *copy_to = &buffer[0];
for (int i = 0; i < length; i++) {
_PyUOpInstruction *inst = &buffer[i];
if (inst->opcode != _NOP) {
if (copy_to != inst) {
*copy_to = *inst;
}
copy_to++;
}
}
length = (int)(copy_to - buffer);
int next_spare = length;
for (int i = 0; i < length; i++) {
_PyUOpInstruction *inst = &buffer[i];
int opcode = inst->opcode;
int32_t target = (int32_t)uop_get_target(inst);
if (_PyUop_Flags[opcode] & (HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) {
uint16_t exit_op = (_PyUop_Flags[opcode] & HAS_EXIT_FLAG) ?
_EXIT_TRACE : _DEOPT;
int32_t jump_target = target;
if (is_for_iter_test[opcode]) {
/* Target the POP_TOP immediately after the END_FOR,
* leaving only the iterator on the stack. */
int extended_arg = inst->oparg > 255;
int32_t next_inst = target + 1 + INLINE_CACHE_ENTRIES_FOR_ITER + extended_arg;
jump_target = next_inst + inst->oparg + 1;
}
if (jump_target != current_jump_target || current_exit_op != exit_op) {
make_exit(&buffer[next_spare], exit_op, jump_target);
current_exit_op = exit_op;
current_jump_target = jump_target;
current_jump = next_spare;
next_spare++;
}
buffer[i].jump_target = current_jump;
buffer[i].format = UOP_FORMAT_JUMP;
}
if (_PyUop_Flags[opcode] & HAS_ERROR_FLAG) {
int popped = (_PyUop_Flags[opcode] & HAS_ERROR_NO_POP_FLAG) ?
0 : _PyUop_num_popped(opcode, inst->oparg);
if (target != current_error_target || popped != current_popped) {
current_popped = popped;
current_error = next_spare;
current_error_target = target;
make_exit(&buffer[next_spare], _ERROR_POP_N, 0);
buffer[next_spare].oparg = popped;
buffer[next_spare].operand0 = target;
next_spare++;
}
buffer[i].error_target = current_error;
if (buffer[i].format == UOP_FORMAT_TARGET) {
buffer[i].format = UOP_FORMAT_JUMP;
buffer[i].jump_target = 0;
}
}
if (opcode == _JUMP_TO_TOP) {
assert(buffer[0].opcode == _START_EXECUTOR);
buffer[i].format = UOP_FORMAT_JUMP;
buffer[i].jump_target = 1;
}
}
return next_spare;
}
/* Executor side exits */
static _PyExecutorObject *
allocate_executor(int exit_count, int length)
{
int size = exit_count*sizeof(_PyExitData) + length*sizeof(_PyUOpInstruction);
_PyExecutorObject *res = PyObject_GC_NewVar(_PyExecutorObject, &_PyUOpExecutor_Type, size);
if (res == NULL) {
return NULL;
}
res->trace = (_PyUOpInstruction *)(res->exits + exit_count);
res->code_size = length;
res->exit_count = exit_count;
return res;
}
#ifdef Py_DEBUG
#define CHECK(PRED) \
if (!(PRED)) { \
printf(#PRED " at %d\n", i); \
assert(0); \
}
static int
target_unused(int opcode)
{
return (_PyUop_Flags[opcode] & (HAS_ERROR_FLAG | HAS_EXIT_FLAG | HAS_DEOPT_FLAG)) == 0;
}
static void
sanity_check(_PyExecutorObject *executor)
{
for (uint32_t i = 0; i < executor->exit_count; i++) {
_PyExitData *exit = &executor->exits[i];
CHECK(exit->target < (1 << 25));
}
bool ended = false;
uint32_t i = 0;
CHECK(executor->trace[0].opcode == _START_EXECUTOR);
for (; i < executor->code_size; i++) {
const _PyUOpInstruction *inst = &executor->trace[i];
uint16_t opcode = inst->opcode;
CHECK(opcode <= MAX_UOP_ID);
CHECK(_PyOpcode_uop_name[opcode] != NULL);
switch(inst->format) {
case UOP_FORMAT_TARGET:
CHECK(target_unused(opcode));
break;
case UOP_FORMAT_JUMP:
CHECK(inst->jump_target < executor->code_size);
break;
}
if (_PyUop_Flags[opcode] & HAS_ERROR_FLAG) {
CHECK(inst->format == UOP_FORMAT_JUMP);
CHECK(inst->error_target < executor->code_size);
}
if (is_terminator(inst)) {
ended = true;
i++;
break;
}
}
CHECK(ended);
for (; i < executor->code_size; i++) {
const _PyUOpInstruction *inst = &executor->trace[i];
uint16_t opcode = inst->opcode;
CHECK(
opcode == _DEOPT ||
opcode == _EXIT_TRACE ||
opcode == _ERROR_POP_N);
}
}
#undef CHECK
#endif
/* Makes an executor from a buffer of uops.
* Account for the buffer having gaps and NOPs by computing a "used"
* bit vector and only copying the used uops. Here "used" means reachable
* and not a NOP.
*/
static _PyExecutorObject *
make_executor_from_uops(_PyUOpInstruction *buffer, int length, const _PyBloomFilter *dependencies)
{
int exit_count = count_exits(buffer, length);
_PyExecutorObject *executor = allocate_executor(exit_count, length);
if (executor == NULL) {
return NULL;
}
/* Initialize exits */
for (int i = 0; i < exit_count; i++) {
executor->exits[i].executor = NULL;
executor->exits[i].temperature = initial_temperature_backoff_counter();
}
int next_exit = exit_count-1;
_PyUOpInstruction *dest = (_PyUOpInstruction *)&executor->trace[length];
assert(buffer[0].opcode == _START_EXECUTOR);
buffer[0].operand0 = (uint64_t)executor;
for (int i = length-1; i >= 0; i--) {
int opcode = buffer[i].opcode;
dest--;
*dest = buffer[i];
assert(opcode != _POP_JUMP_IF_FALSE && opcode != _POP_JUMP_IF_TRUE);
if (opcode == _EXIT_TRACE) {
_PyExitData *exit = &executor->exits[next_exit];
exit->target = buffer[i].target;
dest->operand0 = (uint64_t)exit;
next_exit--;
}
if (opcode == _DYNAMIC_EXIT) {
_PyExitData *exit = &executor->exits[next_exit];
exit->target = 0;
dest->operand0 = (uint64_t)exit;
next_exit--;
}
}
assert(next_exit == -1);
assert(dest == executor->trace);
assert(dest->opcode == _START_EXECUTOR);
_Py_ExecutorInit(executor, dependencies);
#ifdef Py_DEBUG
char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
int lltrace = 0;
if (python_lltrace != NULL && *python_lltrace >= '0') {
lltrace = *python_lltrace - '0'; // TODO: Parse an int and all that
}
if (lltrace >= 2) {
printf("Optimized trace (length %d):\n", length);
for (int i = 0; i < length; i++) {
printf("%4d OPTIMIZED: ", i);
_PyUOpPrint(&executor->trace[i]);
printf("\n");
}
}
sanity_check(executor);
#endif
#ifdef _Py_JIT
executor->jit_code = NULL;
executor->jit_side_entry = NULL;
executor->jit_size = 0;
// This is initialized to true so we can prevent the executor
// from being immediately detected as cold and invalidated.
executor->vm_data.warm = true;
if (_PyJIT_Compile(executor, executor->trace, length)) {
Py_DECREF(executor);
return NULL;
}
#endif
_PyObject_GC_TRACK(executor);
return executor;
}
#ifdef Py_STATS
/* Returns the effective trace length.
* Ignores NOPs and trailing exit and error handling.*/
int effective_trace_length(_PyUOpInstruction *buffer, int length)
{
int nop_count = 0;
for (int i = 0; i < length; i++) {
int opcode = buffer[i].opcode;
if (opcode == _NOP) {
nop_count++;
}
if (is_terminator(&buffer[i])) {
return i+1-nop_count;
}
}
Py_FatalError("No terminating instruction");
Py_UNREACHABLE();
}
#endif
static int
uop_optimize(
_PyOptimizerObject *self,
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr,
int curr_stackentries,
bool progress_needed)
{
_PyBloomFilter dependencies;
_Py_BloomFilter_Init(&dependencies);
_PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
OPT_STAT_INC(attempts);
int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies, progress_needed);
if (length <= 0) {
// Error or nothing translated
return length;
}
assert(length < UOP_MAX_TRACE_LENGTH);
OPT_STAT_INC(traces_created);
char *env_var = Py_GETENV("PYTHON_UOPS_OPTIMIZE");
if (env_var == NULL || *env_var == '\0' || *env_var > '0') {
length = _Py_uop_analyze_and_optimize(frame, buffer,
length,
curr_stackentries, &dependencies);
if (length <= 0) {
return length;
}
}
assert(length < UOP_MAX_TRACE_LENGTH);
assert(length >= 1);
/* Fix up */
for (int pc = 0; pc < length; pc++) {
int opcode = buffer[pc].opcode;
int oparg = buffer[pc].oparg;
if (_PyUop_Flags[opcode] & HAS_OPARG_AND_1_FLAG) {
buffer[pc].opcode = opcode + 1 + (oparg & 1);
}
else if (oparg < _PyUop_Replication[opcode]) {
buffer[pc].opcode = opcode + oparg + 1;
}
else if (is_terminator(&buffer[pc])) {
break;
}
assert(_PyOpcode_uop_name[buffer[pc].opcode]);
assert(strncmp(_PyOpcode_uop_name[buffer[pc].opcode], _PyOpcode_uop_name[opcode], strlen(_PyOpcode_uop_name[opcode])) == 0);
}
OPT_HIST(effective_trace_length(buffer, length), optimized_trace_length_hist);
length = prepare_for_execution(buffer, length);
assert(length <= UOP_MAX_TRACE_LENGTH);
_PyExecutorObject *executor = make_executor_from_uops(buffer, length, &dependencies);
if (executor == NULL) {
return -1;
}
assert(length <= UOP_MAX_TRACE_LENGTH);
*exec_ptr = executor;
return 1;
}
static void
uop_opt_dealloc(PyObject *self) {
PyObject_Free(self);
}
PyTypeObject _PyUOpOptimizer_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "uop_optimizer",
.tp_basicsize = sizeof(_PyOptimizerObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
.tp_dealloc = uop_opt_dealloc,
};
PyObject *
_PyOptimizer_NewUOpOptimizer(void)
{
_PyOptimizerObject *opt = PyObject_New(_PyOptimizerObject, &_PyUOpOptimizer_Type);
if (opt == NULL) {
return NULL;
}
opt->optimize = uop_optimize;
return (PyObject *)opt;
}
static void
counter_dealloc(_PyExecutorObject *self) {
/* The optimizer is the operand of the second uop. */
PyObject *opt = (PyObject *)self->trace[1].operand0;
Py_DECREF(opt);
uop_dealloc(self);
}
PyTypeObject _PyCounterExecutor_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "counting_executor",
.tp_basicsize = offsetof(_PyExecutorObject, exits),
.tp_itemsize = 1,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION | Py_TPFLAGS_HAVE_GC,
.tp_dealloc = (destructor)counter_dealloc,
.tp_methods = executor_methods,
.tp_traverse = executor_traverse,
.tp_clear = (inquiry)executor_clear,
};
static int
counter_optimize(
_PyOptimizerObject* self,
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr,
int Py_UNUSED(curr_stackentries),
bool Py_UNUSED(progress_needed)
)
{
PyCodeObject *code = _PyFrame_GetCode(frame);
int oparg = instr->op.arg;
while (instr->op.code == EXTENDED_ARG) {
instr++;
oparg = (oparg << 8) | instr->op.arg;
}
if (instr->op.code != JUMP_BACKWARD) {
/* Counter optimizer can only handle backward edges */
return 0;
}
_Py_CODEUNIT *target = instr + 1 + _PyOpcode_Caches[JUMP_BACKWARD] - oparg;
_PyUOpInstruction buffer[4] = {
{ .opcode = _START_EXECUTOR, .jump_target = 3, .format=UOP_FORMAT_JUMP },
{ .opcode = _LOAD_CONST_INLINE, .operand0 = (uintptr_t)self },
{ .opcode = _INTERNAL_INCREMENT_OPT_COUNTER },
{ .opcode = _EXIT_TRACE, .target = (uint32_t)(target - _PyCode_CODE(code)), .format=UOP_FORMAT_TARGET }
};
_PyExecutorObject *executor = make_executor_from_uops(buffer, 4, &EMPTY_FILTER);
if (executor == NULL) {
return -1;
}
Py_INCREF(self);
Py_SET_TYPE(executor, &_PyCounterExecutor_Type);
*exec_ptr = executor;
return 1;
}
static PyObject *
counter_get_counter(PyObject *self, PyObject *args)
{
return PyLong_FromLongLong(((_PyCounterOptimizerObject *)self)->count);
}
static PyMethodDef counter_optimizer_methods[] = {
{ "get_count", counter_get_counter, METH_NOARGS, NULL },
{ NULL, NULL },
};
PyTypeObject _PyCounterOptimizer_Type = {
PyVarObject_HEAD_INIT(&PyType_Type, 0)
.tp_name = "Counter optimizer",
.tp_basicsize = sizeof(_PyCounterOptimizerObject),
.tp_itemsize = 0,
.tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION,
.tp_methods = counter_optimizer_methods,
.tp_dealloc = (destructor)PyObject_Free,
};
PyObject *
_PyOptimizer_NewCounter(void)
{
_PyCounterOptimizerObject *opt = (_PyCounterOptimizerObject *)_PyObject_New(&_PyCounterOptimizer_Type);
if (opt == NULL) {
return NULL;
}
opt->base.optimize = counter_optimize;
opt->count = 0;
return (PyObject *)opt;
}
/*****************************************
* Executor management
****************************************/
/* We use a bloomfilter with k = 6, m = 256
* The choice of k and the following constants
* could do with a more rigorous analysis,
* but here is a simple analysis:
*
* We want to keep the false positive rate low.
* For n = 5 (a trace depends on 5 objects),
* we expect 30 bits set, giving a false positive
* rate of (30/256)**6 == 2.5e-6 which is plenty
* good enough.
*
* However with n = 10 we expect 60 bits set (worst case),
* giving a false positive of (60/256)**6 == 0.0001
*
* We choose k = 6, rather than a higher number as
* it means the false positive rate grows slower for high n.
*
* n = 5, k = 6 => fp = 2.6e-6
* n = 5, k = 8 => fp = 3.5e-7
* n = 10, k = 6 => fp = 1.6e-4
* n = 10, k = 8 => fp = 0.9e-4
* n = 15, k = 6 => fp = 0.18%
* n = 15, k = 8 => fp = 0.23%
* n = 20, k = 6 => fp = 1.1%
* n = 20, k = 8 => fp = 2.3%
*
* The above analysis assumes perfect hash functions,
* but those don't exist, so the real false positive
* rates may be worse.
*/
#define K 6
#define SEED 20221211
/* TO DO -- Use more modern hash functions with better distribution of bits */
static uint64_t
address_to_hash(void *ptr) {
assert(ptr != NULL);
uint64_t uhash = SEED;
uintptr_t addr = (uintptr_t)ptr;
for (int i = 0; i < SIZEOF_VOID_P; i++) {
uhash ^= addr & 255;
uhash *= (uint64_t)PyHASH_MULTIPLIER;
addr >>= 8;
}
return uhash;
}
void
_Py_BloomFilter_Init(_PyBloomFilter *bloom)
{
for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
bloom->bits[i] = 0;
}
}
/* We want K hash functions that each set 1 bit.
* A hash function that sets 1 bit in M bits can be trivially
* derived from a log2(M) bit hash function.
* So we extract 8 (log2(256)) bits at a time from
* the 64bit hash. */
void
_Py_BloomFilter_Add(_PyBloomFilter *bloom, void *ptr)
{
uint64_t hash = address_to_hash(ptr);
assert(K <= 8);
for (int i = 0; i < K; i++) {
uint8_t bits = hash & 255;
bloom->bits[bits >> 5] |= (1 << (bits&31));
hash >>= 8;
}
}
static bool
bloom_filter_may_contain(_PyBloomFilter *bloom, _PyBloomFilter *hashes)
{
for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
if ((bloom->bits[i] & hashes->bits[i]) != hashes->bits[i]) {
return false;
}
}
return true;
}
static void
link_executor(_PyExecutorObject *executor)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
_PyExecutorLinkListNode *links = &executor->vm_data.links;
_PyExecutorObject *head = interp->executor_list_head;
if (head == NULL) {
interp->executor_list_head = executor;
links->previous = NULL;
links->next = NULL;
}
else {
assert(head->vm_data.links.previous == NULL);
links->previous = NULL;
links->next = head;
head->vm_data.links.previous = executor;
interp->executor_list_head = executor;
}
executor->vm_data.linked = true;
/* executor_list_head must be first in list */
assert(interp->executor_list_head->vm_data.links.previous == NULL);
}
static void
unlink_executor(_PyExecutorObject *executor)
{
if (!executor->vm_data.linked) {
return;
}
_PyExecutorLinkListNode *links = &executor->vm_data.links;
assert(executor->vm_data.valid);
_PyExecutorObject *next = links->next;
_PyExecutorObject *prev = links->previous;
if (next != NULL) {
next->vm_data.links.previous = prev;
}
if (prev != NULL) {
prev->vm_data.links.next = next;
}
else {
// prev == NULL implies that executor is the list head
PyInterpreterState *interp = PyInterpreterState_Get();
assert(interp->executor_list_head == executor);
interp->executor_list_head = next;
}
executor->vm_data.linked = false;
}
/* This must be called by optimizers before using the executor */
void
_Py_ExecutorInit(_PyExecutorObject *executor, const _PyBloomFilter *dependency_set)
{
executor->vm_data.valid = true;
for (int i = 0; i < _Py_BLOOM_FILTER_WORDS; i++) {
executor->vm_data.bloom.bits[i] = dependency_set->bits[i];
}
link_executor(executor);
}
/* Detaches the executor from the code object (if any) that
* holds a reference to it */
void
_Py_ExecutorDetach(_PyExecutorObject *executor)
{
PyCodeObject *code = executor->vm_data.code;
if (code == NULL) {
return;
}
_Py_CODEUNIT *instruction = &_PyCode_CODE(code)[executor->vm_data.index];
assert(instruction->op.code == ENTER_EXECUTOR);
int index = instruction->op.arg;
assert(code->co_executors->executors[index] == executor);
instruction->op.code = executor->vm_data.opcode;
instruction->op.arg = executor->vm_data.oparg;
executor->vm_data.code = NULL;
code->co_executors->executors[index] = NULL;
Py_DECREF(executor);
}
static int
executor_clear(_PyExecutorObject *executor)
{
if (!executor->vm_data.valid) {
return 0;
}
assert(executor->vm_data.valid == 1);
unlink_executor(executor);
executor->vm_data.valid = 0;
/* It is possible for an executor to form a reference
* cycle with itself, so decref'ing a side exit could
* free the executor unless we hold a strong reference to it
*/
Py_INCREF(executor);
for (uint32_t i = 0; i < executor->exit_count; i++) {
executor->exits[i].temperature = initial_unreachable_backoff_counter();
Py_CLEAR(executor->exits[i].executor);
}
_Py_ExecutorDetach(executor);
Py_DECREF(executor);
return 0;
}
void
_Py_Executor_DependsOn(_PyExecutorObject *executor, void *obj)
{
assert(executor->vm_data.valid);
_Py_BloomFilter_Add(&executor->vm_data.bloom, obj);
}
/* Invalidate all executors that depend on `obj`
* May cause other executors to be invalidated as well
*/
void
_Py_Executors_InvalidateDependency(PyInterpreterState *interp, void *obj, int is_invalidation)
{
_PyBloomFilter obj_filter;
_Py_BloomFilter_Init(&obj_filter);
_Py_BloomFilter_Add(&obj_filter, obj);
/* Walk the list of executors */
/* TO DO -- Use a tree to avoid traversing as many objects */
PyObject *invalidate = PyList_New(0);
if (invalidate == NULL) {
goto error;
}
/* Clearing an executor can deallocate others, so we need to make a list of
* executors to invalidate first */
for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
assert(exec->vm_data.valid);
_PyExecutorObject *next = exec->vm_data.links.next;
if (bloom_filter_may_contain(&exec->vm_data.bloom, &obj_filter) &&
PyList_Append(invalidate, (PyObject *)exec))
{
goto error;
}
exec = next;
}
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
_PyExecutorObject *exec = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i);
executor_clear(exec);
if (is_invalidation) {
OPT_STAT_INC(executors_invalidated);
}
}
Py_DECREF(invalidate);
return;
error:
PyErr_Clear();
Py_XDECREF(invalidate);
// If we're truly out of memory, wiping out everything is a fine fallback:
_Py_Executors_InvalidateAll(interp, is_invalidation);
}
/* Invalidate all executors */
void
_Py_Executors_InvalidateAll(PyInterpreterState *interp, int is_invalidation)
{
while (interp->executor_list_head) {
_PyExecutorObject *executor = interp->executor_list_head;
assert(executor->vm_data.valid == 1 && executor->vm_data.linked == 1);
if (executor->vm_data.code) {
// Clear the entire code object so its co_executors array be freed:
_PyCode_Clear_Executors(executor->vm_data.code);
}
else {
executor_clear(executor);
}
if (is_invalidation) {
OPT_STAT_INC(executors_invalidated);
}
}
}
void
_Py_Executors_InvalidateCold(PyInterpreterState *interp)
{
/* Walk the list of executors */
/* TO DO -- Use a tree to avoid traversing as many objects */
PyObject *invalidate = PyList_New(0);
if (invalidate == NULL) {
goto error;
}
/* Clearing an executor can deallocate others, so we need to make a list of
* executors to invalidate first */
for (_PyExecutorObject *exec = interp->executor_list_head; exec != NULL;) {
assert(exec->vm_data.valid);
_PyExecutorObject *next = exec->vm_data.links.next;
if (!exec->vm_data.warm && PyList_Append(invalidate, (PyObject *)exec) < 0) {
goto error;
}
else {
exec->vm_data.warm = false;
}
exec = next;
}
for (Py_ssize_t i = 0; i < PyList_GET_SIZE(invalidate); i++) {
_PyExecutorObject *exec = (_PyExecutorObject *)PyList_GET_ITEM(invalidate, i);
executor_clear(exec);
}
Py_DECREF(invalidate);
return;
error:
PyErr_Clear();
Py_XDECREF(invalidate);
// If we're truly out of memory, wiping out everything is a fine fallback
_Py_Executors_InvalidateAll(interp, 0);
}
#endif /* _Py_TIER2 */