GH-111646: Simplify optimizer, by compacting uops when making executor. (GH-111647)

Mark Shannon, 2023-11-06 11:28:52 +00:00, committed by GitHub
parent c8faa3568a
commit d78c872e0d
2 changed files with 119 additions and 117 deletions

Python/optimizer.c

@@ -384,34 +384,12 @@ PyTypeObject _PyUOpExecutor_Type = {
     .tp_methods = executor_methods,
 };
 
-static int
-move_stubs(
-    _PyUOpInstruction *trace,
-    int trace_length,
-    int stubs_start,
-    int stubs_end
-)
-{
-    memmove(trace + trace_length,
-            trace + stubs_start,
-            (stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
-    // Patch up the jump targets
-    for (int i = 0; i < trace_length; i++) {
-        if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
-            trace[i].opcode == _POP_JUMP_IF_TRUE)
-        {
-            int target = trace[i].oparg;
-            if (target >= stubs_start) {
-                target += trace_length - stubs_start;
-                trace[i].oparg = target;
-            }
-        }
-    }
-    return trace_length + stubs_end - stubs_start;
-}
-
 #define TRACE_STACK_SIZE 5
 
+/* Returns 1 on success,
+ * 0 if it failed to produce a worthwhile trace,
+ * and -1 on an error.
+ */
 static int
 translate_bytecode_to_trace(
     PyCodeObject *code,
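For context on what is being removed: move_stubs existed because side-exit stubs were generated at the far end of the trace buffer, so they had to be slid down to sit right after the main trace, with every jump oparg that pointed into the stub area shifted by the size of the closed gap. Below is a toy restatement of that index arithmetic, using a hypothetical Inst type and opcode names rather than CPython's real ones:

    #include <assert.h>
    #include <string.h>

    enum { OP, POP_JUMP_IF_FALSE };
    typedef struct { int opcode; int oparg; } Inst;

    /* Slide the stubs in [stubs_start, stubs_end) down to start at
     * trace_length, shifting jump targets by the distance moved. */
    static int
    move_stubs_toy(Inst *trace, int trace_length, int stubs_start, int stubs_end)
    {
        memmove(trace + trace_length, trace + stubs_start,
                (stubs_end - stubs_start) * sizeof(Inst));
        for (int i = 0; i < trace_length; i++) {
            if (trace[i].opcode == POP_JUMP_IF_FALSE &&
                trace[i].oparg >= stubs_start)
            {
                trace[i].oparg += trace_length - stubs_start;
            }
        }
        return trace_length + stubs_end - stubs_start;
    }

    int
    main(void)
    {
        /* A 4-uop main trace and one stub at slot 14 of a 16-slot buffer. */
        Inst trace[16] = { [0] = {POP_JUMP_IF_FALSE, 14}, [14] = {OP, 0} };
        int new_len = move_stubs_toy(trace, 4, 14, 15);
        assert(new_len == 5);          /* 4 + (15 - 14) */
        assert(trace[0].oparg == 4);   /* 14 + (4 - 14): stub now at slot 4 */
        return 0;
    }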
@@ -790,7 +768,7 @@ done:
     }
     assert(code == initial_code);
     // Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
-    if (trace_length > 3) {
+    if (trace_length > 4) {
         ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
         DPRINTF(1,
                 "Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,25 +778,8 @@ done:
                 2 * INSTR_IP(initial_instr, code),
                 trace_length,
                 buffer_size - max_length);
-        if (max_length < buffer_size) {
-            // There are stubs
-            if (trace_length < max_length) {
-                // There's a gap before the stubs
-                // Move the stubs back to be immediately after the main trace
-                // (which ends at trace_length)
-                DPRINTF(2,
-                        "Moving %d stub uops back by %d\n",
-                        buffer_size - max_length,
-                        max_length - trace_length);
-                trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
-            }
-            else {
-                assert(trace_length == max_length);
-                // There's no gap
-                trace_length = buffer_size;
-            }
-        }
-        return trace_length;
+        OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist);
+        return 1;
     }
     else {
         OPT_STAT_INC(trace_too_short);
@@ -838,70 +799,84 @@ done:
 #undef DPRINTF
 }
 
+#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
+#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
+#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
+
+/* Count the number of used uops, and mark them in the bit vector `used`.
+ * This can be done in a single pass using simple reachability analysis,
+ * as there are no backward jumps.
+ * NOPs are excluded from the count.
+ */
 static int
-remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
+compute_used(_PyUOpInstruction *buffer, uint32_t *used)
 {
-    // Stage 1: Replace unneeded _SET_IP uops with NOP.
-    // Note that we don't enter stubs, those SET_IPs are needed.
-    int last_set_ip = -1;
-    int last_instr = 0;
-    bool need_ip = true;
-    for (int pc = 0; pc < trace_length; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode == _SET_IP) {
-            if (!need_ip && last_set_ip >= 0) {
-                trace[last_set_ip].opcode = NOP;
-            }
-            need_ip = false;
-            last_set_ip = pc;
-        }
-        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
-            last_instr = pc + 1;
-            break;
-        }
-        else {
-            // If opcode has ERROR or DEOPT, set need_ip to true
-            if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
-                need_ip = true;
-            }
-        }
-    }
-    // Stage 2: Squash NOP opcodes (pre-existing or set above).
-    int dest = 0;
-    for (int pc = 0; pc < last_instr; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode != NOP) {
-            if (pc != dest) {
-                trace[dest] = trace[pc];
-            }
-            dest++;
-        }
-    }
-    // Stage 3: Move the stubs back.
-    if (dest < last_instr) {
-        int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
-#ifdef Py_DEBUG
-        char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
-        int lltrace = 0;
-        if (python_lltrace != NULL && *python_lltrace >= '0') {
-            lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
-        }
-        if (lltrace >= 2) {
-            printf("Optimized trace (length %d+%d = %d, saved %d):\n",
-                   dest, trace_length - last_instr, new_trace_length,
-                   trace_length - new_trace_length);
-            for (int pc = 0; pc < new_trace_length; pc++) {
-                printf("%4d: (%s, %d, %" PRIu64 ")\n",
-                       pc,
-                       uop_name(trace[pc].opcode),
-                       (trace[pc].oparg),
-                       (uint64_t)(trace[pc].operand));
-            }
-        }
-#endif
-        trace_length = new_trace_length;
-    }
-    return trace_length;
+    int count = 0;
+    SET_BIT(used, 0);
+    for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
+        }
+        count++;
+        int opcode = buffer[i].opcode;
+        if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+            continue;
+        }
+        /* All other micro-ops fall through, so i+1 is reachable */
+        SET_BIT(used, i+1);
+        switch(opcode) {
+            case NOP:
+                /* Don't count NOPs as used */
+                count--;
+                UNSET_BIT(used, i);
+                break;
+            case _POP_JUMP_IF_FALSE:
+            case _POP_JUMP_IF_TRUE:
+            {
+                /* Mark target as reachable */
+                SET_BIT(used, buffer[i].oparg);
+            }
+        }
+    }
+    return count;
+}
+
+/* Makes an executor from a buffer of uops.
+ * Account for the buffer having gaps and NOPs by computing a "used"
+ * bit vector and only copying the used uops. Here "used" means reachable
+ * and not a NOP.
+ */
+static _PyExecutorObject *
+make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
+{
+    uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
+    int length = compute_used(buffer, used);
+    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
+    if (executor == NULL) {
+        return NULL;
+    }
+    int dest = length - 1;
+    /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
+    for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
+        }
+        executor->trace[dest] = buffer[i];
+        int opcode = buffer[i].opcode;
+        if (opcode == _POP_JUMP_IF_FALSE ||
+            opcode == _POP_JUMP_IF_TRUE)
+        {
+            /* The oparg of the target will already have been set to its new offset */
+            int oparg = executor->trace[dest].oparg;
+            executor->trace[dest].oparg = buffer[oparg].oparg;
+        }
+        /* Set the oparg to be the destination offset,
+         * so that we can set the oparg of earlier jumps correctly. */
+        buffer[i].oparg = dest;
+        dest--;
+    }
+    assert(dest == -1);
+    executor->base.execute = _PyUopExecute;
+    _Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
+    return (_PyExecutorObject *)executor;
 }
 
 static int
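The two functions above are the heart of the change: compaction now happens once, when the executor is built. compute_used does a forward reachability pass (sound because traces only jump forwards) and clears NOPs; make_executor_from_uops then copies backwards, so each jump target is copied before the jump that refers to it, which lets the target's slot in the source buffer temporarily hold its new offset for retargeting. Below is a self-contained sketch of that mark-then-compact technique, with a made-up Inst type and opcodes standing in for _PyUOpInstruction and the real uops:

    #include <assert.h>
    #include <stdint.h>

    #define MAX_LEN 16
    #define UNSET_BIT(a, b)  ((a)[(b)>>5] &= ~(1u<<((b)&31)))
    #define SET_BIT(a, b)    ((a)[(b)>>5] |=  (1u<<((b)&31)))
    #define BIT_IS_SET(a, b) ((a)[(b)>>5] &   (1u<<((b)&31)))

    enum { NOP, OP, JUMP_IF, EXIT };
    typedef struct { int opcode; int oparg; } Inst;

    /* Forward pass: mark reachable, non-NOP slots and count them. */
    static int
    mark_used(Inst *buf, uint32_t *used)
    {
        int count = 0;
        SET_BIT(used, 0);
        for (int i = 0; i < MAX_LEN; i++) {
            if (!BIT_IS_SET(used, i)) {
                continue;
            }
            count++;
            if (buf[i].opcode == EXIT) {
                continue;                /* does not fall through */
            }
            SET_BIT(used, i + 1);        /* everything else falls through */
            if (buf[i].opcode == NOP) {
                count--;                 /* reachable, but not kept */
                UNSET_BIT(used, i);
            }
            else if (buf[i].opcode == JUMP_IF) {
                SET_BIT(used, buf[i].oparg);  /* forward target reachable */
            }
        }
        return count;
    }

    /* Backward pass: copy used slots to the front, fixing jump opargs.
     * A jump's target is copied first, and its slot in `buf` is reused
     * to record the target's new offset. */
    static void
    compact(Inst *buf, uint32_t *used, Inst *out, int length)
    {
        int dest = length - 1;
        for (int i = MAX_LEN - 1; i >= 0; i--) {
            if (!BIT_IS_SET(used, i)) {
                continue;
            }
            out[dest] = buf[i];
            if (buf[i].opcode == JUMP_IF) {
                out[dest].oparg = buf[out[dest].oparg].oparg;
            }
            buf[i].oparg = dest;         /* publish this slot's new offset */
            dest--;
        }
        assert(dest == -1);
    }

    int
    main(void)
    {
        /* Main trace with a NOP, a gap, and a stub EXIT at slot 5. */
        Inst buf[MAX_LEN] = {
            {JUMP_IF, 5}, {OP, 0}, {NOP, 0}, {EXIT, 0}, [5] = {EXIT, 0},
        };
        uint32_t used[(MAX_LEN + 31)/32] = { 0 };
        int length = mark_used(buf, used);
        assert(length == 4);             /* NOP dropped, stub kept */
        Inst out[MAX_LEN];
        compact(buf, used, out, length);
        assert(out[0].opcode == JUMP_IF && out[0].oparg == 3);
        assert(out[3].opcode == EXIT);   /* stub, now right after the trace */
        return 0;
    }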
@@ -914,28 +889,26 @@ uop_optimize(
 {
     _PyBloomFilter dependencies;
     _Py_BloomFilter_Init(&dependencies);
-    _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
-    int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
-    if (trace_length <= 0) {
+    _PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
+    int err = translate_bytecode_to_trace(code, instr, buffer, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
+    if (err <= 0) {
         // Error or nothing translated
-        return trace_length;
+        return err;
     }
-    OPT_HIST(trace_length, trace_length_hist);
     OPT_STAT_INC(traces_created);
     char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
-    if (uop_optimize != NULL && *uop_optimize > '0') {
-        trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
+    if (uop_optimize == NULL || *uop_optimize > '0') {
+        err = _Py_uop_analyze_and_optimize(code, buffer, _Py_UOP_MAX_TRACE_LENGTH, curr_stackentries);
+        if (err < 0) {
+            return -1;
+        }
     }
-    trace_length = remove_unneeded_uops(trace, trace_length);
-    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
+    _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
     if (executor == NULL) {
         return -1;
     }
-    OPT_HIST(trace_length, optimized_trace_length_hist);
-    executor->base.execute = _PyUopExecute;
-    memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
-    _Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
-    *exec_ptr = (_PyExecutorObject *)executor;
+    OPT_HIST(Py_SIZE(executor), optimized_trace_length_hist);
+    *exec_ptr = executor;
     return 1;
 }
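Note that the PYTHONUOPSOPTIMIZE test in uop_optimize is inverted as well: the analysis pass used to run only when the variable was set to a non-zero value; it now runs by default and is skipped only when the variable is explicitly set to '0' (or any character not greater than '0'). A minimal illustration of the two predicates:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>

    static bool old_should_optimize(const char *v) { return v != NULL && *v > '0'; }
    static bool new_should_optimize(const char *v) { return v == NULL || *v > '0'; }

    int
    main(void)
    {
        assert(!old_should_optimize(NULL) && new_should_optimize(NULL)); /* unset: now on */
        assert(!old_should_optimize("0") && !new_should_optimize("0"));  /* "0": off in both */
        assert( old_should_optimize("1") && new_should_optimize("1"));   /* "1": on in both */
        return 0;
    }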

Python/optimizer_analysis.c

@@ -13,13 +13,42 @@
 
 #include "pycore_optimizer.h"
 
+static void
+remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size)
+{
+    // Note that we don't enter stubs, those SET_IPs are needed.
+    int last_set_ip = -1;
+    bool need_ip = true;
+    for (int pc = 0; pc < buffer_size; pc++) {
+        int opcode = buffer[pc].opcode;
+        if (opcode == _SET_IP) {
+            if (!need_ip && last_set_ip >= 0) {
+                buffer[last_set_ip].opcode = NOP;
+            }
+            need_ip = false;
+            last_set_ip = pc;
+        }
+        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+            break;
+        }
+        else {
+            // If opcode has ERROR or DEOPT, set need_ip to true
+            if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
+                need_ip = true;
+            }
+        }
+    }
+}
+
 int
 _Py_uop_analyze_and_optimize(
     PyCodeObject *co,
-    _PyUOpInstruction *trace,
-    int trace_len,
+    _PyUOpInstruction *buffer,
+    int buffer_size,
     int curr_stacklen
 )
 {
-    return trace_len;
+    remove_unneeded_uops(buffer, buffer_size);
+    return 0;
 }