GH-111646: Simplify optimizer, by compacting uops when making executor. (GH-111647)
parent c8faa3568a
commit d78c872e0d

2 changed files with 119 additions and 117 deletions

Summary: the translator no longer compacts its own output. move_stubs() and the three-stage remove_unneeded_uops() in Python/optimizer.c are deleted; instead, executor creation marks the reachable, non-NOP uops with compute_used() and copies only those into the executor in make_executor_from_uops(). The _SET_IP-thinning pass moves to Python/optimizer_analysis.c, whose _Py_uop_analyze_and_optimize() now does real work.
Python/optimizer.c

@@ -384,34 +384,12 @@ PyTypeObject _PyUOpExecutor_Type = {
     .tp_methods = executor_methods,
 };
 
-static int
-move_stubs(
-    _PyUOpInstruction *trace,
-    int trace_length,
-    int stubs_start,
-    int stubs_end
-)
-{
-    memmove(trace + trace_length,
-            trace + stubs_start,
-            (stubs_end - stubs_start) * sizeof(_PyUOpInstruction));
-    // Patch up the jump targets
-    for (int i = 0; i < trace_length; i++) {
-        if (trace[i].opcode == _POP_JUMP_IF_FALSE ||
-            trace[i].opcode == _POP_JUMP_IF_TRUE)
-        {
-            int target = trace[i].oparg;
-            if (target >= stubs_start) {
-                target += trace_length - stubs_start;
-                trace[i].oparg = target;
-            }
-        }
-    }
-    return trace_length + stubs_end - stubs_start;
-}
-
 #define TRACE_STACK_SIZE 5
 
+/* Returns 1 on success,
+ * 0 if it failed to produce a worthwhile trace,
+ * and -1 on an error.
+ */
 static int
 translate_bytecode_to_trace(
     PyCodeObject *code,
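The removed move_stubs() did two things: slide the stub block down with memmove() so it sits immediately after the main trace, and rebase every conditional-jump oparg that pointed into the moved region. A minimal standalone sketch of that technique (the Inst struct and OP_* opcodes are invented for the demo, not CPython's types):

#include <stdio.h>
#include <string.h>

/* Toy instruction: a real _PyUOpInstruction also carries an operand. */
typedef struct { int opcode; int oparg; } Inst;
enum { OP_PLAIN, OP_JUMP };  /* hypothetical opcodes for the demo */

/* Same idea as the removed move_stubs(): slide [stubs_start, stubs_end)
 * down to trace_length, then shift every jump target that pointed into
 * the stub region by the same distance. */
static int demo_move_stubs(Inst *trace, int trace_length,
                           int stubs_start, int stubs_end)
{
    memmove(trace + trace_length, trace + stubs_start,
            (stubs_end - stubs_start) * sizeof(Inst));
    for (int i = 0; i < trace_length; i++) {
        if (trace[i].opcode == OP_JUMP && trace[i].oparg >= stubs_start) {
            trace[i].oparg += trace_length - stubs_start;
        }
    }
    return trace_length + stubs_end - stubs_start;
}

int main(void)
{
    /* 3-uop trace, gap at [3, 6), one stub at index 6 */
    Inst trace[8] = {
        {OP_PLAIN, 0}, {OP_JUMP, 6}, {OP_PLAIN, 0},
        {0, 0}, {0, 0}, {0, 0},
        {OP_PLAIN, 99},
    };
    int len = demo_move_stubs(trace, 3, 6, 7);
    printf("new length %d, jump now targets %d\n", len, trace[1].oparg);  /* 4, 3 */
    return 0;
}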
@@ -790,7 +768,7 @@ done:
     }
     assert(code == initial_code);
     // Skip short traces like _SET_IP, LOAD_FAST, _SET_IP, _EXIT_TRACE
-    if (trace_length > 3) {
+    if (trace_length > 4) {
         ADD_TO_TRACE(_EXIT_TRACE, 0, 0);
         DPRINTF(1,
                 "Created a trace for %s (%s:%d) at byte offset %d -- length %d+%d\n",
@@ -800,25 +778,8 @@ done:
                 2 * INSTR_IP(initial_instr, code),
                 trace_length,
                 buffer_size - max_length);
-        if (max_length < buffer_size) {
-            // There are stubs
-            if (trace_length < max_length) {
-                // There's a gap before the stubs
-                // Move the stubs back to be immediately after the main trace
-                // (which ends at trace_length)
-                DPRINTF(2,
-                        "Moving %d stub uops back by %d\n",
-                        buffer_size - max_length,
-                        max_length - trace_length);
-                trace_length = move_stubs(trace, trace_length, max_length, buffer_size);
-            }
-            else {
-                assert(trace_length == max_length);
-                // There's no gap
-                trace_length = buffer_size;
-            }
-        }
-        return trace_length;
+        OPT_HIST(trace_length + buffer_size - max_length, trace_length_hist);
+        return 1;
     }
     else {
         OPT_STAT_INC(trace_too_short);
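With move_stubs() gone, translate_bytecode_to_trace() leaves its buffer two-ended: the main trace grows up from index 0, stub code grows down from the end, and the gap between them is only squeezed out later, when the executor is built. A toy illustration of that layout arithmetic (variable names mirror the locals above; the values are invented):

#include <stdio.h>

/* Toy model of the two-ended uop buffer left behind by translation. */
int main(void)
{
    int buffer_size = 16;   /* whole buffer */
    int trace_length = 5;   /* main trace occupies [0, 5) */
    int max_length = 12;    /* stubs occupy [12, 16) */

    /* This is what the new OPT_HIST/DPRINTF lines report: the trace plus
     * its stubs, with the gap [5, 12) still in place for the compactor. */
    printf("length %d+%d, gap of %d uops left in place\n",
           trace_length, buffer_size - max_length, max_length - trace_length);
    return 0;
}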
@@ -838,70 +799,84 @@ done:
 #undef DPRINTF
 }
 
+#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
+#define SET_BIT(array, bit) (array[(bit)>>5] |= (1<<((bit)&31)))
+#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))
+
+/* Count the number of used uops, and mark them in the bit vector `used`.
+ * This can be done in a single pass using simple reachability analysis,
+ * as there are no backward jumps.
+ * NOPs are excluded from the count.
+ */
 static int
-remove_unneeded_uops(_PyUOpInstruction *trace, int trace_length)
-{
-    // Stage 1: Replace unneeded _SET_IP uops with NOP.
-    // Note that we don't enter stubs, those SET_IPs are needed.
-    int last_set_ip = -1;
-    int last_instr = 0;
-    bool need_ip = true;
-    for (int pc = 0; pc < trace_length; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode == _SET_IP) {
-            if (!need_ip && last_set_ip >= 0) {
-                trace[last_set_ip].opcode = NOP;
-            }
-            need_ip = false;
-            last_set_ip = pc;
-        }
-        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
-            last_instr = pc + 1;
-            break;
-        }
-        else {
-            // If opcode has ERROR or DEOPT, set need_ip to true
-            if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
-                need_ip = true;
-            }
-        }
-    }
-    // Stage 2: Squash NOP opcodes (pre-existing or set above).
-    int dest = 0;
-    for (int pc = 0; pc < last_instr; pc++) {
-        int opcode = trace[pc].opcode;
-        if (opcode != NOP) {
-            if (pc != dest) {
-                trace[dest] = trace[pc];
-            }
-            dest++;
-        }
-    }
-    // Stage 3: Move the stubs back.
-    if (dest < last_instr) {
-        int new_trace_length = move_stubs(trace, dest, last_instr, trace_length);
-#ifdef Py_DEBUG
-        char *python_lltrace = Py_GETENV("PYTHON_LLTRACE");
-        int lltrace = 0;
-        if (python_lltrace != NULL && *python_lltrace >= '0') {
-            lltrace = *python_lltrace - '0';  // TODO: Parse an int and all that
-        }
-        if (lltrace >= 2) {
-            printf("Optimized trace (length %d+%d = %d, saved %d):\n",
-                   dest, trace_length - last_instr, new_trace_length,
-                   trace_length - new_trace_length);
-            for (int pc = 0; pc < new_trace_length; pc++) {
-                printf("%4d: (%s, %d, %" PRIu64 ")\n",
-                       pc,
-                       uop_name(trace[pc].opcode),
-                       (trace[pc].oparg),
-                       (uint64_t)(trace[pc].operand));
-            }
-        }
-#endif
-        trace_length = new_trace_length;
-    }
-    return trace_length;
-}
+compute_used(_PyUOpInstruction *buffer, uint32_t *used)
+{
+    int count = 0;
+    SET_BIT(used, 0);
+    for (int i = 0; i < _Py_UOP_MAX_TRACE_LENGTH; i++) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
+        }
+        count++;
+        int opcode = buffer[i].opcode;
+        if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+            continue;
+        }
+        /* All other micro-ops fall through, so i+1 is reachable */
+        SET_BIT(used, i+1);
+        switch(opcode) {
+            case NOP:
+                /* Don't count NOPs as used */
+                count--;
+                UNSET_BIT(used, i);
+                break;
+            case _POP_JUMP_IF_FALSE:
+            case _POP_JUMP_IF_TRUE:
+                /* Mark target as reachable */
+                SET_BIT(used, buffer[i].oparg);
+        }
+    }
+    return count;
+}
+
+/* Makes an executor from a buffer of uops.
+ * Account for the buffer having gaps and NOPs by computing a "used"
+ * bit vector and only copying the used uops. Here "used" means reachable
+ * and not a NOP.
+ */
+static _PyExecutorObject *
+make_executor_from_uops(_PyUOpInstruction *buffer, _PyBloomFilter *dependencies)
+{
+    uint32_t used[(_Py_UOP_MAX_TRACE_LENGTH + 31)/32] = { 0 };
+    int length = compute_used(buffer, used);
+    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, length);
+    if (executor == NULL) {
+        return NULL;
+    }
+    int dest = length - 1;
+    /* Scan backwards, so that we see the destinations of jumps before the jumps themselves. */
+    for (int i = _Py_UOP_MAX_TRACE_LENGTH-1; i >= 0; i--) {
+        if (!BIT_IS_SET(used, i)) {
+            continue;
+        }
+        executor->trace[dest] = buffer[i];
+        int opcode = buffer[i].opcode;
+        if (opcode == _POP_JUMP_IF_FALSE ||
+            opcode == _POP_JUMP_IF_TRUE)
+        {
+            /* The oparg of the target will already have been set to its new offset */
+            int oparg = executor->trace[dest].oparg;
+            executor->trace[dest].oparg = buffer[oparg].oparg;
+        }
+        /* Set the oparg to be the destination offset,
+         * so that we can set the oparg of earlier jumps correctly. */
+        buffer[i].oparg = dest;
+        dest--;
+    }
+    assert(dest == -1);
+    executor->base.execute = _PyUopExecute;
+    _Py_ExecutorInit((_PyExecutorObject *)executor, dependencies);
+    return (_PyExecutorObject *)executor;
+}
 
 static int
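compute_used() and make_executor_from_uops() replace the old three-stage squash-and-move: one forward sweep marks reachable, non-NOP uops in a bit vector (a single pass suffices because uop jumps only go forward), then one backward sweep copies just those uops into the executor, fixing each jump's oparg as it goes; scanning backwards guarantees a jump's target has already been assigned its new offset. A self-contained sketch of the same two passes on a toy instruction set (Inst, the OP_* opcodes and BUFLEN are invented; BUFLEN plays the role of _Py_UOP_MAX_TRACE_LENGTH):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BUFLEN 8
#define UNSET_BIT(array, bit) (array[(bit)>>5] &= ~(1<<((bit)&31)))
#define SET_BIT(array, bit)   (array[(bit)>>5] |= (1<<((bit)&31)))
#define BIT_IS_SET(array, bit) (array[(bit)>>5] & (1<<((bit)&31)))

enum { OP_NOP, OP_PLAIN, OP_JUMP, OP_EXIT };  /* toy opcodes */
typedef struct { int opcode; int oparg; } Inst;

/* Forward pass: mark reachable, non-NOP uops; no backward jumps exist,
 * so one left-to-right sweep suffices. */
static int compute_used(Inst *buf, uint32_t *used)
{
    int count = 0;
    SET_BIT(used, 0);
    for (int i = 0; i < BUFLEN; i++) {
        if (!BIT_IS_SET(used, i)) {
            continue;
        }
        count++;
        if (buf[i].opcode == OP_EXIT) {
            continue;           /* does not fall through */
        }
        SET_BIT(used, i + 1);   /* everything else falls through */
        if (buf[i].opcode == OP_NOP) {
            count--;            /* reachable, but not copied */
            UNSET_BIT(used, i);
        }
        else if (buf[i].opcode == OP_JUMP) {
            SET_BIT(used, buf[i].oparg);  /* jump target (a stub) */
        }
    }
    return count;
}

int main(void)
{
    /* trace: PLAIN, NOP, JUMP->6, EXIT; gap at [4, 6); stub EXIT at 6 */
    Inst buf[BUFLEN] = {
        {OP_PLAIN, 0}, {OP_NOP, 0}, {OP_JUMP, 6}, {OP_EXIT, 0},
        {OP_NOP, 0}, {OP_NOP, 0}, {OP_EXIT, 0}, {OP_NOP, 0},
    };
    uint32_t used[(BUFLEN + 31) / 32] = { 0 };
    int length = compute_used(buf, used);   /* 4: NOPs and the gap drop out */

    Inst out[BUFLEN];
    int dest = length - 1;
    /* Backward pass: copy used uops; a jump's target is visited first,
     * so its new offset is already stored in buf[target].oparg. */
    for (int i = BUFLEN - 1; i >= 0; i--) {
        if (!BIT_IS_SET(used, i)) {
            continue;
        }
        out[dest] = buf[i];
        if (buf[i].opcode == OP_JUMP) {
            out[dest].oparg = buf[out[dest].oparg].oparg;
        }
        buf[i].oparg = dest;    /* publish this uop's new offset */
        dest--;
    }
    assert(dest == -1);
    printf("compacted to %d uops, jump retargeted to %d\n",
           length, out[1].oparg);  /* 4, 3 */
    return 0;
}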
@@ -914,28 +889,26 @@ uop_optimize(
 {
     _PyBloomFilter dependencies;
     _Py_BloomFilter_Init(&dependencies);
-    _PyUOpInstruction trace[_Py_UOP_MAX_TRACE_LENGTH];
-    int trace_length = translate_bytecode_to_trace(code, instr, trace, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
-    if (trace_length <= 0) {
+    _PyUOpInstruction buffer[_Py_UOP_MAX_TRACE_LENGTH];
+    int err = translate_bytecode_to_trace(code, instr, buffer, _Py_UOP_MAX_TRACE_LENGTH, &dependencies);
+    if (err <= 0) {
         // Error or nothing translated
-        return trace_length;
+        return err;
     }
-    OPT_HIST(trace_length, trace_length_hist);
     OPT_STAT_INC(traces_created);
     char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE");
-    if (uop_optimize != NULL && *uop_optimize > '0') {
-        trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries);
+    if (uop_optimize == NULL || *uop_optimize > '0') {
+        err = _Py_uop_analyze_and_optimize(code, buffer, _Py_UOP_MAX_TRACE_LENGTH, curr_stackentries);
+        if (err < 0) {
+            return -1;
+        }
     }
-    trace_length = remove_unneeded_uops(trace, trace_length);
-    _PyUOpExecutorObject *executor = PyObject_NewVar(_PyUOpExecutorObject, &_PyUOpExecutor_Type, trace_length);
+    _PyExecutorObject *executor = make_executor_from_uops(buffer, &dependencies);
     if (executor == NULL) {
         return -1;
     }
-    OPT_HIST(trace_length, optimized_trace_length_hist);
-    executor->base.execute = _PyUopExecute;
-    memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction));
-    _Py_ExecutorInit((_PyExecutorObject *)executor, &dependencies);
-    *exec_ptr = (_PyExecutorObject *)executor;
+    OPT_HIST(Py_SIZE(executor), optimized_trace_length_hist);
+    *exec_ptr = executor;
     return 1;
 }
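Note the inverted gate: the uop optimizer now runs by default, and setting PYTHONUOPSOPTIMIZE=0 disables it, whereas before the variable had to be set to a nonzero digit to enable it. The same test as a standalone sketch:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    /* Mirrors the new condition in uop_optimize(): on by default,
     * off only when the variable's first character is '0' or below. */
    const char *v = getenv("PYTHONUOPSOPTIMIZE");
    int optimize = (v == NULL || *v > '0');
    printf("optimizer %s\n", optimize ? "enabled" : "disabled");
    return 0;
}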
Python/optimizer_analysis.c

@@ -13,13 +13,42 @@
 #include "pycore_optimizer.h"
 
 
+static void
+remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size)
+{
+    // Note that we don't enter stubs, those SET_IPs are needed.
+    int last_set_ip = -1;
+    bool need_ip = true;
+    for (int pc = 0; pc < buffer_size; pc++) {
+        int opcode = buffer[pc].opcode;
+        if (opcode == _SET_IP) {
+            if (!need_ip && last_set_ip >= 0) {
+                buffer[last_set_ip].opcode = NOP;
+            }
+            need_ip = false;
+            last_set_ip = pc;
+        }
+        else if (opcode == _JUMP_TO_TOP || opcode == _EXIT_TRACE) {
+            break;
+        }
+        else {
+            // If opcode has ERROR or DEOPT, set need_ip to true
+            if (_PyOpcode_opcode_metadata[opcode].flags & (HAS_ERROR_FLAG | HAS_DEOPT_FLAG) || opcode == _PUSH_FRAME) {
+                need_ip = true;
+            }
+        }
+    }
+}
+
+
 int
 _Py_uop_analyze_and_optimize(
     PyCodeObject *co,
-    _PyUOpInstruction *trace,
-    int trace_len,
+    _PyUOpInstruction *buffer,
+    int buffer_size,
     int curr_stacklen
 )
 {
-    return trace_len;
+    remove_unneeded_uops(buffer, buffer_size);
+    return 0;
 }
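The _SET_IP-thinning pass survives, moved here and reduced to a single sweep: a _SET_IP can become a NOP when no uop between it and the next _SET_IP can escape (error, deopt, or frame push), since only escaping uops observe the instruction pointer; the resulting NOPs are then dropped by compute_used() when the executor is built. A standalone sketch of the rule (toy opcodes; OP_CAN_ESCAPE stands in for the HAS_ERROR_FLAG/HAS_DEOPT_FLAG/_PUSH_FRAME test on real uop metadata):

#include <stdbool.h>
#include <stdio.h>

enum { OP_NOP, OP_SET_IP, OP_SAFE, OP_CAN_ESCAPE, OP_EXIT };
typedef struct { int opcode; } Inst;

static void remove_unneeded_set_ips(Inst *buf, int n)
{
    int last_set_ip = -1;
    bool need_ip = true;
    for (int pc = 0; pc < n; pc++) {
        switch (buf[pc].opcode) {
            case OP_SET_IP:
                if (!need_ip && last_set_ip >= 0) {
                    buf[last_set_ip].opcode = OP_NOP;  /* previous one never needed */
                }
                need_ip = false;
                last_set_ip = pc;
                break;
            case OP_EXIT:
                return;          /* don't walk into the stubs */
            case OP_CAN_ESCAPE:
                need_ip = true;  /* the last _SET_IP must survive */
                break;
        }
    }
}

int main(void)
{
    Inst buf[] = {
        {OP_SET_IP}, {OP_SAFE},       /* nothing escapes before the next _SET_IP, */
        {OP_SET_IP}, {OP_CAN_ESCAPE}, /* so slot 0 becomes a NOP; slot 2 is kept */
        {OP_SET_IP}, {OP_EXIT},       /* because an escape follows it */
    };
    remove_unneeded_set_ips(buf, 6);
    printf("slot 0 is %s, slot 2 is %s\n",
           buf[0].opcode == OP_NOP ? "NOP" : "SET_IP",
           buf[2].opcode == OP_NOP ? "NOP" : "SET_IP");  /* NOP, SET_IP */
    return 0;
}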