gh-104909: Split BINARY_OP into micro-ops (#104910)

Co-authored-by: Brandt Bucher <brandtbucher@gmail.com>
This commit is contained in:
Guido van Rossum 2023-05-31 08:09:23 -07:00 committed by GitHub
parent fbc9d0dbb2
commit df396b59af
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 811 additions and 657 deletions

View file

@ -279,57 +279,94 @@ dummy_func(
family(binary_op, INLINE_CACHE_ENTRIES_BINARY_OP) = { family(binary_op, INLINE_CACHE_ENTRIES_BINARY_OP) = {
BINARY_OP, BINARY_OP,
BINARY_OP_ADD_FLOAT,
BINARY_OP_ADD_INT,
BINARY_OP_ADD_UNICODE,
// BINARY_OP_INPLACE_ADD_UNICODE, // This is an odd duck.
BINARY_OP_MULTIPLY_FLOAT,
BINARY_OP_MULTIPLY_INT, BINARY_OP_MULTIPLY_INT,
BINARY_OP_SUBTRACT_FLOAT, BINARY_OP_ADD_INT,
BINARY_OP_SUBTRACT_INT, BINARY_OP_SUBTRACT_INT,
BINARY_OP_MULTIPLY_FLOAT,
BINARY_OP_ADD_FLOAT,
BINARY_OP_SUBTRACT_FLOAT,
BINARY_OP_ADD_UNICODE,
// BINARY_OP_INPLACE_ADD_UNICODE, // See comments at that opcode.
}; };
op(_GUARD_BOTH_INT, (left, right -- left, right)) {
inst(BINARY_OP_MULTIPLY_INT, (unused/1, left, right -- prod)) {
DEOPT_IF(!PyLong_CheckExact(left), BINARY_OP); DEOPT_IF(!PyLong_CheckExact(left), BINARY_OP);
DEOPT_IF(!PyLong_CheckExact(right), BINARY_OP); DEOPT_IF(!PyLong_CheckExact(right), BINARY_OP);
STAT_INC(BINARY_OP, hit);
prod = _PyLong_Multiply((PyLongObject *)left, (PyLongObject *)right);
_Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free);
_Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free);
ERROR_IF(prod == NULL, error);
} }
inst(BINARY_OP_MULTIPLY_FLOAT, (unused/1, left, right -- prod)) { op(_BINARY_OP_MULTIPLY_INT, (unused/1, left, right -- res)) {
STAT_INC(BINARY_OP, hit);
res = _PyLong_Multiply((PyLongObject *)left, (PyLongObject *)right);
_Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free);
_Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free);
ERROR_IF(res == NULL, error);
}
// Micro-op: the arithmetic half of the BINARY_OP_ADD_INT specialization.
// Stack effect: consumes `left` and `right`, produces `res`.
// It performs no type checks of its own and casts both operands straight to
// PyLongObject *, so it must run after a guard; the BINARY_OP_ADD_INT macro
// pairs it with _GUARD_BOTH_INT.
// NOTE(review): `unused/1` presumably declares one inline cache entry that
// this op does not read -- confirm against the cases generator DSL docs.
op(_BINARY_OP_ADD_INT, (unused/1, left, right -- res)) {
STAT_INC(BINARY_OP, hit);
res = _PyLong_Add((PyLongObject *)left, (PyLongObject *)right);
// Both operands are released before the NULL check, so they are dropped
// on the success path and the error path alike.
_Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free);
_Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free);
ERROR_IF(res == NULL, error);
}
// Micro-op: the arithmetic half of the BINARY_OP_SUBTRACT_INT specialization.
// Stack effect: consumes `left` and `right`, produces `res` (left - right,
// per the argument order passed to _PyLong_Subtract).
// It performs no type checks of its own and casts both operands straight to
// PyLongObject *, so it must run after a guard; the BINARY_OP_SUBTRACT_INT
// macro pairs it with _GUARD_BOTH_INT.
// NOTE(review): `unused/1` presumably declares one inline cache entry that
// this op does not read -- confirm against the cases generator DSL docs.
op(_BINARY_OP_SUBTRACT_INT, (unused/1, left, right -- res)) {
STAT_INC(BINARY_OP, hit);
res = _PyLong_Subtract((PyLongObject *)left, (PyLongObject *)right);
// Both operands are released before the NULL check, so they are dropped
// on the success path and the error path alike.
_Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free);
_Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free);
ERROR_IF(res == NULL, error);
}
// Each specialized int instruction is composed of the shared operand type
// guard followed by the micro-op that performs the arithmetic. The guard
// comes first because the arithmetic micro-ops cast their operands to
// PyLongObject * without checking.
macro(BINARY_OP_MULTIPLY_INT) =
_GUARD_BOTH_INT + _BINARY_OP_MULTIPLY_INT;
macro(BINARY_OP_ADD_INT) =
_GUARD_BOTH_INT + _BINARY_OP_ADD_INT;
macro(BINARY_OP_SUBTRACT_INT) =
_GUARD_BOTH_INT + _BINARY_OP_SUBTRACT_INT;
op(_GUARD_BOTH_FLOAT, (left, right -- left, right)) {
DEOPT_IF(!PyFloat_CheckExact(left), BINARY_OP); DEOPT_IF(!PyFloat_CheckExact(left), BINARY_OP);
DEOPT_IF(!PyFloat_CheckExact(right), BINARY_OP); DEOPT_IF(!PyFloat_CheckExact(right), BINARY_OP);
}
op(_BINARY_OP_MULTIPLY_FLOAT, (unused/1, left, right -- res)) {
STAT_INC(BINARY_OP, hit); STAT_INC(BINARY_OP, hit);
double dprod = ((PyFloatObject *)left)->ob_fval * double dres =
((PyFloatObject *)left)->ob_fval *
((PyFloatObject *)right)->ob_fval; ((PyFloatObject *)right)->ob_fval;
DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dprod, prod); DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dres, res);
} }
inst(BINARY_OP_SUBTRACT_INT, (unused/1, left, right -- sub)) { op(_BINARY_OP_ADD_FLOAT, (unused/1, left, right -- res)) {
DEOPT_IF(!PyLong_CheckExact(left), BINARY_OP);
DEOPT_IF(!PyLong_CheckExact(right), BINARY_OP);
STAT_INC(BINARY_OP, hit); STAT_INC(BINARY_OP, hit);
sub = _PyLong_Subtract((PyLongObject *)left, (PyLongObject *)right); double dres =
_Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free); ((PyFloatObject *)left)->ob_fval +
_Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free); ((PyFloatObject *)right)->ob_fval;
ERROR_IF(sub == NULL, error); DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dres, res);
} }
inst(BINARY_OP_SUBTRACT_FLOAT, (unused/1, left, right -- sub)) { op(_BINARY_OP_SUBTRACT_FLOAT, (unused/1, left, right -- res)) {
DEOPT_IF(!PyFloat_CheckExact(left), BINARY_OP);
DEOPT_IF(!PyFloat_CheckExact(right), BINARY_OP);
STAT_INC(BINARY_OP, hit); STAT_INC(BINARY_OP, hit);
double dsub = ((PyFloatObject *)left)->ob_fval - ((PyFloatObject *)right)->ob_fval; double dres =
DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dsub, sub); ((PyFloatObject *)left)->ob_fval -
((PyFloatObject *)right)->ob_fval;
DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dres, res);
} }
inst(BINARY_OP_ADD_UNICODE, (unused/1, left, right -- res)) { macro(BINARY_OP_MULTIPLY_FLOAT) =
_GUARD_BOTH_FLOAT + _BINARY_OP_MULTIPLY_FLOAT;
// Specialized float instructions: the shared _GUARD_BOTH_FLOAT check runs
// first, then the micro-op that reads ob_fval from both operands.
macro(BINARY_OP_ADD_FLOAT) =
_GUARD_BOTH_FLOAT + _BINARY_OP_ADD_FLOAT;
macro(BINARY_OP_SUBTRACT_FLOAT) =
_GUARD_BOTH_FLOAT + _BINARY_OP_SUBTRACT_FLOAT;
op(_GUARD_BOTH_UNICODE, (left, right -- left, right)) {
DEOPT_IF(!PyUnicode_CheckExact(left), BINARY_OP); DEOPT_IF(!PyUnicode_CheckExact(left), BINARY_OP);
DEOPT_IF(Py_TYPE(right) != Py_TYPE(left), BINARY_OP); DEOPT_IF(!PyUnicode_CheckExact(right), BINARY_OP);
}
op(_BINARY_OP_ADD_UNICODE, (unused/1, left, right -- res)) {
STAT_INC(BINARY_OP, hit); STAT_INC(BINARY_OP, hit);
res = PyUnicode_Concat(left, right); res = PyUnicode_Concat(left, right);
_Py_DECREF_SPECIALIZED(left, _PyUnicode_ExactDealloc); _Py_DECREF_SPECIALIZED(left, _PyUnicode_ExactDealloc);
@ -337,15 +374,16 @@ dummy_func(
ERROR_IF(res == NULL, error); ERROR_IF(res == NULL, error);
} }
// Specialized str + str instruction: _GUARD_BOTH_UNICODE checks both
// operands, then _BINARY_OP_ADD_UNICODE performs the concatenation.
macro(BINARY_OP_ADD_UNICODE) =
_GUARD_BOTH_UNICODE + _BINARY_OP_ADD_UNICODE;
// This is a subtle one. It's a super-instruction for // This is a subtle one. It's a super-instruction for
// BINARY_OP_ADD_UNICODE followed by STORE_FAST // BINARY_OP_ADD_UNICODE followed by STORE_FAST
// where the store goes into the left argument. // where the store goes into the left argument.
// So the inputs are the same as for all BINARY_OP // So the inputs are the same as for all BINARY_OP
// specializations, but there is no output. // specializations, but there is no output.
// At the end we just skip over the STORE_FAST. // At the end we just skip over the STORE_FAST.
inst(BINARY_OP_INPLACE_ADD_UNICODE, (left, right --)) { op(_BINARY_OP_INPLACE_ADD_UNICODE, (left, right --)) {
DEOPT_IF(!PyUnicode_CheckExact(left), BINARY_OP);
DEOPT_IF(Py_TYPE(right) != Py_TYPE(left), BINARY_OP);
_Py_CODEUNIT true_next = next_instr[INLINE_CACHE_ENTRIES_BINARY_OP]; _Py_CODEUNIT true_next = next_instr[INLINE_CACHE_ENTRIES_BINARY_OP];
assert(true_next.op.code == STORE_FAST || assert(true_next.op.code == STORE_FAST ||
true_next.op.code == STORE_FAST__LOAD_FAST); true_next.op.code == STORE_FAST__LOAD_FAST);
@ -372,24 +410,8 @@ dummy_func(
JUMPBY(INLINE_CACHE_ENTRIES_BINARY_OP + 1); JUMPBY(INLINE_CACHE_ENTRIES_BINARY_OP + 1);
} }
inst(BINARY_OP_ADD_FLOAT, (unused/1, left, right -- sum)) { macro(BINARY_OP_INPLACE_ADD_UNICODE) =
DEOPT_IF(!PyFloat_CheckExact(left), BINARY_OP); _GUARD_BOTH_UNICODE + _BINARY_OP_INPLACE_ADD_UNICODE;
DEOPT_IF(Py_TYPE(right) != Py_TYPE(left), BINARY_OP);
STAT_INC(BINARY_OP, hit);
double dsum = ((PyFloatObject *)left)->ob_fval +
((PyFloatObject *)right)->ob_fval;
DECREF_INPUTS_AND_REUSE_FLOAT(left, right, dsum, sum);
}
inst(BINARY_OP_ADD_INT, (unused/1, left, right -- sum)) {
DEOPT_IF(!PyLong_CheckExact(left), BINARY_OP);
DEOPT_IF(Py_TYPE(right) != Py_TYPE(left), BINARY_OP);
STAT_INC(BINARY_OP, hit);
sum = _PyLong_Add((PyLongObject *)left, (PyLongObject *)right);
_Py_DECREF_SPECIALIZED(right, (destructor)PyObject_Free);
_Py_DECREF_SPECIALIZED(left, (destructor)PyObject_Free);
ERROR_IF(sum == NULL, error);
}
family(binary_subscr, INLINE_CACHE_ENTRIES_BINARY_SUBSCR) = { family(binary_subscr, INLINE_CACHE_ENTRIES_BINARY_SUBSCR) = {
BINARY_SUBSCR, BINARY_SUBSCR,

File diff suppressed because it is too large Load diff

View file

@ -42,7 +42,7 @@ _PyOpcode_num_popped(int opcode, int oparg, bool jump) {
case PUSH_NULL: case PUSH_NULL:
return 0; return 0;
case END_FOR: case END_FOR:
return 1+1; return 2;
case INSTRUMENTED_END_FOR: case INSTRUMENTED_END_FOR:
return 2; return 2;
case END_SEND: case END_SEND:
@ -57,20 +57,20 @@ _PyOpcode_num_popped(int opcode, int oparg, bool jump) {
return 1; return 1;
case BINARY_OP_MULTIPLY_INT: case BINARY_OP_MULTIPLY_INT:
return 2; return 2;
case BINARY_OP_MULTIPLY_FLOAT: case BINARY_OP_ADD_INT:
return 2; return 2;
case BINARY_OP_SUBTRACT_INT: case BINARY_OP_SUBTRACT_INT:
return 2; return 2;
case BINARY_OP_MULTIPLY_FLOAT:
return 2;
case BINARY_OP_ADD_FLOAT:
return 2;
case BINARY_OP_SUBTRACT_FLOAT: case BINARY_OP_SUBTRACT_FLOAT:
return 2; return 2;
case BINARY_OP_ADD_UNICODE: case BINARY_OP_ADD_UNICODE:
return 2; return 2;
case BINARY_OP_INPLACE_ADD_UNICODE: case BINARY_OP_INPLACE_ADD_UNICODE:
return 2; return 2;
case BINARY_OP_ADD_FLOAT:
return 2;
case BINARY_OP_ADD_INT:
return 2;
case BINARY_SUBSCR: case BINARY_SUBSCR:
return 2; return 2;
case BINARY_SLICE: case BINARY_SLICE:
@ -164,7 +164,7 @@ _PyOpcode_num_popped(int opcode, int oparg, bool jump) {
case LOAD_LOCALS: case LOAD_LOCALS:
return 0; return 0;
case LOAD_NAME: case LOAD_NAME:
return 0+1; return 0;
case LOAD_FROM_DICT_OR_GLOBALS: case LOAD_FROM_DICT_OR_GLOBALS:
return 1; return 1;
case LOAD_GLOBAL: case LOAD_GLOBAL:
@ -438,7 +438,7 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
case PUSH_NULL: case PUSH_NULL:
return 1; return 1;
case END_FOR: case END_FOR:
return 0+0; return 0;
case INSTRUMENTED_END_FOR: case INSTRUMENTED_END_FOR:
return 0; return 0;
case END_SEND: case END_SEND:
@ -453,20 +453,20 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
return 1; return 1;
case BINARY_OP_MULTIPLY_INT: case BINARY_OP_MULTIPLY_INT:
return 1; return 1;
case BINARY_OP_MULTIPLY_FLOAT: case BINARY_OP_ADD_INT:
return 1; return 1;
case BINARY_OP_SUBTRACT_INT: case BINARY_OP_SUBTRACT_INT:
return 1; return 1;
case BINARY_OP_MULTIPLY_FLOAT:
return 1;
case BINARY_OP_ADD_FLOAT:
return 1;
case BINARY_OP_SUBTRACT_FLOAT: case BINARY_OP_SUBTRACT_FLOAT:
return 1; return 1;
case BINARY_OP_ADD_UNICODE: case BINARY_OP_ADD_UNICODE:
return 1; return 1;
case BINARY_OP_INPLACE_ADD_UNICODE: case BINARY_OP_INPLACE_ADD_UNICODE:
return 0; return 0;
case BINARY_OP_ADD_FLOAT:
return 1;
case BINARY_OP_ADD_INT:
return 1;
case BINARY_SUBSCR: case BINARY_SUBSCR:
return 1; return 1;
case BINARY_SLICE: case BINARY_SLICE:
@ -560,7 +560,7 @@ _PyOpcode_num_pushed(int opcode, int oparg, bool jump) {
case LOAD_LOCALS: case LOAD_LOCALS:
return 1; return 1;
case LOAD_NAME: case LOAD_NAME:
return 1+1; return 1;
case LOAD_FROM_DICT_OR_GLOBALS: case LOAD_FROM_DICT_OR_GLOBALS:
return 1; return 1;
case LOAD_GLOBAL: case LOAD_GLOBAL:
@ -828,14 +828,14 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[256] = {
[UNARY_NEGATIVE] = { true, INSTR_FMT_IX }, [UNARY_NEGATIVE] = { true, INSTR_FMT_IX },
[UNARY_NOT] = { true, INSTR_FMT_IX }, [UNARY_NOT] = { true, INSTR_FMT_IX },
[UNARY_INVERT] = { true, INSTR_FMT_IX }, [UNARY_INVERT] = { true, INSTR_FMT_IX },
[BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IXC }, [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IBC },
[BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IXC }, [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IBC },
[BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IXC }, [BINARY_OP_SUBTRACT_INT] = { true, INSTR_FMT_IBC },
[BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IXC }, [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IBC },
[BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC }, [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IBC },
[BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IX }, [BINARY_OP_SUBTRACT_FLOAT] = { true, INSTR_FMT_IBC },
[BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IXC }, [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IBC },
[BINARY_OP_ADD_INT] = { true, INSTR_FMT_IXC }, [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IB },
[BINARY_SUBSCR] = { true, INSTR_FMT_IXC }, [BINARY_SUBSCR] = { true, INSTR_FMT_IXC },
[BINARY_SLICE] = { true, INSTR_FMT_IX }, [BINARY_SLICE] = { true, INSTR_FMT_IX },
[STORE_SLICE] = { true, INSTR_FMT_IX }, [STORE_SLICE] = { true, INSTR_FMT_IX },

View file

@ -489,6 +489,7 @@ class MacroInstruction(SuperOrMacroInstruction):
macro: parser.Macro macro: parser.Macro
parts: list[Component | parser.CacheEffect] parts: list[Component | parser.CacheEffect]
predicted: bool = False
@dataclasses.dataclass @dataclasses.dataclass
@ -633,8 +634,8 @@ class Analyzer:
Raises SystemExit if there is an error. Raises SystemExit if there is an error.
""" """
self.find_predictions()
self.analyze_supers_and_macros() self.analyze_supers_and_macros()
self.find_predictions()
self.map_families() self.map_families()
self.check_families() self.check_families()
@ -648,6 +649,8 @@ class Analyzer:
for target in targets: for target in targets:
if target_instr := self.instrs.get(target): if target_instr := self.instrs.get(target):
target_instr.predicted = True target_instr.predicted = True
elif target_macro := self.macro_instrs.get(target):
target_macro.predicted = True
else: else:
self.error( self.error(
f"Unknown instruction {target!r} predicted in {instr.name!r}", f"Unknown instruction {target!r} predicted in {instr.name!r}",
@ -896,6 +899,7 @@ class Analyzer:
pushed = "" pushed = ""
case parser.Super(): case parser.Super():
instr = self.super_instrs[thing.name] instr = self.super_instrs[thing.name]
# TODO: Same as for Macro below, if needed.
popped = "+".join( popped = "+".join(
effect_str(comp.instr.input_effects) for comp in instr.parts effect_str(comp.instr.input_effects) for comp in instr.parts
) )
@ -905,12 +909,30 @@ class Analyzer:
case parser.Macro(): case parser.Macro():
instr = self.macro_instrs[thing.name] instr = self.macro_instrs[thing.name]
parts = [comp for comp in instr.parts if isinstance(comp, Component)] parts = [comp for comp in instr.parts if isinstance(comp, Component)]
popped = "+".join( # Note: stack_analysis() already verifies that macro components
effect_str(comp.instr.input_effects) for comp in parts # have no variable-sized stack effects.
) low = 0
pushed = "+".join( sp = 0
effect_str(comp.instr.output_effects) for comp in parts high = 0
) for comp in parts:
for effect in comp.instr.input_effects:
assert not effect.cond, effect
assert not effect.size, effect
sp -= 1
low = min(low, sp)
for effect in comp.instr.output_effects:
assert not effect.cond, effect
assert not effect.size, effect
sp += 1
high = max(sp, high)
if high != max(0, sp):
# If you get this, intermediate stack growth occurs,
# and stack size calculations may go awry.
# E.g. [push, pop]. The fix would be for stack size
# calculations to use the micro ops.
self.error("Macro has virtual stack growth", thing)
popped = str(-low)
pushed = str(sp - low)
case _: case _:
typing.assert_never(thing) typing.assert_never(thing)
return instr, popped, pushed return instr, popped, pushed
@ -1152,6 +1174,9 @@ class Analyzer:
# outer block, rather than trusting the compiler to optimize it. # outer block, rather than trusting the compiler to optimize it.
self.out.emit("") self.out.emit("")
with self.out.block(f"TARGET({up.name})"): with self.out.block(f"TARGET({up.name})"):
match up:
case MacroInstruction(predicted=True, name=name):
self.out.emit(f"PREDICTED({name});")
for i, var in reversed(list(enumerate(up.stack))): for i, var in reversed(list(enumerate(up.stack))):
src = None src = None
if i < up.initial_sp: if i < up.initial_sp: