bpo-44187: Quickening infrastructure (GH-26264)

* Add co_firstinstr field to code object.

* Implement barebones quickening.

* Use non-quickened bytecode when tracing.

* Add NEWS item

* Add new file to Windows build.

* Don't specialize instructions with EXTENDED_ARG.
This commit is contained in:
Mark Shannon 2021-06-07 18:38:06 +01:00 committed by GitHub
parent 89e50ab36f
commit 001eb520b5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 416 additions and 12 deletions

View file

@ -4,6 +4,7 @@
extern "C" {
#endif
/* Legacy Opcache */
typedef struct {
PyObject *ptr; /* Cached pointer (borrowed reference) */
@ -26,6 +27,129 @@ struct _PyOpcache {
};
/* PEP 659
* Specialization and quickening structs and helper functions
*/
typedef struct {
int32_t cache_count;
int32_t _; /* Force 8 byte size */
} _PyEntryZero;
typedef struct {
uint8_t original_oparg;
uint8_t counter;
uint16_t index;
} _PyAdaptiveEntry;
/* Add specialized versions of entries to this union.
*
* Do not break the invariant: sizeof(SpecializedCacheEntry) == 8
* Preserving this invariant is necessary because:
- If any one form uses more space, then all must and on 64 bit machines
this is likely to double the memory consumption of caches
- The function for calculating the offset of caches assumes a 4:1
cache:instruction size ratio. Changing that would need careful
analysis to choose a new function.
*/
typedef union {
_PyEntryZero zero;
_PyAdaptiveEntry adaptive;
} SpecializedCacheEntry;
#define INSTRUCTIONS_PER_ENTRY (sizeof(SpecializedCacheEntry)/sizeof(_Py_CODEUNIT))
/* Maximum size of code to quicken, in code units. */
#define MAX_SIZE_TO_QUICKEN 5000
typedef union _cache_or_instruction {
_Py_CODEUNIT code[1];
SpecializedCacheEntry entry;
} SpecializedCacheOrInstruction;
/* Get pointer to the nth cache entry, from the first instruction and n.
* Cache entries are indexed backwards, with [count-1] first in memory, and [0] last.
* The zeroth entry immediately precedes the instructions.
*/
static inline SpecializedCacheEntry *
_GetSpecializedCacheEntry(_Py_CODEUNIT *first_instr, Py_ssize_t n)
{
SpecializedCacheOrInstruction *last_cache_plus_one = (SpecializedCacheOrInstruction *)first_instr;
assert(&last_cache_plus_one->code[0] == first_instr);
return &last_cache_plus_one[-1-n].entry;
}
/* Following two functions form a pair.
*
* oparg_from_offset_and_index() is used to compute the oparg
* when quickening, so that offset_from_oparg_and_nexti()
* can be used at runtime to compute the offset.
*
* The relationship between the three values is currently
* offset == (index>>1) + oparg
* This relation is chosen based on the following observations:
* 1. typically 1 in 4 instructions need a cache
* 2. instructions that need a cache typically use 2 entries
* These observations imply: offset index/2
* We use the oparg to fine tune the relation to avoid wasting space
* and allow consecutive instructions to use caches.
*
* If the number of cache entries < number of instructions/2 we will waste
* some small amoount of space.
* If the number of cache entries > (number of instructions/2) + 255, then
* some instructions will not be able to use a cache.
* In practice, we expect some small amount of wasted space in a shorter functions
* and only functions exceeding a 1000 lines or more not to have enugh cache space.
*
*/
static inline int
oparg_from_offset_and_nexti(int offset, int nexti)
{
return offset-(nexti>>1);
}
static inline int
offset_from_oparg_and_nexti(int oparg, int nexti)
{
return (nexti>>1)+oparg;
}
/* Get pointer to the cache entry associated with an instruction.
* nexti is the index of the instruction plus one.
* nexti is used as it corresponds to the instruction pointer in the interpreter.
* This doesn't check that an entry has been allocated for that instruction. */
static inline SpecializedCacheEntry *
_GetSpecializedCacheEntryForInstruction(_Py_CODEUNIT *first_instr, int nexti, int oparg)
{
return _GetSpecializedCacheEntry(
first_instr,
offset_from_oparg_and_nexti(oparg, nexti)
);
}
#define QUICKENING_WARMUP_DELAY 8
/* We want to compare to zero for efficiency, so we offset values accordingly */
#define QUICKENING_INITIAL_WARMUP_VALUE (-QUICKENING_WARMUP_DELAY)
#define QUICKENING_WARMUP_COLDEST 1
static inline void
PyCodeObject_IncrementWarmup(PyCodeObject * co)
{
co->co_warmup++;
}
/* Used by the interpreter to determine when a code object should be quickened */
static inline int
PyCodeObject_IsWarmedUp(PyCodeObject * co)
{
return (co->co_warmup == 0);
}
int _Py_Quicken(PyCodeObject *code);
extern Py_ssize_t _Py_QuickenedCount;
struct _PyCodeConstructor {
/* metadata */
PyObject *filename;