gh-108724: Add PyMutex and _PyParkingLot APIs (gh-109344)

PyMutex is a one-byte lock with fast, inlineable lock and unlock functions for the common uncontended case.  The design is based on WebKit's WTF::Lock.
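
As a quick illustration (a minimal sketch, not code from this PR; the counter struct and function are hypothetical), a PyMutex can be zero-initialized inside another struct and used directly with the inline lock/unlock pair:

    // Hypothetical example: guarding a counter with a zero-initialized PyMutex.
    typedef struct {
        PyMutex mutex;        // one byte; no allocation or init call needed
        Py_ssize_t value;
    } counter_t;

    static Py_ssize_t
    counter_increment(counter_t *c)
    {
        PyMutex_Lock(&c->mutex);      // fast path: a single compare-exchange
        Py_ssize_t result = ++c->value;
        PyMutex_Unlock(&c->mutex);    // fast path: a single compare-exchange
        return result;
    }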

PyMutex is built using the _PyParkingLot APIs, which provide a cross-platform futex-like API (based on WebKit's WTF::ParkingLot).  This internal API will be used to build other synchronization primitives needed to implement PEP 703, such as one-time initialization and events.

This also includes tests and a mini benchmark in Tools/lockbench/lockbench.py to compare PyMutex with the existing PyThread_type_lock.

Uncontended acquisition + release:
* Linux (x86-64): PyMutex: 11 ns, PyThread_type_lock: 44 ns
* macOS (arm64): PyMutex: 13 ns, PyThread_type_lock: 18 ns
* Windows (x86-64): PyMutex: 13 ns, PyThread_type_lock: 38 ns
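
The numbers above come from the mini benchmark in Tools/lockbench/lockbench.py.  As a rough sketch of what "uncontended acquisition + release" measures (this is not the benchmark's code; the function and iteration count are illustrative, and clock_gettime is a POSIX-only choice), it is essentially a tight single-threaded lock/unlock loop:

    #include <time.h>   // clock_gettime (POSIX; Windows would need another timer)

    static double
    bench_uncontended_ns(void)
    {
        enum { ITERS = 10000000 };
        PyMutex m = (PyMutex){0};          // zero-initialized, no allocation
        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (int i = 0; i < ITERS; i++) {
            PyMutex_Lock(&m);              // fast path: single compare-exchange
            PyMutex_Unlock(&m);
        }
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
        return ns / ITERS;                 // average ns per lock + unlock pair
    }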

PR Overview:

The primary purpose of this PR is to implement PyMutex, but it also includes a number of supporting pieces (described below).

* PyMutex:  A 1-byte lock that doesn't require memory allocation to initialize and is generally faster than the existing PyThread_type_lock.  The API is internal only for now.
* _PyParkingLot:  A futex-like API based on the API of the same name in WebKit.  Used to implement PyMutex.
* _PyRawMutex:  A word-sized lock used to implement _PyParkingLot.
* PyEvent:  A one-time event.  This was used frequently in the "nogil" fork and is useful for testing the PyMutex implementation, so I've included it as part of the PR (a small usage sketch follows this list).
* pycore_llist.h:  Defines common operations on a doubly-linked list.  Not strictly necessary (the list operations could be done manually), but they come up frequently in the "nogil" fork.  (Similar to https://man.freebsd.org/cgi/man.cgi?queue)
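
For illustration, a minimal sketch of the PyEvent pattern (the worker/waiter functions are hypothetical; the calls are the ones declared in pycore_lock.h below):

    // Hypothetical example: signalling one-time completion across threads.
    static PyEvent done_event;          // zero-initialized: the event starts unset

    static void
    worker(void *arg)
    {
        // ... do the work ...
        _PyEvent_Notify(&done_event);   // set the event and wake any waiters
    }

    static void
    wait_for_worker(void)
    {
        PyEvent_Wait(&done_event);      // returns immediately if already set
    }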

---------

Co-authored-by: Eric Snow <ericsnowcurrently@gmail.com>
Author: Sam Gross, 2023-09-19 11:54:29 -04:00 (committed by GitHub)
commit 0c89056fe5 (parent 0a31ff0050)
29 changed files with 1665 additions and 21 deletions

Include/Python.h:

@@ -48,6 +48,7 @@
#include "pytypedefs.h"
#include "pybuffer.h"
#include "pystats.h"
#include "pyatomic.h"
#include "object.h"
#include "objimpl.h"
#include "typeslots.h"

Include/cpython/pyatomic.h:

@@ -83,9 +83,9 @@
// # release
// ...
#ifndef Py_ATOMIC_H
#define Py_ATOMIC_H
#ifndef Py_CPYTHON_ATOMIC_H
# error "this header file must not be included directly"
#endif
// --- _Py_atomic_add --------------------------------------------------------
// Atomically adds `value` to `obj` and returns the previous value
@@ -501,6 +501,3 @@ static inline void _Py_atomic_fence_release(void);
#else
# error "no available pyatomic implementation for this platform/compiler"
#endif
#endif /* Py_ATOMIC_H */

Include/cpython/pyatomic_msc.h:

@@ -906,7 +906,7 @@ _Py_atomic_store_ptr_release(void *obj, void *value)
#if defined(_M_X64) || defined(_M_IX86)
*(void * volatile *)obj = value;
#elif defined(_M_ARM64)
__stlr64(obj, (uintptr_t)value);
__stlr64((unsigned __int64 volatile *)obj, (uintptr_t)value);
#else
# error "no implementation of _Py_atomic_store_ptr_release"
#endif

Include/internal/pycore_llist.h (new file):

@@ -0,0 +1,107 @@
// A doubly-linked list that can be embedded in a struct.
//
// Usage:
// struct llist_node head = LLIST_INIT(head);
// typedef struct {
// ...
// struct llist_node node;
// ...
// } MyObj;
//
// llist_insert_tail(&head, &obj->node);
// llist_remove(&obj->node);
//
// struct llist_node *node;
// llist_for_each(node, &head) {
// MyObj *obj = llist_data(node, MyObj, node);
// ...
// }
//
#ifndef Py_INTERNAL_LLIST_H
#define Py_INTERNAL_LLIST_H
#include <stddef.h>
#ifdef __cplusplus
extern "C" {
#endif
#ifndef Py_BUILD_CORE
# error "Py_BUILD_CORE must be defined to include this header"
#endif
struct llist_node {
    struct llist_node *next;
    struct llist_node *prev;
};

// Get the struct containing a node.
#define llist_data(node, type, member) \
    (type*)((char*)node - offsetof(type, member))

// Iterate over a list.
#define llist_for_each(node, head) \
    for (node = (head)->next; node != (head); node = node->next)

// Iterate over a list, but allow removal of the current node.
#define llist_for_each_safe(node, head) \
    for (struct llist_node *_next = (node = (head)->next, node->next); \
         node != (head); node = _next, _next = node->next)

#define LLIST_INIT(head) { &head, &head }

static inline void
llist_init(struct llist_node *head)
{
    head->next = head;
    head->prev = head;
}

// Returns 1 if the list is empty, 0 otherwise.
static inline int
llist_empty(struct llist_node *head)
{
    return head->next == head;
}

// Appends to the tail of the list.
static inline void
llist_insert_tail(struct llist_node *head, struct llist_node *node)
{
    node->prev = head->prev;
    node->next = head;
    head->prev->next = node;
    head->prev = node;
}

// Remove a node from the list.
static inline void
llist_remove(struct llist_node *node)
{
    struct llist_node *prev = node->prev;
    struct llist_node *next = node->next;
    prev->next = next;
    next->prev = prev;
    node->prev = NULL;
    node->next = NULL;
}

// Append all nodes from head2 onto head1. head2 is left empty.
static inline void
llist_concat(struct llist_node *head1, struct llist_node *head2)
{
    if (!llist_empty(head2)) {
        head1->prev->next = head2->next;
        head2->next->prev = head1->prev;
        head1->prev = head2->prev;
        head2->prev->next = head1;
        llist_init(head2);
    }
}
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_LLIST_H */
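
A small usage sketch for the "safe" iteration variant above, which the header's usage comment does not show (MyObj and should_remove are hypothetical, as in the comment at the top of the file):

    // Remove matching entries while iterating; llist_for_each_safe caches the
    // next pointer, so removing the current node is allowed.
    struct llist_node *node;
    llist_for_each_safe(node, &head) {
        MyObj *obj = llist_data(node, MyObj, node);
        if (should_remove(obj)) {
            llist_remove(node);
        }
    }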

Include/internal/pycore_lock.h (new file):

@@ -0,0 +1,158 @@
// Lightweight locks and other synchronization mechanisms.
//
// These implementations are based on WebKit's WTF::Lock. See
// https://webkit.org/blog/6161/locking-in-webkit/ for a description of the
// design.
#ifndef Py_INTERNAL_LOCK_H
#define Py_INTERNAL_LOCK_H
#ifdef __cplusplus
extern "C" {
#endif
#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif
#include "pycore_time.h" // _PyTime_t
// A mutex that occupies one byte. The lock can be zero initialized.
//
// Only the two least significant bits are used. The remaining bits should be
// zero:
// 0b00: unlocked
// 0b01: locked
// 0b10: unlocked and has parked threads
// 0b11: locked and has parked threads
//
// Typical initialization:
// PyMutex m = (PyMutex){0};
//
// Typical usage:
// PyMutex_Lock(&m);
// ...
// PyMutex_Unlock(&m);
typedef struct _PyMutex {
    uint8_t v;
} PyMutex;
#define _Py_UNLOCKED 0
#define _Py_LOCKED 1
#define _Py_HAS_PARKED 2
// (private) slow path for locking the mutex
PyAPI_FUNC(void) _PyMutex_LockSlow(PyMutex *m);
// (private) slow path for unlocking the mutex
PyAPI_FUNC(void) _PyMutex_UnlockSlow(PyMutex *m);
// Locks the mutex.
//
// If the mutex is currently locked, the calling thread will be parked until
// the mutex is unlocked. If the current thread holds the GIL, then the GIL
// will be released while the thread is parked.
static inline void
PyMutex_Lock(PyMutex *m)
{
    uint8_t expected = _Py_UNLOCKED;
    if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_LOCKED)) {
        _PyMutex_LockSlow(m);
    }
}

// Unlocks the mutex.
static inline void
PyMutex_Unlock(PyMutex *m)
{
    uint8_t expected = _Py_LOCKED;
    if (!_Py_atomic_compare_exchange_uint8(&m->v, &expected, _Py_UNLOCKED)) {
        _PyMutex_UnlockSlow(m);
    }
}

// Checks if the mutex is currently locked.
static inline int
PyMutex_IsLocked(PyMutex *m)
{
    return (_Py_atomic_load_uint8(&m->v) & _Py_LOCKED) != 0;
}
typedef enum _PyLockFlags {
    // Do not detach/release the GIL when waiting on the lock.
    _Py_LOCK_DONT_DETACH = 0,

    // Detach/release the GIL while waiting on the lock.
    _PY_LOCK_DETACH = 1,

    // Handle signals if interrupted while waiting on the lock.
    _PY_LOCK_HANDLE_SIGNALS = 2,
} _PyLockFlags;
// Lock a mutex with an optional timeout and additional options. See
// _PyLockFlags for details.
extern PyLockStatus
_PyMutex_LockTimed(PyMutex *m, _PyTime_t timeout_ns, _PyLockFlags flags);
// Unlock a mutex, returns 0 if the mutex is not locked (used for improved
// error messages).
extern int _PyMutex_TryUnlock(PyMutex *m);
// PyEvent is a one-time event notification
typedef struct {
    uint8_t v;
} PyEvent;
// Set the event and notify any waiting threads.
// Export for '_testinternalcapi' shared extension
PyAPI_FUNC(void) _PyEvent_Notify(PyEvent *evt);
// Wait for the event to be set. If the event is already set, then this returns
// immediately.
PyAPI_FUNC(void) PyEvent_Wait(PyEvent *evt);
// Wait for the event to be set, or until the timeout expires. If the event is
// already set, then this returns immediately. Returns 1 if the event was set,
// and 0 if the timeout expired or thread was interrupted.
PyAPI_FUNC(int) PyEvent_WaitTimed(PyEvent *evt, _PyTime_t timeout_ns);
// _PyRawMutex implements a word-sized mutex that does not depend on the
// parking lot API, and therefore can be used in the parking lot
// implementation.
//
// The mutex uses a packed representation: the least significant bit is used to
// indicate whether the mutex is locked or not. The remaining bits are either
// zero or a pointer to a `struct raw_mutex_entry` (see lock.c).
typedef struct {
    uintptr_t v;
} _PyRawMutex;
// Slow paths for lock/unlock
extern void _PyRawMutex_LockSlow(_PyRawMutex *m);
extern void _PyRawMutex_UnlockSlow(_PyRawMutex *m);
static inline void
_PyRawMutex_Lock(_PyRawMutex *m)
{
    uintptr_t unlocked = _Py_UNLOCKED;
    if (_Py_atomic_compare_exchange_uintptr(&m->v, &unlocked, _Py_LOCKED)) {
        return;
    }
    _PyRawMutex_LockSlow(m);
}

static inline void
_PyRawMutex_Unlock(_PyRawMutex *m)
{
    uintptr_t locked = _Py_LOCKED;
    if (_Py_atomic_compare_exchange_uintptr(&m->v, &locked, _Py_UNLOCKED)) {
        return;
    }
    _PyRawMutex_UnlockSlow(m);
}
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_LOCK_H */
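
A minimal sketch of the timed, flagged entry point declared above (the helper is hypothetical; PyLockStatus and its values come from pythread.h):

    // Hypothetical helper: acquire with a timeout, releasing the GIL and
    // handling signals while waiting.
    static int
    acquire_with_timeout(PyMutex *m, _PyTime_t timeout_ns)
    {
        PyLockStatus st = _PyMutex_LockTimed(
            m, timeout_ns, _PY_LOCK_DETACH | _PY_LOCK_HANDLE_SIGNALS);
        if (st == PY_LOCK_ACQUIRED) {
            // ... critical section ...
            PyMutex_Unlock(m);
            return 0;
        }
        // PY_LOCK_FAILURE on timeout, PY_LOCK_INTR if interrupted by a signal
        return -1;
    }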

Include/internal/pycore_parking_lot.h (new file):

@@ -0,0 +1,99 @@
// ParkingLot is an internal API for building efficient synchronization
// primitives like mutexes and events.
//
// The API and name are inspired by WebKit's WTF::ParkingLot, which in turn
// is inspired by Linux's futex API.
// See https://webkit.org/blog/6161/locking-in-webkit/.
//
// The core functionality is an atomic "compare-and-sleep" operation along with
// an atomic "wake-up" operation.
#ifndef Py_INTERNAL_PARKING_LOT_H
#define Py_INTERNAL_PARKING_LOT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif
#include "pycore_time.h" // _PyTime_t
enum {
    // The thread was unparked by another thread.
    Py_PARK_OK = 0,

    // The value of `address` did not match `expected`.
    Py_PARK_AGAIN = -1,

    // The thread was unparked due to a timeout.
    Py_PARK_TIMEOUT = -2,

    // The thread was interrupted by a signal.
    Py_PARK_INTR = -3,
};
// Checks that `*address == *expected` and puts the thread to sleep until an
// unpark operation is called on the same `address`. Otherwise, the function
// returns `Py_PARK_AGAIN`. The comparison behaves like memcmp, but is
// performed atomically with respect to unpark operations.
//
// The `address_size` argument is the size of the data pointed to by the
// `address` and `expected` pointers (i.e., sizeof(*address)). It must be
// 1, 2, 4, or 8.
//
// The `timeout_ns` argument specifies the maximum amount of time to wait, with
// -1 indicating an infinite wait.
//
// `park_arg`, which can be NULL, is passed to the unpark operation.
//
// If `detach` is true, then the thread will detach/release the GIL while
// waiting.
//
// Example usage:
//
// if (_Py_atomic_compare_exchange_uint8(address, &expected, new_value)) {
// int res = _PyParkingLot_Park(address, &new_value, sizeof(*address),
// timeout_ns, NULL, 1);
// ...
// }
PyAPI_FUNC(int)
_PyParkingLot_Park(const void *address, const void *expected,
                   size_t address_size, _PyTime_t timeout_ns,
                   void *park_arg, int detach);
// Callback for _PyParkingLot_Unpark:
//
// `arg` is the data of the same name provided to the _PyParkingLot_Unpark()
// call.
// `park_arg` is the data provided to _PyParkingLot_Park() call or NULL if
// no waiting thread was found.
// `has_more_waiters` is true if there are more threads waiting on the same
// address. May be true in cases where threads are waiting on a different
// address that maps to the same internal bucket.
typedef void _Py_unpark_fn_t(void *arg, void *park_arg, int has_more_waiters);
// Unparks a single thread waiting on `address`.
//
// Note that fn() is called regardless of whether a thread was unparked. If
// no threads are waiting on `address` then the `park_arg` argument to fn()
// will be NULL.
//
// Example usage:
// void callback(void *arg, void *park_arg, int has_more_waiters);
// _PyParkingLot_Unpark(address, &callback, arg);
PyAPI_FUNC(void)
_PyParkingLot_Unpark(const void *address, _Py_unpark_fn_t *fn, void *arg);
// Unparks all threads waiting on `address`.
PyAPI_FUNC(void) _PyParkingLot_UnparkAll(const void *address);
// Resets the parking lot state after a fork. Forgets all parked threads.
PyAPI_FUNC(void) _PyParkingLot_AfterFork(void);
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_PARKING_LOT_H */
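
To make the "compare-and-sleep" idea concrete, here is a hedged sketch (not part of this PR; the type and function names are hypothetical) of how a one-time initialization primitive, one of the follow-up use cases mentioned above, could be layered on these functions:

    // Hypothetical sketch: threads that lose the race to run the init function
    // park on the flag until the winner wakes them all.
    typedef struct { uint8_t v; } once_flag_sketch_t;
    enum { ONCE_UNSET = 0, ONCE_RUNNING = 1, ONCE_DONE = 2 };

    static void
    call_once_sketch(once_flag_sketch_t *flag, void (*init)(void))
    {
        uint8_t expected = ONCE_UNSET;
        if (_Py_atomic_compare_exchange_uint8(&flag->v, &expected, ONCE_RUNNING)) {
            init();                                // this thread won the race
            _Py_atomic_store_uint8(&flag->v, ONCE_DONE);
            _PyParkingLot_UnparkAll(&flag->v);     // wake every parked waiter
            return;
        }
        while (_Py_atomic_load_uint8(&flag->v) != ONCE_DONE) {
            uint8_t running = ONCE_RUNNING;
            // "compare-and-sleep": park only if the flag still reads ONCE_RUNNING
            _PyParkingLot_Park(&flag->v, &running, sizeof(running), -1, NULL, 1);
        }
    }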

Include/internal/pycore_semaphore.h (new file):

@@ -0,0 +1,63 @@
// The _PySemaphore API is a simplified cross-platform semaphore used to
// implement wakeup/sleep.
#ifndef Py_INTERNAL_SEMAPHORE_H
#define Py_INTERNAL_SEMAPHORE_H
#ifndef Py_BUILD_CORE
# error "this header requires Py_BUILD_CORE define"
#endif
#include "pycore_time.h" // _PyTime_t
#ifdef MS_WINDOWS
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#elif defined(HAVE_PTHREAD_H)
# include <pthread.h>
#elif defined(HAVE_PTHREAD_STUBS)
# include "cpython/pthread_stubs.h"
#else
# error "Require native threads. See https://bugs.python.org/issue31370"
#endif
#if defined(_POSIX_SEMAPHORES) && (_POSIX_SEMAPHORES+0) != -1
# define _Py_USE_SEMAPHORES
# include <semaphore.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef struct _PySemaphore {
#if defined(MS_WINDOWS)
    HANDLE platform_sem;
#elif defined(_Py_USE_SEMAPHORES)
    sem_t platform_sem;
#else
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    int counter;
#endif
} _PySemaphore;
// Puts the current thread to sleep until _PySemaphore_Wakeup() is called.
// If `detach` is true, then the thread will detach/release the GIL while
// sleeping.
PyAPI_FUNC(int)
_PySemaphore_Wait(_PySemaphore *sema, _PyTime_t timeout_ns, int detach);
// Wakes up a single thread waiting on sema. Note that _PySemaphore_Wakeup()
// can be called before _PySemaphore_Wait().
PyAPI_FUNC(void)
_PySemaphore_Wakeup(_PySemaphore *sema);
// Initializes/destroys a semaphore
PyAPI_FUNC(void) _PySemaphore_Init(_PySemaphore *sema);
PyAPI_FUNC(void) _PySemaphore_Destroy(_PySemaphore *sema);
#ifdef __cplusplus
}
#endif
#endif /* !Py_INTERNAL_SEMAPHORE_H */
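
A minimal sketch of the intended handshake (the two functions are hypothetical; a timeout of -1 is assumed to mean an unbounded wait, as with _PyParkingLot_Park):

    // Hypothetical example: one thread blocks on the semaphore, another wakes it.
    static _PySemaphore sema;   // set up once with _PySemaphore_Init(&sema)

    static void
    sleeper(void)
    {
        // Block until _PySemaphore_Wakeup() is called, releasing the GIL while
        // asleep (detach=1).
        _PySemaphore_Wait(&sema, -1, 1);
    }

    static void
    waker(void)
    {
        // May run before or after sleeper(); the wakeup is not lost.
        _PySemaphore_Wakeup(&sema);
    }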

Include/pyatomic.h (new file):

@@ -0,0 +1,16 @@
#ifndef Py_ATOMIC_H
#define Py_ATOMIC_H
#ifdef __cplusplus
extern "C" {
#endif
#ifndef Py_LIMITED_API
# define Py_CPYTHON_ATOMIC_H
# include "cpython/pyatomic.h"
# undef Py_CPYTHON_ATOMIC_H
#endif
#ifdef __cplusplus
}
#endif
#endif /* !Py_ATOMIC_H */