gh-117151: IO performance improvement, increase io.DEFAULT_BUFFER_SIZE to 128k (GH-118144)

Co-authored-by: rmorotti <romain.morotti@man.com>
This commit is contained in:
morotti 2025-03-07 19:36:12 +00:00 committed by GitHub
parent 4bf25a0dc8
commit b1b4f9625c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 38 additions and 22 deletions

View file

@ -1405,10 +1405,10 @@ are always available. They are listed here in alphabetical order.
:func:`io.TextIOWrapper.reconfigure`. When no *buffering* argument is :func:`io.TextIOWrapper.reconfigure`. When no *buffering* argument is
given, the default buffering policy works as follows: given, the default buffering policy works as follows:
* Binary files are buffered in fixed-size chunks; the size of the buffer is * Binary files are buffered in fixed-size chunks; the size of the buffer
chosen using a heuristic trying to determine the underlying device's "block is ``max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)``
size" and falling back on :const:`io.DEFAULT_BUFFER_SIZE`. On many systems, when the device block size is available.
the buffer will typically be 4096 or 8192 bytes long. On most systems, the buffer will typically be 128 kilobytes long.
* "Interactive" text files (files for which :meth:`~io.IOBase.isatty` * "Interactive" text files (files for which :meth:`~io.IOBase.isatty`
returns ``True``) use line buffering. Other text files use the policy returns ``True``) use line buffering. Other text files use the policy

View file

@ -23,8 +23,9 @@ if hasattr(os, 'SEEK_HOLE') :
valid_seek_flags.add(os.SEEK_HOLE) valid_seek_flags.add(os.SEEK_HOLE)
valid_seek_flags.add(os.SEEK_DATA) valid_seek_flags.add(os.SEEK_DATA)
# open() uses st_blksize whenever we can # open() uses max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes # when the device block size is available.
DEFAULT_BUFFER_SIZE = 128 * 1024 # bytes
# NOTE: Base classes defined here are registered with the "official" ABCs # NOTE: Base classes defined here are registered with the "official" ABCs
# defined in io.py. We don't use real inheritance though, because we don't want # defined in io.py. We don't use real inheritance though, because we don't want
@ -123,10 +124,10 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
the size of a fixed-size chunk buffer. When no buffering argument is the size of a fixed-size chunk buffer. When no buffering argument is
given, the default buffering policy works as follows: given, the default buffering policy works as follows:
* Binary files are buffered in fixed-size chunks; the size of the buffer * Binary files are buffered in fixed-size chunks; the size of the buffer
is chosen using a heuristic trying to determine the underlying device's is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)
"block size" and falling back on `io.DEFAULT_BUFFER_SIZE`. when the device block size is available.
On many systems, the buffer will typically be 4096 or 8192 bytes long. On most systems, the buffer will typically be 128 kilobytes long.
* "Interactive" text files (files for which isatty() returns True) * "Interactive" text files (files for which isatty() returns True)
use line buffering. Other text files use the policy described above use line buffering. Other text files use the policy described above
@ -242,7 +243,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
buffering = -1 buffering = -1
line_buffering = True line_buffering = True
if buffering < 0: if buffering < 0:
buffering = raw._blksize buffering = max(min(raw._blksize, 8192 * 1024), DEFAULT_BUFFER_SIZE)
if buffering < 0: if buffering < 0:
raise ValueError("invalid buffering size") raise ValueError("invalid buffering size")
if buffering == 0: if buffering == 0:

View file

@ -216,6 +216,16 @@ class OtherFileTests:
with self.assertWarnsRegex(RuntimeWarning, 'line buffering'): with self.assertWarnsRegex(RuntimeWarning, 'line buffering'):
self._checkBufferSize(1) self._checkBufferSize(1)
def testDefaultBufferSize(self):
    """Check the default buffer size chosen by open() for binary files.

    The expected size mirrors the policy applied by open() when no
    *buffering* argument is given: the raw file's block size, capped at
    8 MiB, but never smaller than io.DEFAULT_BUFFER_SIZE.
    """
    with self.open(TESTFN, 'wb') as f:
        blksize = f.raw._blksize
        expected_size = max(min(blksize, 8192 * 1024), io.DEFAULT_BUFFER_SIZE)
        # The file has to be larger than one buffer; otherwise read1()
        # below is bounded by the file size rather than the buffer size.
        # A fixed 5_000_000 bytes is too small when the reported block
        # size exceeds it (the cap permits buffers of up to 8 MiB).
        f.write(b"\0" * max(5_000_000, 2 * expected_size))

    with self.open(TESTFN, 'rb') as f:
        # The test relies on read1() with no size returning one
        # buffer's worth of data from a single raw read.
        data = f.read1()
        self.assertEqual(len(data), expected_size)
def testTruncateOnWindows(self): def testTruncateOnWindows(self):
# SF bug <https://bugs.python.org/issue801631> # SF bug <https://bugs.python.org/issue801631>
# "file.truncate fault on windows" # "file.truncate fault on windows"

View file

@ -0,0 +1,5 @@
Increase ``io.DEFAULT_BUFFER_SIZE`` from 8k to 128k and adjust :func:`open` on
platforms where :func:`os.fstat` provides a ``st_blksize`` field (such as Linux)
to use ``max(min(blocksize, 8 MiB), io.DEFAULT_BUFFER_SIZE)`` rather
than always using the device block size. This should improve I/O performance.
Patch by Romain Morotti.

View file

@ -60,8 +60,7 @@ PyDoc_STRVAR(module_doc,
"DEFAULT_BUFFER_SIZE\n" "DEFAULT_BUFFER_SIZE\n"
"\n" "\n"
" An int containing the default buffer size used by the module's buffered\n" " An int containing the default buffer size used by the module's buffered\n"
" I/O classes. open() uses the file's blksize (as obtained by os.stat) if\n" " I/O classes.\n"
" possible.\n"
); );
@ -132,9 +131,9 @@ the size of a fixed-size chunk buffer. When no buffering argument is
given, the default buffering policy works as follows: given, the default buffering policy works as follows:
* Binary files are buffered in fixed-size chunks; the size of the buffer * Binary files are buffered in fixed-size chunks; the size of the buffer
is chosen using a heuristic trying to determine the underlying device's is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)
"block size" and falling back on `io.DEFAULT_BUFFER_SIZE`. when the device block size is available.
On many systems, the buffer will typically be 4096 or 8192 bytes long. On most systems, the buffer will typically be 128 kilobytes long.
* "Interactive" text files (files for which isatty() returns True) * "Interactive" text files (files for which isatty() returns True)
use line buffering. Other text files use the policy described above use line buffering. Other text files use the policy described above
@ -200,7 +199,7 @@ static PyObject *
_io_open_impl(PyObject *module, PyObject *file, const char *mode, _io_open_impl(PyObject *module, PyObject *file, const char *mode,
int buffering, const char *encoding, const char *errors, int buffering, const char *encoding, const char *errors,
const char *newline, int closefd, PyObject *opener) const char *newline, int closefd, PyObject *opener)
/*[clinic end generated code: output=aefafc4ce2b46dc0 input=cd034e7cdfbf4e78]*/ /*[clinic end generated code: output=aefafc4ce2b46dc0 input=28027fdaabb8d744]*/
{ {
size_t i; size_t i;
@ -371,6 +370,7 @@ _io_open_impl(PyObject *module, PyObject *file, const char *mode,
Py_DECREF(blksize_obj); Py_DECREF(blksize_obj);
if (buffering == -1 && PyErr_Occurred()) if (buffering == -1 && PyErr_Occurred())
goto error; goto error;
buffering = Py_MAX(Py_MIN(buffering, 8192 * 1024), DEFAULT_BUFFER_SIZE);
} }
if (buffering < 0) { if (buffering < 0) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,

View file

@ -78,7 +78,7 @@ extern Py_ssize_t _PyIO_find_line_ending(
*/ */
extern int _PyIO_trap_eintr(void); extern int _PyIO_trap_eintr(void);
#define DEFAULT_BUFFER_SIZE (8 * 1024) /* bytes */ #define DEFAULT_BUFFER_SIZE (128 * 1024) /* bytes */
/* /*
* Offset type for positioning. * Offset type for positioning.

View file

@ -64,9 +64,9 @@ PyDoc_STRVAR(_io_open__doc__,
"given, the default buffering policy works as follows:\n" "given, the default buffering policy works as follows:\n"
"\n" "\n"
"* Binary files are buffered in fixed-size chunks; the size of the buffer\n" "* Binary files are buffered in fixed-size chunks; the size of the buffer\n"
" is chosen using a heuristic trying to determine the underlying device\'s\n" " is max(min(blocksize, 8 MiB), DEFAULT_BUFFER_SIZE)\n"
" \"block size\" and falling back on `io.DEFAULT_BUFFER_SIZE`.\n" " when the device block size is available.\n"
" On many systems, the buffer will typically be 4096 or 8192 bytes long.\n" " On most systems, the buffer will typically be 128 kilobytes long.\n"
"\n" "\n"
"* \"Interactive\" text files (files for which isatty() returns True)\n" "* \"Interactive\" text files (files for which isatty() returns True)\n"
" use line buffering. Other text files use the policy described above\n" " use line buffering. Other text files use the policy described above\n"
@ -406,4 +406,4 @@ _io_open_code(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec
exit: exit:
return return_value; return return_value;
} }
/*[clinic end generated code: output=ec1df2ff5265ab16 input=a9049054013a1b77]*/ /*[clinic end generated code: output=2eaf6e914503bcfd input=a9049054013a1b77]*/