Issue #15068: Got rid of excessive buffering in the fileinput module.

The bufsize parameter is no longer used.
This commit is contained in:
Serhiy Storchaka 2016-03-08 18:28:36 +02:00
parent 55e3218eee
commit cc2dbc5844
4 changed files with 153 additions and 91 deletions

View file

@ -64,13 +64,6 @@ deleted when the output file is closed. In-place filtering is
disabled when standard input is read. XXX The current implementation
does not work for MS-DOS 8+3 filesystems.
Performance: this module is unfortunately one of the slower ways of
processing large numbers of input lines. Nevertheless, a significant
speed-up has been obtained by using readlines(bufsize) instead of
readline(). A new keyword argument, bufsize=N, is present on the
input() function and the FileInput() class to override the default
buffer size.
XXX Possible additions:
- optional getopt argument processing
@ -86,6 +79,7 @@ __all__ = ["input", "close", "nextfile", "filename", "lineno", "filelineno",
_state = None
# No longer used
DEFAULT_BUFSIZE = 8*1024
def input(files=None, inplace=False, backup="", bufsize=0,
@ -207,17 +201,15 @@ class FileInput:
self._files = files
self._inplace = inplace
self._backup = backup
self._bufsize = bufsize or DEFAULT_BUFSIZE
self._savestdout = None
self._output = None
self._filename = None
self._lineno = 0
self._startlineno = 0
self._filelineno = 0
self._file = None
self._readline = self._start_readline
self._isstdin = False
self._backupfilename = None
self._buffer = []
self._bufindex = 0
# restrict mode argument to reading modes
if mode not in ('r', 'rU', 'U', 'rb'):
raise ValueError("FileInput opening mode must be one of "
@ -253,22 +245,18 @@ class FileInput:
return self
def __next__(self):
try:
line = self._buffer[self._bufindex]
except IndexError:
pass
else:
self._bufindex += 1
self._lineno += 1
line = self._readline()
if line:
self._filelineno += 1
return line
line = self.readline()
if not line:
if not self._file:
raise StopIteration
return line
self.nextfile()
# Recursive call
return self.__next__()
def __getitem__(self, i):
if i != self._lineno:
if i != self.lineno():
raise RuntimeError("accessing lines out of order")
try:
return self.__next__()
@ -289,6 +277,7 @@ class FileInput:
finally:
file = self._file
self._file = None
self._readline = self._start_readline
try:
if file and not self._isstdin:
file.close()
@ -300,85 +289,81 @@ class FileInput:
except OSError: pass
self._isstdin = False
self._buffer = []
self._bufindex = 0
def readline(self):
try:
line = self._buffer[self._bufindex]
except IndexError:
pass
else:
self._bufindex += 1
self._lineno += 1
self._filelineno += 1
return line
if not self._file:
if not self._files:
if 'b' in self._mode:
return b''
else:
return ''
self._filename = self._files[0]
self._files = self._files[1:]
self._filelineno = 0
self._file = None
self._isstdin = False
self._backupfilename = 0
if self._filename == '-':
self._filename = '<stdin>'
if 'b' in self._mode:
self._file = getattr(sys.stdin, 'buffer', sys.stdin)
else:
self._file = sys.stdin
self._isstdin = True
while True:
line = self._readline()
if line:
self._filelineno += 1
return line
if not self._file:
return line
self.nextfile()
# repeat with next file
def _start_readline(self):
if not self._files:
if 'b' in self._mode:
return b''
else:
if self._inplace:
self._backupfilename = (
self._filename + (self._backup or ".bak"))
return ''
self._filename = self._files[0]
self._files = self._files[1:]
self._startlineno = self.lineno()
self._filelineno = 0
self._file = None
self._isstdin = False
self._backupfilename = 0
if self._filename == '-':
self._filename = '<stdin>'
if 'b' in self._mode:
self._file = getattr(sys.stdin, 'buffer', sys.stdin)
else:
self._file = sys.stdin
self._isstdin = True
else:
if self._inplace:
self._backupfilename = (
self._filename + (self._backup or ".bak"))
try:
os.unlink(self._backupfilename)
except OSError:
pass
# The next few lines may raise OSError
os.rename(self._filename, self._backupfilename)
self._file = open(self._backupfilename, self._mode)
try:
perm = os.fstat(self._file.fileno()).st_mode
except OSError:
self._output = open(self._filename, "w")
else:
mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC
if hasattr(os, 'O_BINARY'):
mode |= os.O_BINARY
fd = os.open(self._filename, mode, perm)
self._output = os.fdopen(fd, "w")
try:
os.unlink(self._backupfilename)
if hasattr(os, 'chmod'):
os.chmod(self._filename, perm)
except OSError:
pass
# The next few lines may raise OSError
os.rename(self._filename, self._backupfilename)
self._file = open(self._backupfilename, self._mode)
try:
perm = os.fstat(self._file.fileno()).st_mode
except OSError:
self._output = open(self._filename, "w")
else:
mode = os.O_CREAT | os.O_WRONLY | os.O_TRUNC
if hasattr(os, 'O_BINARY'):
mode |= os.O_BINARY
fd = os.open(self._filename, mode, perm)
self._output = os.fdopen(fd, "w")
try:
if hasattr(os, 'chmod'):
os.chmod(self._filename, perm)
except OSError:
pass
self._savestdout = sys.stdout
sys.stdout = self._output
self._savestdout = sys.stdout
sys.stdout = self._output
else:
# This may raise OSError
if self._openhook:
self._file = self._openhook(self._filename, self._mode)
else:
# This may raise OSError
if self._openhook:
self._file = self._openhook(self._filename, self._mode)
else:
self._file = open(self._filename, self._mode)
self._buffer = self._file.readlines(self._bufsize)
self._bufindex = 0
if not self._buffer:
self.nextfile()
# Recursive call
return self.readline()
self._file = open(self._filename, self._mode)
self._readline = self._file.readline
return self._readline()
def filename(self):
return self._filename
def lineno(self):
return self._lineno
return self._startlineno + self._filelineno
def filelineno(self):
return self._filelineno