gh-89727: Fix os.walk RecursionError on deep trees (#99803)

Use a stack to implement os.walk iteratively instead of recursively to
avoid hitting recursion limits on deeply nested trees.
This commit is contained in:
Jon Burdo 2022-12-19 13:59:01 -05:00 committed by GitHub
parent 702a5bc463
commit 797edb28c3
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 137 additions and 81 deletions

156
Lib/os.py
View file

@ -340,89 +340,95 @@ def walk(top, topdown=True, onerror=None, followlinks=False):
"""
sys.audit("os.walk", top, topdown, onerror, followlinks)
return _walk(fspath(top), topdown, onerror, followlinks)
def _walk(top, topdown, onerror, followlinks):
dirs = []
nondirs = []
walk_dirs = []
stack = [(False, fspath(top))]
islink, join = path.islink, path.join
while stack:
must_yield, top = stack.pop()
if must_yield:
yield top
continue
# We may not have read permission for top, in which case we can't
# get a list of the files the directory contains. os.walk
# always suppressed the exception then, rather than blow up for a
# minor reason when (say) a thousand readable directories are still
# left to visit. That logic is copied here.
try:
# Note that scandir is global in this module due
# to earlier import-*.
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
return
dirs = []
nondirs = []
walk_dirs = []
with scandir_it:
while True:
try:
# We may not have read permission for top, in which case we can't
# get a list of the files the directory contains.
# We suppress the exception here, rather than blow up for a
# minor reason when (say) a thousand readable directories are still
# left to visit.
try:
scandir_it = scandir(top)
except OSError as error:
if onerror is not None:
onerror(error)
continue
cont = False
with scandir_it:
while True:
try:
entry = next(scandir_it)
except StopIteration:
break
except OSError as error:
if onerror is not None:
onerror(error)
return
try:
is_dir = entry.is_dir()
except OSError:
# If is_dir() raises an OSError, consider that the entry is not
# a directory, same behaviour than os.path.isdir().
is_dir = False
if is_dir:
dirs.append(entry.name)
else:
nondirs.append(entry.name)
if not topdown and is_dir:
# Bottom-up: recurse into sub-directory, but exclude symlinks to
# directories if followlinks is False
if followlinks:
walk_into = True
else:
try:
is_symlink = entry.is_symlink()
except OSError:
# If is_symlink() raises an OSError, consider that the
# entry is not a symbolic link, same behaviour than
# os.path.islink().
is_symlink = False
walk_into = not is_symlink
entry = next(scandir_it)
except StopIteration:
break
except OSError as error:
if onerror is not None:
onerror(error)
cont = True
break
if walk_into:
walk_dirs.append(entry.path)
try:
is_dir = entry.is_dir()
except OSError:
# If is_dir() raises an OSError, consider the entry not to
# be a directory, same behaviour as os.path.isdir().
is_dir = False
# Yield before recursion if going top down
if topdown:
yield top, dirs, nondirs
if is_dir:
dirs.append(entry.name)
else:
nondirs.append(entry.name)
# Recurse into sub-directories
islink, join = path.islink, path.join
for dirname in dirs:
new_path = join(top, dirname)
# Issue #23605: os.path.islink() is used instead of caching
# entry.is_symlink() result during the loop on os.scandir() because
# the caller can replace the directory entry during the "yield"
# above.
if followlinks or not islink(new_path):
yield from _walk(new_path, topdown, onerror, followlinks)
else:
# Recurse into sub-directories
for new_path in walk_dirs:
yield from _walk(new_path, topdown, onerror, followlinks)
# Yield after recursion if going bottom up
yield top, dirs, nondirs
if not topdown and is_dir:
# Bottom-up: traverse into sub-directory, but exclude
# symlinks to directories if followlinks is False
if followlinks:
walk_into = True
else:
try:
is_symlink = entry.is_symlink()
except OSError:
# If is_symlink() raises an OSError, consider the
# entry not to be a symbolic link, same behaviour
# as os.path.islink().
is_symlink = False
walk_into = not is_symlink
if walk_into:
walk_dirs.append(entry.path)
if cont:
continue
if topdown:
# Yield before sub-directory traversal if going top down
yield top, dirs, nondirs
# Traverse into sub-directories
for dirname in reversed(dirs):
new_path = join(top, dirname)
# bpo-23605: os.path.islink() is used instead of caching
# entry.is_symlink() result during the loop on os.scandir() because
# the caller can replace the directory entry during the "yield"
# above.
if followlinks or not islink(new_path):
stack.append((False, new_path))
else:
# Yield after sub-directory traversal if going bottom up
stack.append((True, (top, dirs, nondirs)))
# Traverse into sub-directories
for new_path in reversed(walk_dirs):
stack.append((False, new_path))
__all__.append("walk")