GH-101362: Omit path anchor from pathlib.PurePath()._parts (GH-102476)

Improve performance of path construction by skipping the addition of the path anchor (`drive + root`) to the internal `_parts` list. Rename this attribute to `_tail` for clarity.
This commit is contained in:
Barney Gale 2023-04-09 18:40:03 +01:00 committed by GitHub
parent 0a675f4bb5
commit 2c673d5e93
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 108 additions and 67 deletions

View file

@ -210,20 +210,17 @@ class _RecursiveWildcardSelector(_Selector):
class _PathParents(Sequence): class _PathParents(Sequence):
"""This object provides sequence-like access to the logical ancestors """This object provides sequence-like access to the logical ancestors
of a path. Don't try to construct it yourself.""" of a path. Don't try to construct it yourself."""
__slots__ = ('_pathcls', '_drv', '_root', '_parts') __slots__ = ('_pathcls', '_drv', '_root', '_tail')
def __init__(self, path): def __init__(self, path):
# We don't store the instance to avoid reference cycles # We don't store the instance to avoid reference cycles
self._pathcls = type(path) self._pathcls = type(path)
self._drv = path.drive self._drv = path.drive
self._root = path.root self._root = path.root
self._parts = path._parts self._tail = path._tail
def __len__(self): def __len__(self):
if self._drv or self._root: return len(self._tail)
return len(self._parts) - 1
else:
return len(self._parts)
def __getitem__(self, idx): def __getitem__(self, idx):
if isinstance(idx, slice): if isinstance(idx, slice):
@ -234,7 +231,7 @@ class _PathParents(Sequence):
if idx < 0: if idx < 0:
idx += len(self) idx += len(self)
return self._pathcls._from_parsed_parts(self._drv, self._root, return self._pathcls._from_parsed_parts(self._drv, self._root,
self._parts[:-idx - 1]) self._tail[:-idx - 1])
def __repr__(self): def __repr__(self):
return "<{}.parents>".format(self._pathcls.__name__) return "<{}.parents>".format(self._pathcls.__name__)
@ -249,9 +246,41 @@ class PurePath(object):
PureWindowsPath object. You can also instantiate either of these classes PureWindowsPath object. You can also instantiate either of these classes
directly, regardless of your system. directly, regardless of your system.
""" """
__slots__ = ( __slots__ = (
'_raw_path', '_drv', '_root', '_parts_cached', # The `_raw_path` slot stores an unnormalized string path. This is set
'_str', '_hash', '_parts_tuple', '_parts_normcase_cached', # in the `__init__()` method.
'_raw_path',
# The `_drv`, `_root` and `_tail_cached` slots store parsed and
# normalized parts of the path. They are set when any of the `drive`,
# `root` or `_tail` properties are accessed for the first time. The
# three-part division corresponds to the result of
# `os.path.splitroot()`, except that the tail is further split on path
# separators (i.e. it is a list of strings), and that the root and
# tail are normalized.
'_drv', '_root', '_tail_cached',
# The `_str` slot stores the string representation of the path,
# computed from the drive, root and tail when `__str__()` is called
# for the first time. It's used to implement `_str_normcase`.
'_str',
# The `_str_normcase_cached` slot stores the string path with
# normalized case. It is set when the `_str_normcase` property is
# accessed for the first time. It's used to implement `__eq__()`,
# `__hash__()`, and `_parts_normcase`.
'_str_normcase_cached',
# The `_parts_normcase_cached` slot stores the case-normalized
# string path after splitting on path separators. It's set when the
# `_parts_normcase` property is accessed for the first time. It's used
# to implement comparison methods like `__lt__()`.
'_parts_normcase_cached',
# The `_hash` slot stores the hash of the case-normalized string
# path. It's set when `__hash__()` is called for the first time.
'_hash',
) )
_flavour = os.path _flavour = os.path
@ -277,10 +306,7 @@ class PurePath(object):
path = os.fspath(args[0]) path = os.fspath(args[0])
else: else:
path = self._flavour.join(*args) path = self._flavour.join(*args)
if isinstance(path, str): if not isinstance(path, str):
# Force-cast str subclasses to str (issue #21127)
path = str(path)
else:
raise TypeError( raise TypeError(
"argument should be a str or an os.PathLike " "argument should be a str or an os.PathLike "
"object where __fspath__ returns a str, " "object where __fspath__ returns a str, "
@ -299,33 +325,32 @@ class PurePath(object):
if drv.startswith(sep): if drv.startswith(sep):
# pathlib assumes that UNC paths always have a root. # pathlib assumes that UNC paths always have a root.
root = sep root = sep
unfiltered_parsed = [drv + root] + rel.split(sep) parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
parsed = [sys.intern(x) for x in unfiltered_parsed if x and x != '.']
return drv, root, parsed return drv, root, parsed
def _load_parts(self): def _load_parts(self):
drv, root, parts = self._parse_path(self._raw_path) drv, root, tail = self._parse_path(self._raw_path)
self._drv = drv self._drv = drv
self._root = root self._root = root
self._parts_cached = parts self._tail_cached = tail
@classmethod @classmethod
def _from_parsed_parts(cls, drv, root, parts): def _from_parsed_parts(cls, drv, root, tail):
path = cls._format_parsed_parts(drv, root, parts) path = cls._format_parsed_parts(drv, root, tail)
self = cls(path) self = cls(path)
self._str = path or '.' self._str = path or '.'
self._drv = drv self._drv = drv
self._root = root self._root = root
self._parts_cached = parts self._tail_cached = tail
return self return self
@classmethod @classmethod
def _format_parsed_parts(cls, drv, root, parts): def _format_parsed_parts(cls, drv, root, tail):
if drv or root: if drv or root:
return drv + root + cls._flavour.sep.join(parts[1:]) return drv + root + cls._flavour.sep.join(tail)
elif parts and cls._flavour.splitdrive(parts[0])[0]: elif tail and cls._flavour.splitdrive(tail[0])[0]:
parts = ['.'] + parts tail = ['.'] + tail
return cls._flavour.sep.join(parts) return cls._flavour.sep.join(tail)
def __str__(self): def __str__(self):
"""Return the string representation of the path, suitable for """Return the string representation of the path, suitable for
@ -334,7 +359,7 @@ class PurePath(object):
return self._str return self._str
except AttributeError: except AttributeError:
self._str = self._format_parsed_parts(self.drive, self.root, self._str = self._format_parsed_parts(self.drive, self.root,
self._parts) or '.' self._tail) or '.'
return self._str return self._str
def __fspath__(self): def __fspath__(self):
@ -374,25 +399,34 @@ class PurePath(object):
path = str(self) path = str(self)
return prefix + urlquote_from_bytes(os.fsencode(path)) return prefix + urlquote_from_bytes(os.fsencode(path))
@property
def _str_normcase(self):
# String with normalized case, for hashing and equality checks
try:
return self._str_normcase_cached
except AttributeError:
self._str_normcase_cached = self._flavour.normcase(str(self))
return self._str_normcase_cached
@property @property
def _parts_normcase(self): def _parts_normcase(self):
# Cached parts with normalized case, for hashing and comparison. # Cached parts with normalized case, for comparisons.
try: try:
return self._parts_normcase_cached return self._parts_normcase_cached
except AttributeError: except AttributeError:
self._parts_normcase_cached = [self._flavour.normcase(p) for p in self._parts] self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
return self._parts_normcase_cached return self._parts_normcase_cached
def __eq__(self, other): def __eq__(self, other):
if not isinstance(other, PurePath): if not isinstance(other, PurePath):
return NotImplemented return NotImplemented
return self._parts_normcase == other._parts_normcase and self._flavour is other._flavour return self._str_normcase == other._str_normcase and self._flavour is other._flavour
def __hash__(self): def __hash__(self):
try: try:
return self._hash return self._hash
except AttributeError: except AttributeError:
self._hash = hash(tuple(self._parts_normcase)) self._hash = hash(self._str_normcase)
return self._hash return self._hash
def __lt__(self, other): def __lt__(self, other):
@ -434,12 +468,12 @@ class PurePath(object):
return self._root return self._root
@property @property
def _parts(self): def _tail(self):
try: try:
return self._parts_cached return self._tail_cached
except AttributeError: except AttributeError:
self._load_parts() self._load_parts()
return self._parts_cached return self._tail_cached
@property @property
def anchor(self): def anchor(self):
@ -450,10 +484,10 @@ class PurePath(object):
@property @property
def name(self): def name(self):
"""The final path component, if any.""" """The final path component, if any."""
parts = self._parts tail = self._tail
if len(parts) == (1 if (self.drive or self.root) else 0): if not tail:
return '' return ''
return parts[-1] return tail[-1]
@property @property
def suffix(self): def suffix(self):
@ -501,7 +535,7 @@ class PurePath(object):
if drv or root or not tail or f.sep in tail or (f.altsep and f.altsep in tail): if drv or root or not tail or f.sep in tail or (f.altsep and f.altsep in tail):
raise ValueError("Invalid name %r" % (name)) raise ValueError("Invalid name %r" % (name))
return self._from_parsed_parts(self.drive, self.root, return self._from_parsed_parts(self.drive, self.root,
self._parts[:-1] + [name]) self._tail[:-1] + [name])
def with_stem(self, stem): def with_stem(self, stem):
"""Return a new path with the stem changed.""" """Return a new path with the stem changed."""
@ -526,7 +560,7 @@ class PurePath(object):
else: else:
name = name[:-len(old_suffix)] + suffix name = name[:-len(old_suffix)] + suffix
return self._from_parsed_parts(self.drive, self.root, return self._from_parsed_parts(self.drive, self.root,
self._parts[:-1] + [name]) self._tail[:-1] + [name])
def relative_to(self, other, /, *_deprecated, walk_up=False): def relative_to(self, other, /, *_deprecated, walk_up=False):
"""Return the relative path to another path identified by the passed """Return the relative path to another path identified by the passed
@ -551,7 +585,7 @@ class PurePath(object):
raise ValueError(f"{str(self)!r} and {str(other)!r} have different anchors") raise ValueError(f"{str(self)!r} and {str(other)!r} have different anchors")
if step and not walk_up: if step and not walk_up:
raise ValueError(f"{str(self)!r} is not in the subpath of {str(other)!r}") raise ValueError(f"{str(self)!r} is not in the subpath of {str(other)!r}")
parts = ('..',) * step + self.parts[len(path.parts):] parts = ['..'] * step + self._tail[len(path._tail):]
return path_cls(*parts) return path_cls(*parts)
def is_relative_to(self, other, /, *_deprecated): def is_relative_to(self, other, /, *_deprecated):
@ -570,13 +604,10 @@ class PurePath(object):
def parts(self): def parts(self):
"""An object providing sequence-like access to the """An object providing sequence-like access to the
components in the filesystem path.""" components in the filesystem path."""
# We cache the tuple to avoid building a new one each time .parts if self.drive or self.root:
# is accessed. XXX is this necessary? return (self.drive + self.root,) + tuple(self._tail)
try: else:
return self._parts_tuple return tuple(self._tail)
except AttributeError:
self._parts_tuple = tuple(self._parts)
return self._parts_tuple
def joinpath(self, *args): def joinpath(self, *args):
"""Combine this path with one or several arguments, and return a """Combine this path with one or several arguments, and return a
@ -603,10 +634,10 @@ class PurePath(object):
"""The logical parent of the path.""" """The logical parent of the path."""
drv = self.drive drv = self.drive
root = self.root root = self.root
parts = self._parts tail = self._tail
if len(parts) == 1 and (drv or root): if not tail:
return self return self
return self._from_parsed_parts(drv, root, parts[:-1]) return self._from_parsed_parts(drv, root, tail[:-1])
@property @property
def parents(self): def parents(self):
@ -624,29 +655,29 @@ class PurePath(object):
def is_reserved(self): def is_reserved(self):
"""Return True if the path contains one of the special names reserved """Return True if the path contains one of the special names reserved
by the system, if any.""" by the system, if any."""
if self._flavour is posixpath or not self._parts: if self._flavour is posixpath or not self._tail:
return False return False
# NOTE: the rules for reserved names seem somewhat complicated # NOTE: the rules for reserved names seem somewhat complicated
# (e.g. r"..\NUL" is reserved but not r"foo\NUL" if "foo" does not # (e.g. r"..\NUL" is reserved but not r"foo\NUL" if "foo" does not
# exist). We err on the side of caution and return True for paths # exist). We err on the side of caution and return True for paths
# which are not considered reserved by Windows. # which are not considered reserved by Windows.
if self._parts[0].startswith('\\\\'): if self.drive.startswith('\\\\'):
# UNC paths are never reserved. # UNC paths are never reserved.
return False return False
name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') name = self._tail[-1].partition('.')[0].partition(':')[0].rstrip(' ')
return name.upper() in _WIN_RESERVED_NAMES return name.upper() in _WIN_RESERVED_NAMES
def match(self, path_pattern): def match(self, path_pattern):
""" """
Return True if this path matches the given pattern. Return True if this path matches the given pattern.
""" """
path_pattern = self._flavour.normcase(path_pattern) pat = type(self)(path_pattern)
drv, root, pat_parts = self._parse_path(path_pattern) if not pat.parts:
if not pat_parts:
raise ValueError("empty pattern") raise ValueError("empty pattern")
pat_parts = pat._parts_normcase
parts = self._parts_normcase parts = self._parts_normcase
if drv or root: if pat.drive or pat.root:
if len(pat_parts) != len(parts): if len(pat_parts) != len(parts):
return False return False
elif len(pat_parts) > len(parts): elif len(pat_parts) > len(parts):
@ -707,11 +738,21 @@ class Path(PurePath):
cls = WindowsPath if os.name == 'nt' else PosixPath cls = WindowsPath if os.name == 'nt' else PosixPath
return object.__new__(cls) return object.__new__(cls)
def _make_child_relpath(self, part): def _make_child_relpath(self, name):
# This is an optimization used for dir walking. `part` must be path_str = str(self)
# a single part relative to this path. tail = self._tail
parts = self._parts + [part] if tail:
return self._from_parsed_parts(self.drive, self.root, parts) path_str = f'{path_str}{self._flavour.sep}{name}'
elif path_str != '.':
path_str = f'{path_str}{name}'
else:
path_str = name
path = type(self)(path_str)
path._str = path_str
path._drv = self.drive
path._root = self.root
path._tail_cached = tail + [name]
return path
def __enter__(self): def __enter__(self):
# In previous versions of pathlib, __exit__() marked this path as # In previous versions of pathlib, __exit__() marked this path as
@ -1196,12 +1237,12 @@ class Path(PurePath):
(as returned by os.path.expanduser) (as returned by os.path.expanduser)
""" """
if (not (self.drive or self.root) and if (not (self.drive or self.root) and
self._parts and self._parts[0][:1] == '~'): self._tail and self._tail[0][:1] == '~'):
homedir = self._flavour.expanduser(self._parts[0]) homedir = self._flavour.expanduser(self._tail[0])
if homedir[:1] == "~": if homedir[:1] == "~":
raise RuntimeError("Could not determine home directory.") raise RuntimeError("Could not determine home directory.")
drv, root, parts = self._parse_path(homedir) drv, root, tail = self._parse_path(homedir)
return self._from_parsed_parts(drv, root, parts + self._parts[1:]) return self._from_parsed_parts(drv, root, tail + self._tail[1:])
return self return self

View file

@ -346,8 +346,6 @@ class _BasePurePathTest(object):
p = P('a/b') p = P('a/b')
parts = p.parts parts = p.parts
self.assertEqual(parts, ('a', 'b')) self.assertEqual(parts, ('a', 'b'))
# The object gets reused.
self.assertIs(parts, p.parts)
# When the path is absolute, the anchor is a separate part. # When the path is absolute, the anchor is a separate part.
p = P('/a/b') p = P('/a/b')
parts = p.parts parts = p.parts

View file

@ -0,0 +1,2 @@
Speed up :class:`pathlib.Path` construction by omitting the path anchor from
the internal list of path parts.