Uh oh!
There was an error while loading. Please reload this page.
- Notifications
You must be signed in to change notification settings - Fork 33.9k
GH-115060: Speed up pathlib.Path.glob() by removing redundant regex matching #115061
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Uh oh!
There was an error while loading. Please reload this page.
Merged
Changes from all commits
Commits
Show all changes
10 commits Select commit Hold shift + click to select a range
54c5aa5 GH-115060: Speed up `pathlib.Path.glob()` by removing redundant regex…
barneygale 6abb80d Match against os.DirEntry.path in _select_recursive()
barneygale b382e40 Matching against dot-prefixed path is fine (and faster!)
barneygale e1472fc Revert "Matching against dot-prefixed path is fine (and faster!)"
barneygale 284c42e Skip computing prefix len when not matching
barneygale 169b1e7 Rename `prefix_len` --> `parent_len` for clarity.
barneygale 1c4184f Comments, naming.
barneygale 2873ed8 segment --> component
barneygale 90d5a12 Test post-`**` matching when globbing `.`.
barneygale a40924b Couple more test cases
barneygale File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Uh oh!
There was an error while loading. Please reload this page.
Jump to
Jump to file
Failed to load files.
Loading
Uh oh!
There was an error while loading. Please reload this page.
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -86,19 +86,29 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match): | ||
| continue | ||
| except OSError: | ||
| continue | ||
| if match(entry.name): | ||
| yield parent_path._make_child_entry(entry) | ||
| # Avoid cost of making a path object for non-matching paths by | ||
| # matching against the os.DirEntry.name string. | ||
| if match is None or match(entry.name): | ||
| yield parent_path._make_child_direntry(entry) | ||
| def _select_recursive(parent_paths, dir_only, follow_symlinks): | ||
| """Yield given paths and all their subdirectories, recursively.""" | ||
| def _select_recursive(parent_paths, dir_only, follow_symlinks, match): | ||
| """Yield given paths and all their children, recursively, filtering by | ||
| string and type. | ||
| """ | ||
| if follow_symlinks is None: | ||
| follow_symlinks = False | ||
| for parent_path in parent_paths: | ||
| if match is not None: | ||
| # If we're filtering paths through a regex, record the length of | ||
| # the parent path. We'll pass it to match(path, pos=...) later. | ||
| parent_len = len(str(parent_path._make_child_relpath('_'))) - 1 | ||
| paths = [parent_path._make_child_relpath('')] | ||
| while paths: | ||
| path = paths.pop() | ||
| yield path | ||
| if match is None or match(str(path), parent_len): | ||
| # Yield *directory* path that matches pattern (if any). | ||
| yield path | ||
| try: | ||
| # We must close the scandir() object before proceeding to | ||
| # avoid exhausting file descriptors when globbing deep trees. | ||
| @@ -108,14 +118,22 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks): | ||
| pass | ||
| else: | ||
| for entry in entries: | ||
| # Handle directory entry. | ||
| try: | ||
| if entry.is_dir(follow_symlinks=follow_symlinks): | ||
| paths.append(path._make_child_entry(entry)) | ||
| # Recurse into this directory. | ||
| paths.append(path._make_child_direntry(entry)) | ||
| continue | ||
| except OSError: | ||
| pass | ||
| # Handle file entry. | ||
| if not dir_only: | ||
| yield path._make_child_entry(entry) | ||
| # Avoid cost of making a path object for non-matching | ||
| # files by matching against the os.DirEntry object. | ||
| if match is None or match(path._direntry_str(entry), parent_len): | ||
| # Yield *file* path that matches pattern (if any). | ||
| yield path._make_child_direntry(entry) | ||
| def _select_unique(paths): | ||
| @@ -750,8 +768,14 @@ def _scandir(self): | ||
| from contextlib import nullcontext | ||
| return nullcontext(self.iterdir()) | ||
| def _make_child_entry(self, entry): | ||
| def _direntry_str(self, entry): | ||
| # Transform an entry yielded from _scandir() into a path string. | ||
| # PathBase._scandir() yields PathBase objects, so use str(). | ||
| return str(entry) | ||
| def _make_child_direntry(self, entry): | ||
| # Transform an entry yielded from _scandir() into a path object. | ||
| # PathBase._scandir() yields PathBase objects, so this is a no-op. | ||
| return entry | ||
| def _make_child_relpath(self, name): | ||
| @@ -769,43 +793,49 @@ def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None): | ||
| stack = pattern._pattern_stack | ||
| specials = ('', '.', '..') | ||
| filter_paths = False | ||
| deduplicate_paths = False | ||
| sep = self.pathmod.sep | ||
| paths = iter([self] if self.is_dir() else []) | ||
| while stack: | ||
| part = stack.pop() | ||
| if part in specials: | ||
| # Join special component (e.g. '..') onto paths. | ||
| paths = _select_special(paths, part) | ||
| elif part == '**': | ||
| # Consume adjacent '**' components. | ||
| # Consume following '**' components, which have no effect. | ||
| while stack and stack[-1] == '**': | ||
| stack.pop() | ||
| # Consume adjacent non-special components and enable post-walk | ||
| # regex filtering, provided we're treating symlinks consistently. | ||
| # Consume following non-special components, provided we're | ||
| # treating symlinks consistently. Each component is joined | ||
| # onto 'part', which is used to generate an re.Pattern object. | ||
| if follow_symlinks is not None: | ||
| while stack and stack[-1] not in specials: | ||
| filter_paths = True | ||
| stack.pop() | ||
| part += sep + stack.pop() | ||
| dir_only = bool(stack) | ||
| paths = _select_recursive(paths, dir_only, follow_symlinks) | ||
| # If the previous loop consumed pattern components, compile an | ||
| # re.Pattern object based on those components. | ||
| match = _compile_pattern(part, sep, case_sensitive) if part != '**' else None | ||
barneygale marked this conversation as resolved. Show resolved Hide resolved Uh oh! There was an error while loading. Please reload this page. | ||
| # Recursively walk directories, filtering by type and regex. | ||
| paths = _select_recursive(paths, bool(stack), follow_symlinks, match) | ||
| # De-duplicate if we've already seen a '**' component. | ||
| if deduplicate_paths: | ||
| # De-duplicate if we've already seen a '**' component. | ||
| paths = _select_unique(paths) | ||
| deduplicate_paths = True | ||
| elif '**' in part: | ||
| raise ValueError("Invalid pattern: '**' can only be an entire path component") | ||
| else: | ||
| dir_only = bool(stack) | ||
| match = _compile_pattern(part, sep, case_sensitive) | ||
| paths = _select_children(paths, dir_only, follow_symlinks, match) | ||
| if filter_paths: | ||
| # Filter out paths that don't match pattern. | ||
| prefix_len = len(str(self._make_child_relpath('_'))) - 1 | ||
| match = _compile_pattern(pattern._pattern_str, sep, case_sensitive) | ||
| paths = (path for path in paths if match(path._pattern_str, prefix_len)) | ||
| # If the pattern component isn't '*', compile an re.Pattern | ||
| # object based on the component. | ||
| match = _compile_pattern(part, sep, case_sensitive) if part != '*' else None | ||
barneygale marked this conversation as resolved. Show resolved Hide resolved Uh oh! There was an error while loading. Please reload this page. | ||
| # Iterate over directories' children filtering by type and regex. | ||
| paths = _select_children(paths, bool(stack), follow_symlinks, match) | ||
| return paths | ||
| def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None): | ||
| @@ -854,7 +884,7 @@ def walk(self, top_down=True, on_error=None, follow_symlinks=False): | ||
| if is_dir: | ||
| if not top_down: | ||
| paths.append(path._make_child_entry(entry)) | ||
| paths.append(path._make_child_direntry(entry)) | ||
| dirnames.append(entry.name) | ||
| else: | ||
| filenames.append(entry.name) | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
1 change: 1 addition & 0 deletions 1 Misc/NEWS.d/next/Library/2024-02-06-03-55-46.gh-issue-115060.EkWRpP.rst
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| Speed up :meth:`pathlib.Path.glob` by removing redundant regex matching. |
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.