From 78cb377c622a98b1bf58df40c828e886575a6927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?B=C3=A9n=C3=A9dikt=20Tran?= <10796600+picnixz@users.noreply.github.com> Date: Wed, 27 Nov 2024 17:42:45 +0100 Subject: [PATCH] gh-122288: Improve performances of `fnmatch.translate` (#122289) Improve performance of this function by a factor of 1.7x. Co-authored-by: Barney Gale --- Lib/fnmatch.py | 86 +++++++++---------- Lib/glob.py | 2 +- Lib/test/test_fnmatch.py | 69 +++++++++++++++ ...-07-25-18-06-51.gh-issue-122288.-_xxOR.rst | 2 + 4 files changed, 112 insertions(+), 47 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 73acb1fe8d4..865baea2346 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -77,24 +77,30 @@ def translate(pat): There is no way to quote meta-characters. """ - STAR = object() - parts = _translate(pat, STAR, '.') - return _join_translated_parts(parts, STAR) + parts, star_indices = _translate(pat, '*', '.') + return _join_translated_parts(parts, star_indices) +_re_setops_sub = re.compile(r'([&~|])').sub +_re_escape = functools.lru_cache(maxsize=512)(re.escape) -def _translate(pat, STAR, QUESTION_MARK): +def _translate(pat, star, question_mark): res = [] add = res.append + star_indices = [] + i, n = 0, len(pat) while i < n: c = pat[i] i = i+1 if c == '*': + # store the position of the wildcard + star_indices.append(len(res)) + add(star) # compress consecutive `*` into one - if (not res) or res[-1] is not STAR: - add(STAR) + while i < n and pat[i] == '*': + i += 1 elif c == '?': - add(QUESTION_MARK) + add(question_mark) elif c == '[': j = i if j < n and pat[j] == '!': @@ -133,8 +139,6 @@ def _translate(pat, STAR, QUESTION_MARK): # Hyphens that create ranges shouldn't be escaped. stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') for s in chunks) - # Escape set operations (&&, ~~ and ||). - stuff = re.sub(r'([&~|])', r'\\\1', stuff) i = j+1 if not stuff: # Empty range: never match. @@ -143,50 +147,40 @@ def _translate(pat, STAR, QUESTION_MARK): # Negated empty range: match any character. add('.') else: + # Escape set operations (&&, ~~ and ||). + stuff = _re_setops_sub(r'\\\1', stuff) if stuff[0] == '!': stuff = '^' + stuff[1:] elif stuff[0] in ('^', '['): stuff = '\\' + stuff add(f'[{stuff}]') else: - add(re.escape(c)) + add(_re_escape(c)) assert i == n - return res + return res, star_indices -def _join_translated_parts(inp, STAR): - # Deal with STARs. - res = [] - add = res.append - i, n = 0, len(inp) - # Fixed pieces at the start? - while i < n and inp[i] is not STAR: - add(inp[i]) - i += 1 - # Now deal with STAR fixed STAR fixed ... - # For an interior `STAR fixed` pairing, we want to do a minimal - # .*? match followed by `fixed`, with no possibility of backtracking. - # Atomic groups ("(?>...)") allow us to spell that directly. - # Note: people rely on the undocumented ability to join multiple - # translate() results together via "|" to build large regexps matching - # "one of many" shell patterns. - while i < n: - assert inp[i] is STAR - i += 1 - if i == n: - add(".*") - break - assert inp[i] is not STAR - fixed = [] - while i < n and inp[i] is not STAR: - fixed.append(inp[i]) - i += 1 - fixed = "".join(fixed) - if i == n: - add(".*") - add(fixed) - else: - add(f"(?>.*?{fixed})") - assert i == n - res = "".join(res) +def _join_translated_parts(parts, star_indices): + if not star_indices: + return fr'(?s:{"".join(parts)})\Z' + iter_star_indices = iter(star_indices) + j = next(iter_star_indices) + buffer = parts[:j] # fixed pieces at the start + append, extend = buffer.append, buffer.extend + i = j + 1 + for j in iter_star_indices: + # Now deal with STAR fixed STAR fixed ... + # For an interior `STAR fixed` pairing, we want to do a minimal + # .*? match followed by `fixed`, with no possibility of backtracking. + # Atomic groups ("(?>...)") allow us to spell that directly. + # Note: people rely on the undocumented ability to join multiple + # translate() results together via "|" to build large regexps matching + # "one of many" shell patterns. + append('(?>.*?') + extend(parts[i:j]) + append(')') + i = j + 1 + append('.*') + extend(parts[i:]) + res = ''.join(buffer) return fr'(?s:{res})\Z' diff --git a/Lib/glob.py b/Lib/glob.py index ce9b3698888..690ab1b8b9f 100644 --- a/Lib/glob.py +++ b/Lib/glob.py @@ -312,7 +312,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None): if part: if not include_hidden and part[0] in '*?': results.append(r'(?!\.)') - results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)) + results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0]) if idx < last_part_idx: results.append(any_sep) res = ''.join(results) diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index 10ed496d4e2..9f360e1dc10 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -250,6 +250,75 @@ def test_translate(self): self.assertTrue(re.match(fatre, 'cbabcaxc')) self.assertFalse(re.match(fatre, 'dabccbad')) + def test_translate_wildcards(self): + for pattern, expect in [ + ('ab*', r'(?s:ab.*)\Z'), + ('ab*cd', r'(?s:ab.*cd)\Z'), + ('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'), + ('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'), + ('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'), + ('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'), + ('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'), + ]: + with self.subTest(pattern): + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) + + for pattern, expect in [ + ('*ab', r'(?s:.*ab)\Z'), + ('*ab*', r'(?s:(?>.*?ab).*)\Z'), + ('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'), + ('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'), + ('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'), + ('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'), + ('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'), + ('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'), + ]: + with self.subTest(pattern): + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) + + def test_translate_expressions(self): + for pattern, expect in [ + ('[', r'(?s:\[)\Z'), + ('[!', r'(?s:\[!)\Z'), + ('[]', r'(?s:\[\])\Z'), + ('[abc', r'(?s:\[abc)\Z'), + ('[!abc', r'(?s:\[!abc)\Z'), + ('[abc]', r'(?s:[abc])\Z'), + ('[!abc]', r'(?s:[^abc])\Z'), + ('[!abc][!def]', r'(?s:[^abc][^def])\Z'), + # with [[ + ('[[', r'(?s:\[\[)\Z'), + ('[[a', r'(?s:\[\[a)\Z'), + ('[[]', r'(?s:[\[])\Z'), + ('[[]a', r'(?s:[\[]a)\Z'), + ('[[]]', r'(?s:[\[]\])\Z'), + ('[[]a]', r'(?s:[\[]a\])\Z'), + ('[[a]', r'(?s:[\[a])\Z'), + ('[[a]]', r'(?s:[\[a]\])\Z'), + ('[[a]b', r'(?s:[\[a]b)\Z'), + # backslashes + ('[\\', r'(?s:\[\\)\Z'), + (r'[\]', r'(?s:[\\])\Z'), + (r'[\\]', r'(?s:[\\\\])\Z'), + ]: + with self.subTest(pattern): + translated = translate(pattern) + self.assertEqual(translated, expect, pattern) + + def test_star_indices_locations(self): + from fnmatch import _translate + + blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a'] + parts, star_indices = _translate(''.join(blocks), '*', '.') + expect_parts = ['a', r'\^', 'b', '*', + '.', '.', '[a-z]', '[1-9]', '*', + r'\+', r'\+', r'\[', r'\[', 'a'] + self.assertListEqual(parts, expect_parts) + self.assertListEqual(star_indices, [3, 8]) + + class FilterTestCase(unittest.TestCase): def test_filter(self): diff --git a/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst new file mode 100644 index 00000000000..26a18afca94 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst @@ -0,0 +1,2 @@ +Improve the performances of :func:`fnmatch.translate` by a factor 1.7. Patch +by Bénédikt Tran.