gh-122288: Improve performances of `fnmatch.translate` (#122289)

Improve performance of this function by a factor of 1.7x. Co-authored-by: Barney Gale <barney.gale@gmail.com>
2024-11-27 17:42:45 +01:00 · 2024-11-27 17:42:45 +01:00 · 78cb377c62
parent 14a05a8f43
commit 78cb377c62
4 changed files with 112 additions and 47 deletions
--- a/Lib/fnmatch.py
+++ b/Lib/fnmatch.py
@ -77,24 +77,30 @@ def translate(pat):
    There is no way to quote meta-characters.
    """

-    STAR = object()
-    parts = _translate(pat, STAR, '.')
-    return _join_translated_parts(parts, STAR)
+    parts, star_indices = _translate(pat, '*', '.')
+    return _join_translated_parts(parts, star_indices)

+_re_setops_sub = re.compile(r'([&~|])').sub
+_re_escape = functools.lru_cache(maxsize=512)(re.escape)

-def _translate(pat, STAR, QUESTION_MARK):
+def _translate(pat, star, question_mark):
    res = []
    add = res.append
+    star_indices = []
+
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i+1
        if c == '*':
+            # store the position of the wildcard
+            star_indices.append(len(res))
+            add(star)
            # compress consecutive `*` into one
-            if (not res) or res[-1] is not STAR:
-                add(STAR)
+            while i < n and pat[i] == '*':
+                i += 1
        elif c == '?':
-            add(QUESTION_MARK)
+            add(question_mark)
        elif c == '[':
            j = i
            if j < n and pat[j] == '!':
@ -133,8 +139,6 @@ def _translate(pat, STAR, QUESTION_MARK):
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
                                     for s in chunks)
-                # Escape set operations (&&, ~~ and ||).
-                stuff = re.sub(r'([&~|])', r'\\\1', stuff)
                i = j+1
                if not stuff:
                    # Empty range: never match.
@ -143,50 +147,40 @@ def _translate(pat, STAR, QUESTION_MARK):
                    # Negated empty range: match any character.
                    add('.')
                else:
+                    # Escape set operations (&&, ~~ and ||).
+                    stuff = _re_setops_sub(r'\\\1', stuff)
                    if stuff[0] == '!':
                        stuff = '^' + stuff[1:]
                    elif stuff[0] in ('^', '['):
                        stuff = '\\' + stuff
                    add(f'[{stuff}]')
        else:
-            add(re.escape(c))
+            add(_re_escape(c))
    assert i == n
-    return res
+    return res, star_indices


-def _join_translated_parts(inp, STAR):
-    # Deal with STARs.
-    res = []
-    add = res.append
-    i, n = 0, len(inp)
-    # Fixed pieces at the start?
-    while i < n and inp[i] is not STAR:
-        add(inp[i])
-        i += 1
-    # Now deal with STAR fixed STAR fixed ...
-    # For an interior `STAR fixed` pairing, we want to do a minimal
-    # .*? match followed by `fixed`, with no possibility of backtracking.
-    # Atomic groups ("(?>...)") allow us to spell that directly.
-    # Note: people rely on the undocumented ability to join multiple
-    # translate() results together via "|" to build large regexps matching
-    # "one of many" shell patterns.
-    while i < n:
-        assert inp[i] is STAR
-        i += 1
-        if i == n:
-            add(".*")
-            break
-        assert inp[i] is not STAR
-        fixed = []
-        while i < n and inp[i] is not STAR:
-            fixed.append(inp[i])
-            i += 1
-        fixed = "".join(fixed)
-        if i == n:
-            add(".*")
-            add(fixed)
-        else:
-            add(f"(?>.*?{fixed})")
-    assert i == n
-    res = "".join(res)
+def _join_translated_parts(parts, star_indices):
+    if not star_indices:
+        return fr'(?s:{"".join(parts)})\Z'
+    iter_star_indices = iter(star_indices)
+    j = next(iter_star_indices)
+    buffer = parts[:j]  # fixed pieces at the start
+    append, extend = buffer.append, buffer.extend
+    i = j + 1
+    for j in iter_star_indices:
+        # Now deal with STAR fixed STAR fixed ...
+        # For an interior `STAR fixed` pairing, we want to do a minimal
+        # .*? match followed by `fixed`, with no possibility of backtracking.
+        # Atomic groups ("(?>...)") allow us to spell that directly.
+        # Note: people rely on the undocumented ability to join multiple
+        # translate() results together via "|" to build large regexps matching
+        # "one of many" shell patterns.
+        append('(?>.*?')
+        extend(parts[i:j])
+        append(')')
+        i = j + 1
+    append('.*')
+    extend(parts[i:])
+    res = ''.join(buffer)
    return fr'(?s:{res})\Z'
--- a/Lib/glob.py
+++ b/Lib/glob.py
@ -312,7 +312,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
            if part:
                if not include_hidden and part[0] in '*?':
                    results.append(r'(?!\.)')
-                results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
+                results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0])
            if idx < last_part_idx:
                results.append(any_sep)
    res = ''.join(results)
--- a/Lib/test/test_fnmatch.py
+++ b/Lib/test/test_fnmatch.py
@ -250,6 +250,75 @@ def test_translate(self):
        self.assertTrue(re.match(fatre, 'cbabcaxc'))
        self.assertFalse(re.match(fatre, 'dabccbad'))

+    def test_translate_wildcards(self):
+        for pattern, expect in [
+            ('ab*', r'(?s:ab.*)\Z'),
+            ('ab*cd', r'(?s:ab.*cd)\Z'),
+            ('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'),
+            ('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'),
+            ('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'),
+            ('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'),
+            ('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
+        ]:
+            with self.subTest(pattern):
+                translated = translate(pattern)
+                self.assertEqual(translated, expect, pattern)
+
+        for pattern, expect in [
+            ('*ab', r'(?s:.*ab)\Z'),
+            ('*ab*', r'(?s:(?>.*?ab).*)\Z'),
+            ('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'),
+            ('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'),
+            ('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'),
+            ('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'),
+            ('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'),
+            ('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
+        ]:
+            with self.subTest(pattern):
+                translated = translate(pattern)
+                self.assertEqual(translated, expect, pattern)
+
+    def test_translate_expressions(self):
+        for pattern, expect in [
+            ('[', r'(?s:\[)\Z'),
+            ('[!', r'(?s:\[!)\Z'),
+            ('[]', r'(?s:\[\])\Z'),
+            ('[abc', r'(?s:\[abc)\Z'),
+            ('[!abc', r'(?s:\[!abc)\Z'),
+            ('[abc]', r'(?s:[abc])\Z'),
+            ('[!abc]', r'(?s:[^abc])\Z'),
+            ('[!abc][!def]', r'(?s:[^abc][^def])\Z'),
+            # with [[
+            ('[[', r'(?s:\[\[)\Z'),
+            ('[[a', r'(?s:\[\[a)\Z'),
+            ('[[]', r'(?s:[\[])\Z'),
+            ('[[]a', r'(?s:[\[]a)\Z'),
+            ('[[]]', r'(?s:[\[]\])\Z'),
+            ('[[]a]', r'(?s:[\[]a\])\Z'),
+            ('[[a]', r'(?s:[\[a])\Z'),
+            ('[[a]]', r'(?s:[\[a]\])\Z'),
+            ('[[a]b', r'(?s:[\[a]b)\Z'),
+            # backslashes
+            ('[\\', r'(?s:\[\\)\Z'),
+            (r'[\]', r'(?s:[\\])\Z'),
+            (r'[\\]', r'(?s:[\\\\])\Z'),
+        ]:
+            with self.subTest(pattern):
+                translated = translate(pattern)
+                self.assertEqual(translated, expect, pattern)
+
+    def test_star_indices_locations(self):
+        from fnmatch import _translate
+
+        blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a']
+        parts, star_indices = _translate(''.join(blocks), '*', '.')
+        expect_parts = ['a', r'\^', 'b', '*',
+                        '.', '.', '[a-z]', '[1-9]', '*',
+                        r'\+', r'\+', r'\[', r'\[', 'a']
+        self.assertListEqual(parts, expect_parts)
+        self.assertListEqual(star_indices, [3, 8])
+
+
 class FilterTestCase(unittest.TestCase):

    def test_filter(self):
--- a/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst
+++ b/Misc/NEWS.d/next/Library/2024-07-25-18-06-51.gh-issue-122288.-_xxOR.rst
@ -0,0 +1,2 @@
+Improve the performances of :func:`fnmatch.translate` by a factor 1.7. Patch
+by Bénédikt Tran.