gh-122288: Improve performance of `fnmatch.translate` (#122289)

Improve performance of this function by a factor of 1.7.

Co-authored-by: Barney Gale <barney.gale@gmail.com>
This commit is contained in:
Bénédikt Tran 2024-11-27 17:42:45 +01:00 committed by GitHub
parent 14a05a8f43
commit 78cb377c62
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 112 additions and 47 deletions

View File

@ -77,24 +77,30 @@ def translate(pat):
There is no way to quote meta-characters.
"""
STAR = object()
parts = _translate(pat, STAR, '.')
return _join_translated_parts(parts, STAR)
parts, star_indices = _translate(pat, '*', '.')
return _join_translated_parts(parts, star_indices)
_re_setops_sub = re.compile(r'([&~|])').sub
_re_escape = functools.lru_cache(maxsize=512)(re.escape)
def _translate(pat, STAR, QUESTION_MARK):
def _translate(pat, star, question_mark):
res = []
add = res.append
star_indices = []
i, n = 0, len(pat)
while i < n:
c = pat[i]
i = i+1
if c == '*':
# store the position of the wildcard
star_indices.append(len(res))
add(star)
# compress consecutive `*` into one
if (not res) or res[-1] is not STAR:
add(STAR)
while i < n and pat[i] == '*':
i += 1
elif c == '?':
add(QUESTION_MARK)
add(question_mark)
elif c == '[':
j = i
if j < n and pat[j] == '!':
@ -133,8 +139,6 @@ def _translate(pat, STAR, QUESTION_MARK):
# Hyphens that create ranges shouldn't be escaped.
stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
for s in chunks)
# Escape set operations (&&, ~~ and ||).
stuff = re.sub(r'([&~|])', r'\\\1', stuff)
i = j+1
if not stuff:
# Empty range: never match.
@ -143,50 +147,40 @@ def _translate(pat, STAR, QUESTION_MARK):
# Negated empty range: match any character.
add('.')
else:
# Escape set operations (&&, ~~ and ||).
stuff = _re_setops_sub(r'\\\1', stuff)
if stuff[0] == '!':
stuff = '^' + stuff[1:]
elif stuff[0] in ('^', '['):
stuff = '\\' + stuff
add(f'[{stuff}]')
else:
add(re.escape(c))
add(_re_escape(c))
assert i == n
return res
return res, star_indices
def _join_translated_parts(inp, STAR):
# Deal with STARs.
res = []
add = res.append
i, n = 0, len(inp)
# Fixed pieces at the start?
while i < n and inp[i] is not STAR:
add(inp[i])
i += 1
# Now deal with STAR fixed STAR fixed ...
# For an interior `STAR fixed` pairing, we want to do a minimal
# .*? match followed by `fixed`, with no possibility of backtracking.
# Atomic groups ("(?>...)") allow us to spell that directly.
# Note: people rely on the undocumented ability to join multiple
# translate() results together via "|" to build large regexps matching
# "one of many" shell patterns.
while i < n:
assert inp[i] is STAR
i += 1
if i == n:
add(".*")
break
assert inp[i] is not STAR
fixed = []
while i < n and inp[i] is not STAR:
fixed.append(inp[i])
i += 1
fixed = "".join(fixed)
if i == n:
add(".*")
add(fixed)
else:
add(f"(?>.*?{fixed})")
assert i == n
res = "".join(res)
def _join_translated_parts(parts, star_indices):
if not star_indices:
return fr'(?s:{"".join(parts)})\Z'
iter_star_indices = iter(star_indices)
j = next(iter_star_indices)
buffer = parts[:j] # fixed pieces at the start
append, extend = buffer.append, buffer.extend
i = j + 1
for j in iter_star_indices:
# Now deal with STAR fixed STAR fixed ...
# For an interior `STAR fixed` pairing, we want to do a minimal
# .*? match followed by `fixed`, with no possibility of backtracking.
# Atomic groups ("(?>...)") allow us to spell that directly.
# Note: people rely on the undocumented ability to join multiple
# translate() results together via "|" to build large regexps matching
# "one of many" shell patterns.
append('(?>.*?')
extend(parts[i:j])
append(')')
i = j + 1
append('.*')
extend(parts[i:])
res = ''.join(buffer)
return fr'(?s:{res})\Z'

View File

@ -312,7 +312,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
if part:
if not include_hidden and part[0] in '*?':
results.append(r'(?!\.)')
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0])
if idx < last_part_idx:
results.append(any_sep)
res = ''.join(results)

View File

@ -250,6 +250,75 @@ def test_translate(self):
self.assertTrue(re.match(fatre, 'cbabcaxc'))
self.assertFalse(re.match(fatre, 'dabccbad'))
def test_translate_wildcards(self):
    # Patterns that begin with a literal prefix before the first `*`.
    literal_first = [
        ('ab*', r'(?s:ab.*)\Z'),
        ('ab*cd', r'(?s:ab.*cd)\Z'),
        ('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'),
        ('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'),
        ('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'),
        ('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'),
        ('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
    ]
    # The same shapes, but with a leading `*`.
    star_first = [
        ('*ab', r'(?s:.*ab)\Z'),
        ('*ab*', r'(?s:(?>.*?ab).*)\Z'),
        ('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'),
        ('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'),
        ('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'),
        ('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'),
        ('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'),
        ('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
    ]
    for cases in (literal_first, star_first):
        for pattern, expect in cases:
            with self.subTest(pattern):
                self.assertEqual(translate(pattern), expect, pattern)
def test_translate_expressions(self):
    # Each entry pairs a shell pattern with the exact regex expected
    # from translate().
    cases = (
        # unterminated and simple bracket expressions
        ('[', r'(?s:\[)\Z'),
        ('[!', r'(?s:\[!)\Z'),
        ('[]', r'(?s:\[\])\Z'),
        ('[abc', r'(?s:\[abc)\Z'),
        ('[!abc', r'(?s:\[!abc)\Z'),
        ('[abc]', r'(?s:[abc])\Z'),
        ('[!abc]', r'(?s:[^abc])\Z'),
        ('[!abc][!def]', r'(?s:[^abc][^def])\Z'),
        # with [[
        ('[[', r'(?s:\[\[)\Z'),
        ('[[a', r'(?s:\[\[a)\Z'),
        ('[[]', r'(?s:[\[])\Z'),
        ('[[]a', r'(?s:[\[]a)\Z'),
        ('[[]]', r'(?s:[\[]\])\Z'),
        ('[[]a]', r'(?s:[\[]a\])\Z'),
        ('[[a]', r'(?s:[\[a])\Z'),
        ('[[a]]', r'(?s:[\[a]\])\Z'),
        ('[[a]b', r'(?s:[\[a]b)\Z'),
        # backslashes
        ('[\\', r'(?s:\[\\)\Z'),
        (r'[\]', r'(?s:[\\])\Z'),
        (r'[\\]', r'(?s:[\\\\])\Z'),
    )
    for pattern, expect in cases:
        with self.subTest(pattern):
            self.assertEqual(translate(pattern), expect, pattern)
def test_star_indices_locations(self):
    from fnmatch import _translate

    # Pattern pieces; the run of `*` is expected to collapse to a single
    # wildcard entry in the translated parts.
    segments = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a']
    pattern = ''.join(segments)
    parts, star_indices = _translate(pattern, '*', '.')

    wanted_parts = [
        'a', r'\^', 'b', '*',
        '.', '.', '[a-z]', '[1-9]', '*',
        r'\+', r'\+', r'\[', r'\[', 'a',
    ]
    self.assertListEqual(parts, wanted_parts)
    # The wildcards sit at offsets 3 and 8 of the parts list.
    self.assertListEqual(star_indices, [3, 8])
class FilterTestCase(unittest.TestCase):
def test_filter(self):

View File

@ -0,0 +1,2 @@
Improve the performance of :func:`fnmatch.translate` by a factor of 1.7. Patch
by Bénédikt Tran.