mirror of https://github.com/python/cpython.git
gh-122288: Improve performances of `fnmatch.translate` (#122289)
Improve performance of this function by a factor of 1.7x. Co-authored-by: Barney Gale <barney.gale@gmail.com>
This commit is contained in:
parent
14a05a8f43
commit
78cb377c62
|
@ -77,24 +77,30 @@ def translate(pat):
|
|||
There is no way to quote meta-characters.
|
||||
"""
|
||||
|
||||
STAR = object()
|
||||
parts = _translate(pat, STAR, '.')
|
||||
return _join_translated_parts(parts, STAR)
|
||||
parts, star_indices = _translate(pat, '*', '.')
|
||||
return _join_translated_parts(parts, star_indices)
|
||||
|
||||
_re_setops_sub = re.compile(r'([&~|])').sub
|
||||
_re_escape = functools.lru_cache(maxsize=512)(re.escape)
|
||||
|
||||
def _translate(pat, STAR, QUESTION_MARK):
|
||||
def _translate(pat, star, question_mark):
|
||||
res = []
|
||||
add = res.append
|
||||
star_indices = []
|
||||
|
||||
i, n = 0, len(pat)
|
||||
while i < n:
|
||||
c = pat[i]
|
||||
i = i+1
|
||||
if c == '*':
|
||||
# store the position of the wildcard
|
||||
star_indices.append(len(res))
|
||||
add(star)
|
||||
# compress consecutive `*` into one
|
||||
if (not res) or res[-1] is not STAR:
|
||||
add(STAR)
|
||||
while i < n and pat[i] == '*':
|
||||
i += 1
|
||||
elif c == '?':
|
||||
add(QUESTION_MARK)
|
||||
add(question_mark)
|
||||
elif c == '[':
|
||||
j = i
|
||||
if j < n and pat[j] == '!':
|
||||
|
@ -133,8 +139,6 @@ def _translate(pat, STAR, QUESTION_MARK):
|
|||
# Hyphens that create ranges shouldn't be escaped.
|
||||
stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
|
||||
for s in chunks)
|
||||
# Escape set operations (&&, ~~ and ||).
|
||||
stuff = re.sub(r'([&~|])', r'\\\1', stuff)
|
||||
i = j+1
|
||||
if not stuff:
|
||||
# Empty range: never match.
|
||||
|
@ -143,50 +147,40 @@ def _translate(pat, STAR, QUESTION_MARK):
|
|||
# Negated empty range: match any character.
|
||||
add('.')
|
||||
else:
|
||||
# Escape set operations (&&, ~~ and ||).
|
||||
stuff = _re_setops_sub(r'\\\1', stuff)
|
||||
if stuff[0] == '!':
|
||||
stuff = '^' + stuff[1:]
|
||||
elif stuff[0] in ('^', '['):
|
||||
stuff = '\\' + stuff
|
||||
add(f'[{stuff}]')
|
||||
else:
|
||||
add(re.escape(c))
|
||||
add(_re_escape(c))
|
||||
assert i == n
|
||||
return res
|
||||
return res, star_indices
|
||||
|
||||
|
||||
def _join_translated_parts(inp, STAR):
|
||||
# Deal with STARs.
|
||||
res = []
|
||||
add = res.append
|
||||
i, n = 0, len(inp)
|
||||
# Fixed pieces at the start?
|
||||
while i < n and inp[i] is not STAR:
|
||||
add(inp[i])
|
||||
i += 1
|
||||
# Now deal with STAR fixed STAR fixed ...
|
||||
# For an interior `STAR fixed` pairing, we want to do a minimal
|
||||
# .*? match followed by `fixed`, with no possibility of backtracking.
|
||||
# Atomic groups ("(?>...)") allow us to spell that directly.
|
||||
# Note: people rely on the undocumented ability to join multiple
|
||||
# translate() results together via "|" to build large regexps matching
|
||||
# "one of many" shell patterns.
|
||||
while i < n:
|
||||
assert inp[i] is STAR
|
||||
i += 1
|
||||
if i == n:
|
||||
add(".*")
|
||||
break
|
||||
assert inp[i] is not STAR
|
||||
fixed = []
|
||||
while i < n and inp[i] is not STAR:
|
||||
fixed.append(inp[i])
|
||||
i += 1
|
||||
fixed = "".join(fixed)
|
||||
if i == n:
|
||||
add(".*")
|
||||
add(fixed)
|
||||
else:
|
||||
add(f"(?>.*?{fixed})")
|
||||
assert i == n
|
||||
res = "".join(res)
|
||||
def _join_translated_parts(parts, star_indices):
|
||||
if not star_indices:
|
||||
return fr'(?s:{"".join(parts)})\Z'
|
||||
iter_star_indices = iter(star_indices)
|
||||
j = next(iter_star_indices)
|
||||
buffer = parts[:j] # fixed pieces at the start
|
||||
append, extend = buffer.append, buffer.extend
|
||||
i = j + 1
|
||||
for j in iter_star_indices:
|
||||
# Now deal with STAR fixed STAR fixed ...
|
||||
# For an interior `STAR fixed` pairing, we want to do a minimal
|
||||
# .*? match followed by `fixed`, with no possibility of backtracking.
|
||||
# Atomic groups ("(?>...)") allow us to spell that directly.
|
||||
# Note: people rely on the undocumented ability to join multiple
|
||||
# translate() results together via "|" to build large regexps matching
|
||||
# "one of many" shell patterns.
|
||||
append('(?>.*?')
|
||||
extend(parts[i:j])
|
||||
append(')')
|
||||
i = j + 1
|
||||
append('.*')
|
||||
extend(parts[i:])
|
||||
res = ''.join(buffer)
|
||||
return fr'(?s:{res})\Z'
|
||||
|
|
|
@ -312,7 +312,7 @@ def translate(pat, *, recursive=False, include_hidden=False, seps=None):
|
|||
if part:
|
||||
if not include_hidden and part[0] in '*?':
|
||||
results.append(r'(?!\.)')
|
||||
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
|
||||
results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep)[0])
|
||||
if idx < last_part_idx:
|
||||
results.append(any_sep)
|
||||
res = ''.join(results)
|
||||
|
|
|
@ -250,6 +250,75 @@ def test_translate(self):
|
|||
self.assertTrue(re.match(fatre, 'cbabcaxc'))
|
||||
self.assertFalse(re.match(fatre, 'dabccbad'))
|
||||
|
||||
def test_translate_wildcards(self):
|
||||
for pattern, expect in [
|
||||
('ab*', r'(?s:ab.*)\Z'),
|
||||
('ab*cd', r'(?s:ab.*cd)\Z'),
|
||||
('ab*cd*', r'(?s:ab(?>.*?cd).*)\Z'),
|
||||
('ab*cd*12', r'(?s:ab(?>.*?cd).*12)\Z'),
|
||||
('ab*cd*12*', r'(?s:ab(?>.*?cd)(?>.*?12).*)\Z'),
|
||||
('ab*cd*12*34', r'(?s:ab(?>.*?cd)(?>.*?12).*34)\Z'),
|
||||
('ab*cd*12*34*', r'(?s:ab(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
|
||||
]:
|
||||
with self.subTest(pattern):
|
||||
translated = translate(pattern)
|
||||
self.assertEqual(translated, expect, pattern)
|
||||
|
||||
for pattern, expect in [
|
||||
('*ab', r'(?s:.*ab)\Z'),
|
||||
('*ab*', r'(?s:(?>.*?ab).*)\Z'),
|
||||
('*ab*cd', r'(?s:(?>.*?ab).*cd)\Z'),
|
||||
('*ab*cd*', r'(?s:(?>.*?ab)(?>.*?cd).*)\Z'),
|
||||
('*ab*cd*12', r'(?s:(?>.*?ab)(?>.*?cd).*12)\Z'),
|
||||
('*ab*cd*12*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*)\Z'),
|
||||
('*ab*cd*12*34', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12).*34)\Z'),
|
||||
('*ab*cd*12*34*', r'(?s:(?>.*?ab)(?>.*?cd)(?>.*?12)(?>.*?34).*)\Z'),
|
||||
]:
|
||||
with self.subTest(pattern):
|
||||
translated = translate(pattern)
|
||||
self.assertEqual(translated, expect, pattern)
|
||||
|
||||
def test_translate_expressions(self):
|
||||
for pattern, expect in [
|
||||
('[', r'(?s:\[)\Z'),
|
||||
('[!', r'(?s:\[!)\Z'),
|
||||
('[]', r'(?s:\[\])\Z'),
|
||||
('[abc', r'(?s:\[abc)\Z'),
|
||||
('[!abc', r'(?s:\[!abc)\Z'),
|
||||
('[abc]', r'(?s:[abc])\Z'),
|
||||
('[!abc]', r'(?s:[^abc])\Z'),
|
||||
('[!abc][!def]', r'(?s:[^abc][^def])\Z'),
|
||||
# with [[
|
||||
('[[', r'(?s:\[\[)\Z'),
|
||||
('[[a', r'(?s:\[\[a)\Z'),
|
||||
('[[]', r'(?s:[\[])\Z'),
|
||||
('[[]a', r'(?s:[\[]a)\Z'),
|
||||
('[[]]', r'(?s:[\[]\])\Z'),
|
||||
('[[]a]', r'(?s:[\[]a\])\Z'),
|
||||
('[[a]', r'(?s:[\[a])\Z'),
|
||||
('[[a]]', r'(?s:[\[a]\])\Z'),
|
||||
('[[a]b', r'(?s:[\[a]b)\Z'),
|
||||
# backslashes
|
||||
('[\\', r'(?s:\[\\)\Z'),
|
||||
(r'[\]', r'(?s:[\\])\Z'),
|
||||
(r'[\\]', r'(?s:[\\\\])\Z'),
|
||||
]:
|
||||
with self.subTest(pattern):
|
||||
translated = translate(pattern)
|
||||
self.assertEqual(translated, expect, pattern)
|
||||
|
||||
def test_star_indices_locations(self):
|
||||
from fnmatch import _translate
|
||||
|
||||
blocks = ['a^b', '***', '?', '?', '[a-z]', '[1-9]', '*', '++', '[[a']
|
||||
parts, star_indices = _translate(''.join(blocks), '*', '.')
|
||||
expect_parts = ['a', r'\^', 'b', '*',
|
||||
'.', '.', '[a-z]', '[1-9]', '*',
|
||||
r'\+', r'\+', r'\[', r'\[', 'a']
|
||||
self.assertListEqual(parts, expect_parts)
|
||||
self.assertListEqual(star_indices, [3, 8])
|
||||
|
||||
|
||||
class FilterTestCase(unittest.TestCase):
|
||||
|
||||
def test_filter(self):
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Improve the performances of :func:`fnmatch.translate` by a factor 1.7. Patch
|
||||
by Bénédikt Tran.
|
Loading…
Reference in New Issue