From 3d4fe934a1cda2e87438aa41acfa89eb9d669b6c Mon Sep 17 00:00:00 2001 From: CPython Developers <> Date: Tue, 16 Aug 2022 09:05:29 +0900 Subject: [PATCH] Update fnmatch from CPython 3.10 --- Lib/fnmatch.py | 104 ++++++++++++++++++++++----- Lib/test/test_fnmatch.py | 151 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 236 insertions(+), 19 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index af0dbcd092..fee59bf73f 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -9,16 +9,19 @@ expression. They cache the compiled regular expressions for speed. The function translate(PATTERN) returns a regular expression corresponding to PATTERN. (It does not compile it.) """ -try: - import os -except ImportError: - import _dummy_os as os +import os import posixpath import re import functools __all__ = ["filter", "fnmatch", "fnmatchcase", "translate"] +# Build a thread-safe incrementing counter to help create unique regexp group +# names across calls. +from itertools import count +_nextgroupnum = count().__next__ +del count + def fnmatch(name, pat): """Test whether FILENAME matches PATTERN. @@ -49,7 +52,7 @@ def _compile_pattern(pat): return re.compile(res).match def filter(names, pat): - """Return the subset of the list NAMES that match PAT.""" + """Construct a list from those elements of the iterable NAMES that match PAT.""" result = [] pat = os.path.normcase(pat) match = _compile_pattern(pat) @@ -80,15 +83,19 @@ def translate(pat): There is no way to quote meta-characters. """ + STAR = object() + res = [] + add = res.append i, n = 0, len(pat) - res = '' while i < n: c = pat[i] i = i+1 if c == '*': - res = res + '.*' + # compress consecutive `*` into one + if (not res) or res[-1] is not STAR: + add(STAR) elif c == '?': - res = res + '.' + add('.') elif c == '[': j = i if j < n and pat[j] == '!': @@ -98,10 +105,10 @@ def translate(pat): while j < n and pat[j] != ']': j = j+1 if j >= n: - res = res + '\\[' + add('\\[') else: stuff = pat[i:j] - if '--' not in stuff: + if '-' not in stuff: stuff = stuff.replace('\\', r'\\') else: chunks = [] @@ -113,7 +120,16 @@ def translate(pat): chunks.append(pat[i:k]) i = k+1 k = k+3 - chunks.append(pat[i:j]) + chunk = pat[i:j] + if chunk: + chunks.append(chunk) + else: + chunks[-1] += '-' + # Remove empty ranges -- invalid in RE. + for k in range(len(chunks)-1, 0, -1): + if chunks[k-1][-1] > chunks[k][0]: + chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:] + del chunks[k] # Escape backslashes and hyphens for set difference (--). # Hyphens that create ranges shouldn't be escaped. stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-') @@ -121,11 +137,63 @@ def translate(pat): # Escape set operations (&&, ~~ and ||). stuff = re.sub(r'([&~|])', r'\\\1', stuff) i = j+1 - if stuff[0] == '!': - stuff = '^' + stuff[1:] - elif stuff[0] in ('^', '['): - stuff = '\\' + stuff - res = '%s[%s]' % (res, stuff) + if not stuff: + # Empty range: never match. + add('(?!)') + elif stuff == '!': + # Negated empty range: match any character. + add('.') + else: + if stuff[0] == '!': + stuff = '^' + stuff[1:] + elif stuff[0] in ('^', '['): + stuff = '\\' + stuff + add(f'[{stuff}]') else: - res = res + re.escape(c) - return r'(?s:%s)\Z' % res + add(re.escape(c)) + assert i == n + + # Deal with STARs. + inp = res + res = [] + add = res.append + i, n = 0, len(inp) + # Fixed pieces at the start? + while i < n and inp[i] is not STAR: + add(inp[i]) + i += 1 + # Now deal with STAR fixed STAR fixed ... + # For an interior `STAR fixed` pairing, we want to do a minimal + # .*? match followed by `fixed`, with no possibility of backtracking. + # We can't spell that directly, but can trick it into working by matching + # .*?fixed + # in a lookahead assertion, save the matched part in a group, then + # consume that group via a backreference. If the overall match fails, + # the lookahead assertion won't try alternatives. So the translation is: + # (?=(?P.*?fixed))(?P=name) + # Group names are created as needed: g0, g1, g2, ... + # The numbers are obtained from _nextgroupnum() to ensure they're unique + # across calls and across threads. This is because people rely on the + # undocumented ability to join multiple translate() results together via + # "|" to build large regexps matching "one of many" shell patterns. + while i < n: + assert inp[i] is STAR + i += 1 + if i == n: + add(".*") + break + assert inp[i] is not STAR + fixed = [] + while i < n and inp[i] is not STAR: + fixed.append(inp[i]) + i += 1 + fixed = "".join(fixed) + if i == n: + add(".*") + add(fixed) + else: + groupnum = _nextgroupnum() + add(f"(?=(?P.*?{fixed}))(?P=g{groupnum})") + assert i == n + res = "".join(res) + return fr'(?s:{res})\Z' diff --git a/Lib/test/test_fnmatch.py b/Lib/test/test_fnmatch.py index d4d76d298f..04365341d8 100644 --- a/Lib/test/test_fnmatch.py +++ b/Lib/test/test_fnmatch.py @@ -2,6 +2,7 @@ import unittest import os +import string import warnings from fnmatch import fnmatch, fnmatchcase, translate, filter @@ -45,6 +46,13 @@ class FnmatchTestCase(unittest.TestCase): check('\nfoo', 'foo*', False) check('\n', '*') + def test_slow_fnmatch(self): + check = self.check_match + check('a' * 50, '*a*a*a*a*a*a*a*a*a*a') + # The next "takes forever" if the regexp translation is + # straightforward. See bpo-40480. + check('a' * 50 + 'b', '*a*a*a*a*a*a*a*a*a*a', False) + def test_mix_bytes_str(self): self.assertRaises(TypeError, fnmatch, 'test', b'*') self.assertRaises(TypeError, fnmatch, b'test', '*') @@ -89,6 +97,119 @@ class FnmatchTestCase(unittest.TestCase): check('usr/bin', 'usr\\bin', normsep) check('usr\\bin', 'usr\\bin') + def test_char_set(self): + ignorecase = os.path.normcase('ABC') == os.path.normcase('abc') + check = self.check_match + tescases = string.ascii_lowercase + string.digits + string.punctuation + for c in tescases: + check(c, '[az]', c in 'az') + check(c, '[!az]', c not in 'az') + # Case insensitive. + for c in tescases: + check(c, '[AZ]', (c in 'az') and ignorecase) + check(c, '[!AZ]', (c not in 'az') or not ignorecase) + for c in string.ascii_uppercase: + check(c, '[az]', (c in 'AZ') and ignorecase) + check(c, '[!az]', (c not in 'AZ') or not ignorecase) + # Repeated same character. + for c in tescases: + check(c, '[aa]', c == 'a') + # Special cases. + for c in tescases: + check(c, '[^az]', c in '^az') + check(c, '[[az]', c in '[az') + check(c, r'[!]]', c != ']') + check('[', '[') + check('[]', '[]') + check('[!', '[!') + check('[!]', '[!]') + + def test_range(self): + ignorecase = os.path.normcase('ABC') == os.path.normcase('abc') + normsep = os.path.normcase('\\') == os.path.normcase('/') + check = self.check_match + tescases = string.ascii_lowercase + string.digits + string.punctuation + for c in tescases: + check(c, '[b-d]', c in 'bcd') + check(c, '[!b-d]', c not in 'bcd') + check(c, '[b-dx-z]', c in 'bcdxyz') + check(c, '[!b-dx-z]', c not in 'bcdxyz') + # Case insensitive. + for c in tescases: + check(c, '[B-D]', (c in 'bcd') and ignorecase) + check(c, '[!B-D]', (c not in 'bcd') or not ignorecase) + for c in string.ascii_uppercase: + check(c, '[b-d]', (c in 'BCD') and ignorecase) + check(c, '[!b-d]', (c not in 'BCD') or not ignorecase) + # Upper bound == lower bound. + for c in tescases: + check(c, '[b-b]', c == 'b') + # Special cases. + for c in tescases: + check(c, '[!-#]', c not in '-#') + check(c, '[!--.]', c not in '-.') + check(c, '[^-`]', c in '^_`') + if not (normsep and c == '/'): + check(c, '[[-^]', c in r'[\]^') + check(c, r'[\-^]', c in r'\]^') + check(c, '[b-]', c in '-b') + check(c, '[!b-]', c not in '-b') + check(c, '[-b]', c in '-b') + check(c, '[!-b]', c not in '-b') + check(c, '[-]', c in '-') + check(c, '[!-]', c not in '-') + # Upper bound is less that lower bound: error in RE. + for c in tescases: + check(c, '[d-b]', False) + check(c, '[!d-b]', True) + check(c, '[d-bx-z]', c in 'xyz') + check(c, '[!d-bx-z]', c not in 'xyz') + check(c, '[d-b^-`]', c in '^_`') + if not (normsep and c == '/'): + check(c, '[d-b[-^]', c in r'[\]^') + + def test_sep_in_char_set(self): + normsep = os.path.normcase('\\') == os.path.normcase('/') + check = self.check_match + check('/', r'[/]') + check('\\', r'[\]') + check('/', r'[\]', normsep) + check('\\', r'[/]', normsep) + check('[/]', r'[/]', False) + check(r'[\\]', r'[/]', False) + check('\\', r'[\t]') + check('/', r'[\t]', normsep) + check('t', r'[\t]') + check('\t', r'[\t]', False) + + def test_sep_in_range(self): + normsep = os.path.normcase('\\') == os.path.normcase('/') + check = self.check_match + check('a/b', 'a[.-0]b', not normsep) + check('a\\b', 'a[.-0]b', False) + check('a\\b', 'a[Z-^]b', not normsep) + check('a/b', 'a[Z-^]b', False) + + check('a/b', 'a[/-0]b', not normsep) + check(r'a\b', 'a[/-0]b', False) + check('a[/-0]b', 'a[/-0]b', False) + check(r'a[\-0]b', 'a[/-0]b', False) + + check('a/b', 'a[.-/]b') + check(r'a\b', 'a[.-/]b', normsep) + check('a[.-/]b', 'a[.-/]b', False) + check(r'a[.-\]b', 'a[.-/]b', False) + + check(r'a\b', r'a[\-^]b') + check('a/b', r'a[\-^]b', normsep) + check(r'a[\-^]b', r'a[\-^]b', False) + check('a[/-^]b', r'a[\-^]b', False) + + check(r'a\b', r'a[Z-\]b', not normsep) + check('a/b', r'a[Z-\]b', False) + check(r'a[Z-\]b', r'a[Z-\]b', False) + check('a[Z-/]b', r'a[Z-\]b', False) + def test_warnings(self): with warnings.catch_warnings(): warnings.simplefilter('error', Warning) @@ -104,6 +225,7 @@ class FnmatchTestCase(unittest.TestCase): class TranslateTestCase(unittest.TestCase): def test_translate(self): + import re self.assertEqual(translate('*'), r'(?s:.*)\Z') self.assertEqual(translate('?'), r'(?s:.)\Z') self.assertEqual(translate('a?b*'), r'(?s:a.b.*)\Z') @@ -112,7 +234,34 @@ class TranslateTestCase(unittest.TestCase): self.assertEqual(translate('[!x]'), r'(?s:[^x])\Z') self.assertEqual(translate('[^x]'), r'(?s:[\^x])\Z') self.assertEqual(translate('[x'), r'(?s:\[x)\Z') - + # from the docs + self.assertEqual(translate('*.txt'), r'(?s:.*\.txt)\Z') + # squash consecutive stars + self.assertEqual(translate('*********'), r'(?s:.*)\Z') + self.assertEqual(translate('A*********'), r'(?s:A.*)\Z') + self.assertEqual(translate('*********A'), r'(?s:.*A)\Z') + self.assertEqual(translate('A*********?[?]?'), r'(?s:A.*.[?].)\Z') + # fancy translation to prevent exponential-time match failure + t = translate('**a*a****a') + digits = re.findall(r'\d+', t) + self.assertEqual(len(digits), 4) + self.assertEqual(digits[0], digits[1]) + self.assertEqual(digits[2], digits[3]) + g1 = f"g{digits[0]}" # e.g., group name "g4" + g2 = f"g{digits[2]}" # e.g., group name "g5" + self.assertEqual(t, + fr'(?s:(?=(?P<{g1}>.*?a))(?P={g1})(?=(?P<{g2}>.*?a))(?P={g2}).*a)\Z') + # and try pasting multiple translate results - it's an undocumented + # feature that this works; all the pain of generating unique group + # names across calls exists to support this + r1 = translate('**a**a**a*') + r2 = translate('**b**b**b*') + r3 = translate('*c*c*c*') + fatre = "|".join([r1, r2, r3]) + self.assertTrue(re.match(fatre, 'abaccad')) + self.assertTrue(re.match(fatre, 'abxbcab')) + self.assertTrue(re.match(fatre, 'cbabcaxc')) + self.assertFalse(re.match(fatre, 'dabccbad')) class FilterTestCase(unittest.TestCase):