#!/usr/bin/env python # This file is part of exrex. # # exrex is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # exrex is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with exrex. If not, see < http://www.gnu.org/licenses/ >. # # (C) 2012- by Adam Tauber, try: from future_builtins import map, range except: pass from re import sre_parse from itertools import product, chain, tee from random import choice,randint __all__ = ('generate', 'CATEGORIES', 'count', 'parse', 'getone') CATEGORIES = {'category_space' : sorted(sre_parse.WHITESPACE) ,'category_digit' : sorted(sre_parse.DIGITS) ,'category_any' : [chr(x) for x in range(32, 123)] } def comb(g, i): for c in g: g2,i = tee(i) for c2 in g2: yield c+c2 def mappend(g, c): for cc in g: yield cc+c def _in(d): ret = [] neg = False for i in d: if i[0] == 'range': subs = map(chr, range(i[1][0], i[1][1]+1)) if neg: for char in subs: try: ret.remove(char) except: pass else: ret.extend(subs) elif i[0] == 'literal': if neg: try: ret.remove(chr(i[1])) except: pass else: ret.append(chr(i[1])) elif i[0] == 'category': subs = CATEGORIES.get(i[1], ['']) if neg: for char in subs: try: ret.remove(char) except: pass else: ret.extend(subs) elif i[0] == 'negate': ret = list(CATEGORIES['category_any']) neg = True return ret def prods(orig, ran, items): for o in orig: for r in ran: for s in product(items, repeat=r): yield o+''.join(s) def ggen(g1, f, *args, **kwargs): for a in g1: g2 = f(*args, **kwargs) if isinstance(g2, int): yield g2 else: for b in g2: yield a+b def _gen(d, limit=20, count=False): """docstring for _gen""" ret = [''] strings = 0 for i in d: if i[0] == 'in': subs = _in(i[1]) if count: strings = (strings or 1) * len(subs) ret = comb(ret, subs) elif i[0] == 'literal': ret = mappend(ret, chr(i[1])) elif i[0] == 'category': subs = CATEGORIES.get(i[1], ['']) if count: strings = (strings or 1) * len(subs) ret = comb(ret, subs) elif i[0] == 'any': subs = CATEGORIES['category_any'] if count: strings = (strings or 1) * len(subs) ret = comb(ret, subs) elif i[0] == 'max_repeat': chars = filter(None, _gen(list(i[1][2]), limit)) if i[1][1]+1 - i[1][0] >= limit: ran = range(i[1][0], i[1][0]+limit) else: ran = range(i[1][0], i[1][1]+1) if count: for i in ran: strings += pow(len(chars), i) ret = prods(ret, ran, chars) elif i[0] == 'branch': subs = list(chain.from_iterable(_gen(list(x), limit) for x in i[1][1])) if count: strings = (strings or 1) * (len(subs) or 1) ret = comb(ret, subs) elif i[0] == 'subpattern': if count: strings = (strings or 1) * (sum(ggen([0], _gen, i[1][1], limit=limit, count=True)) or 1) ret = ggen(ret, _gen, i[1][1], limit=limit, count=False) # ignore ^ and $ elif i[0] == 'at': continue elif i[0] == 'not_literal': subs = list(CATEGORIES['category_any']) subs.remove(chr(i[1])) if count: strings = (strings or 1) * len(subs) ret = comb(ret, subs) elif i[0] == 'assert': print i[1][1] continue else: print('[!] cannot handle expression ' + repr(i)) if count: return strings return ret def _randone(d, limit=20): """docstring for _randone""" ret = '' for i in d: if i[0] == 'in': ret += choice(_in(i[1])) elif i[0] == 'literal': ret += chr(i[1]) elif i[0] == 'category': ret += choice(CATEGORIES.get(i[1], [''])) elif i[0] == 'any': ret += choice(CATEGORIES['category_any']) elif i[0] == 'max_repeat': chars = filter(None, _gen(list(i[1][2]), limit)) if i[1][1]+1 - i[1][0] >= limit: min,max = i[1][0], i[1][0]+limit else: min,max = i[1][0], i[1][1] for _ in range(randint(min, max)): ret += choice(chars) elif i[0] == 'branch': ret += choice(list(chain.from_iterable(_gen(list(x), limit) for x in i[1][1]))) elif i[0] == 'subpattern': ret += _randone(i[1][1], limit) elif i[0] == 'at': continue elif i[0] == 'not_literal': c=list(CATEGORIES['category_any']) c.remove(chr(i[1])) ret += choice(c) else: print('[!] cannot handle expression "%s"' % str(i)) return ret def parse(s): """Regular expression parser :param s: Regular expression :type s: str :rtype: list """ r = sre_parse.parse(s) return list(r) def generate(s, limit=20): """Creates a generator that generates all matching strings to a given regular expression :param s: Regular expression :type s: str :param limit: Range limit :type limit: int :returns: string generator object """ return _gen(parse(s), limit) def count(s, limit=20): """Counts all matching strings to a given regular expression :param s: Regular expression :type s: str :param limit: Range limit :type limit: int :rtype: int :returns: number of matching strings """ return _gen(parse(s), limit, count=True) def getone(regex_string, limit=20): """Returns a random matching string to a given regular expression """ return _randone(parse(regex_string), limit) def argparser(): import argparse from sys import stdout argp = argparse.ArgumentParser(description='exrex - regular expression string generator') argp.add_argument('-o', '--output' ,help = 'Output file - default is STDOUT' ,metavar = 'FILE' ,default = stdout ,type = argparse.FileType('w') ) argp.add_argument('-l', '--limit' ,help = 'Max limit for range size - default is 20' ,default = 20 ,action = 'store' ,type = int ,metavar = 'N' ) argp.add_argument('-c', '--count' ,help = 'Count matching strings' ,default = False ,action = 'store_true' ) argp.add_argument('-r', '--random' ,help = 'Returns a random string that matches to the regex' ,default = False ,action = 'store_true' ) argp.add_argument('-d', '--delimiter' ,help = 'Delimiter - default is \\n' ,default = '\n' ) argp.add_argument('-v', '--verbose' ,action = 'store_true' ,help = 'Verbose mode' ,default = False ) argp.add_argument('regex' ,metavar = 'REGEX' ,help = 'REGEX string' ) return vars(argp.parse_args()) def __main__(): from sys import exit, stderr # 'as(d|f)qw(e|r|s)[a-zA-Z]{2,3}' # 'as(QWE|Z([XC]|Y|U)V){2,3}asdf' # '.?' # '.+' # 'asdf.{1,4}qwer{2,5}' # 'a(b)?(c)?(d)?' # 'a[b][c][d]?[e]? args = argparser() if args['verbose']: args['output'].write('%r%s' % (parse(args['regex'], limit=args['limit']), args['delimiter'])) if args['count']: args['output'].write('%d%s' % (count(args['regex'], limit=args['limit']), args['delimiter'])) exit(0) if args['random']: args['output'].write('%s%s' % (getone(args['regex'], limit=args['limit']), args['delimiter'])) exit(0) try: g = generate(args['regex'], args['limit']) except Exception, e: print >> stderr, '[!] Error: ', e exit(1) for s in g: try: args['output'].write(s+args['delimiter']) except: break if __name__ == '__main__': __main__()