exrex/exrex.py

306 lines
9.4 KiB
Python

#!/usr/bin/env python
# This file is part of exrex.
#
# exrex is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# exrex is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with exrex. If not, see < http://www.gnu.org/licenses/ >.
#
# (C) 2012- by Adam Tauber, <asciimoo@gmail.com>
try:
from future_builtins import map, range
except:
pass
from re import sre_parse
from itertools import product, chain, tee
from random import choice,randint
__all__ = ('generate', 'CATEGORIES', 'count', 'parse', 'getone')
CATEGORIES = {'category_space' : sorted(sre_parse.WHITESPACE)
,'category_digit' : sorted(sre_parse.DIGITS)
,'category_any' : [chr(x) for x in range(32, 123)]
}
def comb(g, i):
for c in g:
g2,i = tee(i)
for c2 in g2:
yield c+c2
def mappend(g, c):
for cc in g:
yield cc+c
def _in(d):
ret = []
neg = False
for i in d:
if i[0] == 'range':
subs = map(chr, range(i[1][0], i[1][1]+1))
if neg:
for char in subs:
try:
ret.remove(char)
except:
pass
else:
ret.extend(subs)
elif i[0] == 'literal':
if neg:
try:
ret.remove(chr(i[1]))
except:
pass
else:
ret.append(chr(i[1]))
elif i[0] == 'category':
subs = CATEGORIES.get(i[1], [''])
if neg:
for char in subs:
try:
ret.remove(char)
except:
pass
else:
ret.extend(subs)
elif i[0] == 'negate':
ret = list(CATEGORIES['category_any'])
neg = True
return ret
def prods(orig, ran, items):
for o in orig:
for r in ran:
for s in product(items, repeat=r):
yield o+''.join(s)
def ggen(g1, f, *args, **kwargs):
for a in g1:
g2 = f(*args, **kwargs)
if isinstance(g2, int):
yield g2
else:
for b in g2:
yield a+b
def _gen(d, limit=20, count=False):
"""docstring for _gen"""
ret = ['']
strings = 0
for i in d:
if i[0] == 'in':
subs = _in(i[1])
if count:
strings = (strings or 1) * len(subs)
ret = comb(ret, subs)
elif i[0] == 'literal':
ret = mappend(ret, chr(i[1]))
elif i[0] == 'category':
subs = CATEGORIES.get(i[1], [''])
if count:
strings = (strings or 1) * len(subs)
ret = comb(ret, subs)
elif i[0] == 'any':
subs = CATEGORIES['category_any']
if count:
strings = (strings or 1) * len(subs)
ret = comb(ret, subs)
elif i[0] == 'max_repeat':
chars = filter(None, _gen(list(i[1][2]), limit))
if i[1][1]+1 - i[1][0] >= limit:
ran = range(i[1][0], i[1][0]+limit)
else:
ran = range(i[1][0], i[1][1]+1)
if count:
for i in ran:
strings += pow(len(chars), i)
ret = prods(ret, ran, chars)
elif i[0] == 'branch':
subs = list(chain.from_iterable(_gen(list(x), limit) for x in i[1][1]))
if count:
strings = (strings or 1) * (len(subs) or 1)
ret = comb(ret, subs)
elif i[0] == 'subpattern':
if count:
strings = (strings or 1) * (sum(ggen([0], _gen, i[1][1], limit=limit, count=True)) or 1)
ret = ggen(ret, _gen, i[1][1], limit=limit, count=False)
# ignore ^ and $
elif i[0] == 'at':
continue
elif i[0] == 'not_literal':
subs = list(CATEGORIES['category_any'])
subs.remove(chr(i[1]))
if count:
strings = (strings or 1) * len(subs)
ret = comb(ret, subs)
elif i[0] == 'assert':
print i[1][1]
continue
else:
print('[!] cannot handle expression ' + repr(i))
if count:
return strings
return ret
def _randone(d, limit=20):
"""docstring for _randone"""
ret = ''
for i in d:
if i[0] == 'in':
ret += choice(_in(i[1]))
elif i[0] == 'literal':
ret += chr(i[1])
elif i[0] == 'category':
ret += choice(CATEGORIES.get(i[1], ['']))
elif i[0] == 'any':
ret += choice(CATEGORIES['category_any'])
elif i[0] == 'max_repeat':
chars = filter(None, _gen(list(i[1][2]), limit))
if i[1][1]+1 - i[1][0] >= limit:
min,max = i[1][0], i[1][0]+limit
else:
min,max = i[1][0], i[1][1]
for _ in range(randint(min, max)):
ret += choice(chars)
elif i[0] == 'branch':
ret += choice(list(chain.from_iterable(_gen(list(x), limit) for x in i[1][1])))
elif i[0] == 'subpattern':
ret += _randone(i[1][1], limit)
elif i[0] == 'at':
continue
elif i[0] == 'not_literal':
c=list(CATEGORIES['category_any'])
c.remove(chr(i[1]))
ret += choice(c)
else:
print('[!] cannot handle expression "%s"' % str(i))
return ret
def parse(s):
"""Regular expression parser
:param s: Regular expression
:type s: str
:rtype: list
"""
r = sre_parse.parse(s)
return list(r)
def generate(s, limit=20):
"""Creates a generator that generates all matching strings to a given regular expression
:param s: Regular expression
:type s: str
:param limit: Range limit
:type limit: int
:returns: string generator object
"""
return _gen(parse(s), limit)
def count(s, limit=20):
"""Counts all matching strings to a given regular expression
:param s: Regular expression
:type s: str
:param limit: Range limit
:type limit: int
:rtype: int
:returns: number of matching strings
"""
return _gen(parse(s), limit, count=True)
def getone(regex_string, limit=20):
"""Returns a random matching string to a given regular expression
"""
return _randone(parse(regex_string), limit)
def argparser():
import argparse
from sys import stdout
argp = argparse.ArgumentParser(description='exrex - regular expression string generator')
argp.add_argument('-o', '--output'
,help = 'Output file - default is STDOUT'
,metavar = 'FILE'
,default = stdout
,type = argparse.FileType('w')
)
argp.add_argument('-l', '--limit'
,help = 'Max limit for range size - default is 20'
,default = 20
,action = 'store'
,type = int
,metavar = 'N'
)
argp.add_argument('-c', '--count'
,help = 'Count matching strings'
,default = False
,action = 'store_true'
)
argp.add_argument('-r', '--random'
,help = 'Returns a random string that matches to the regex'
,default = False
,action = 'store_true'
)
argp.add_argument('-d', '--delimiter'
,help = 'Delimiter - default is \\n'
,default = '\n'
)
argp.add_argument('-v', '--verbose'
,action = 'store_true'
,help = 'Verbose mode'
,default = False
)
argp.add_argument('regex'
,metavar = 'REGEX'
,help = 'REGEX string'
)
return vars(argp.parse_args())
def __main__():
from sys import exit, stderr
# 'as(d|f)qw(e|r|s)[a-zA-Z]{2,3}'
# 'as(QWE|Z([XC]|Y|U)V){2,3}asdf'
# '.?'
# '.+'
# 'asdf.{1,4}qwer{2,5}'
# 'a(b)?(c)?(d)?'
# 'a[b][c][d]?[e]?
args = argparser()
if args['verbose']:
args['output'].write('%r%s' % (parse(args['regex'], limit=args['limit']), args['delimiter']))
if args['count']:
args['output'].write('%d%s' % (count(args['regex'], limit=args['limit']), args['delimiter']))
exit(0)
if args['random']:
args['output'].write('%s%s' % (getone(args['regex'], limit=args['limit']), args['delimiter']))
exit(0)
try:
g = generate(args['regex'], args['limit'])
except Exception, e:
print >> stderr, '[!] Error: ', e
exit(1)
for s in g:
try:
args['output'].write(s+args['delimiter'])
except:
break
if __name__ == '__main__':
__main__()