fog/experiments/levenshtein_split.py

82 lines
1.7 KiB
Python
Raw Normal View History

2018-09-17 14:21:45 +00:00
def ngrams_count(l, n):
    """Return the number of contiguous n-grams in a sequence of length `l`.

    The value is `l - n + 1`. No clamping is applied, so the result is zero
    or negative when `n` is greater than `l`.
    """
    return l - n + 1
def ngrams_upper_bound(l, m):
    """Return the combined n-gram count for the two chunk sizes of a split.

    A sequence of length `l` cut into `m` chunks has chunks of size `l // m`
    and `l // m + 1`; the result is the sum of `ngrams_count` for both sizes.
    """
    small = l // m
    large = small + 1

    return sum(ngrams_count(l, size) for size in (small, large))
def powerset(a, b):
    """Return the number of count pairs `(i, j)` other than `(0, 0)`.

    With `0 <= i <= a` and `0 <= j <= b` there are `(a + 1) * (b + 1)` pairs;
    the all-zero pair is left out.
    """
    every_pair = (a + 1) * (b + 1)

    return every_pair - 1
def keys_count(a, b):
    """Return the number of keys produced for chunk multiplicities `a`, `b`.

    Each of the `powerset(a, b)` count pairs contributes two keys, except the
    `a + b` pairs in which one of the two counts is zero, which contribute
    only one (this mirrors the two conditional yields in `keys`).
    """
    pairs = powerset(a, b)

    return 2 * pairs - (a + b)
def formula(k):
    """Return the closed-form maximum number of keys for threshold `k`.

    With `m = k + 1`, this equals `keys_count(m // 2, m - m // 2)`, which is
    the value the script section prints next to it.

    Args:
        k: integer distance threshold.

    Returns:
        int: `(k + 1)**2 / 2 + k + 1` when `k` is odd,
        `k**2 / 2 + 2 * k + 1` when `k` is even.
    """
    # Floor division is exact here: the squared term is always even
    # ((k + 1) is even for odd k, k is even otherwise). Using `//` keeps the
    # result an exact int instead of a float that loses precision for large k.
    if k % 2 != 0:
        return (k + 1) ** 2 // 2 + k + 1

    return k ** 2 // 2 + 2 * k + 1
2018-09-17 16:20:02 +00:00
def multiset_powerset(multiset):
    """Yield every non-empty sub-multiset of `multiset` as a list of counts.

    Args:
        multiset: sequence of multiplicities, one per distinct element.

    Yields:
        list: a fresh list `c` of the same length as `multiset`, with
        `0 <= c[i] <= multiset[i]`. The all-zero combination is never
        yielded. Combinations come in odometer order, the last position
        varying fastest.
    """
    n = len(multiset)
    counts = [0] * n

    while True:
        # Odometer step: bump the right-most position that still has room,
        # resetting every exhausted position met on the way.
        for i in reversed(range(n)):
            if counts[i] < multiset[i]:
                counts[i] += 1
                break
            counts[i] = 0
        else:
            # Every position rolled over (or there are none): we are back to
            # the all-zero state, so the enumeration is complete.
            return

        # Yield a copy: the working list is mutated on the next step, so
        # handing it out directly would make collected results alias each
        # other (e.g. list(multiset_powerset(...)) would be all zeros).
        yield list(counts)
def keys(k, string):
    """Yield `(index, chunk)` keys for `string` split into `k + 1` chunks.

    The string length `l` is divided into `m = k + 1` chunks: `l % m` "big"
    chunks of size `l // m + 1` and `m - l % m` "small" chunks of size
    `l // m`. For every combination of a number of big and small chunks
    produced by `multiset_powerset` (never both zero), the combined length
    of those chunks is taken as an end position in `string`.

    Args:
        k: threshold; the string is considered in `k + 1` chunks.
        string: the sequence to slice.

    Yields:
        tuple: `(index, chunk)` where `index` is the total number of chunks
        used minus one and `chunk` is the slice ending at the end position.
        A big-sized slice is yielded when at least one big chunk is used and
        a small-sized slice when at least one small chunk is used.
    """
    chunks = k + 1

    # divmod gives (l // m, l % m): the small chunk size and the number of
    # chunks that need one extra character.
    small_size, big_total = divmod(len(string), chunks)
    small_total = chunks - big_total
    big_size = small_size + 1

    for big_used, small_used in multiset_powerset((big_total, small_total)):
        end = big_used * big_size + small_used * small_size
        index = big_used + small_used - 1

        if big_used > 0:
            yield (index, string[end - big_size:end])

        if small_used > 0:
            yield (index, string[end - small_size:end])
if __name__ == '__main__':
    # For each threshold k from 1 to 9, print the chunk count, the minimum
    # and maximum number of keys, and the chunk layout of a string of
    # length m + m // 2.
    for k in range(1, 10):
        m = k + 1
        l_max = 2 * m - 1  # computed but not used by the report below
        l_med = m + m // 2

        # a chunks carry one extra character, the remaining b do not.
        a = l_med % m
        b = m - a

        report = (
            'Stats for k = %i:' % k,
            ' - k (distance threshold): %i' % k,
            ' - m (number of chunks): %i' % m,
            ' - min nb of keys: %i (%i with tr)' % (m, m * 2),
            ' - max nb of keys: %i (with formula: %i)' % (keys_count(a, b), formula(k)),
            ' - %s%s (%i, %i)' % ('a' * a, 'b' * b, a, b),
        )

        for line in report:
            print(line)

        print()