add chunk_ranges function to iterutils (#312)

This commit is contained in:
Jonathan Striebel 2023-02-20 07:22:09 +01:00 committed by GitHub
parent 40a7b47c6e
commit 243de3fb2c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 75 additions and 3 deletions

View File

@ -323,6 +323,13 @@ def chunked(src, size, count=None, **kw):
return list(itertools.islice(chunk_iter, count))
def _validate_positive_int(value, name, strictly_positive=True):
value = int(value)
if value < 0 or (strictly_positive and value == 0):
raise ValueError('expected a positive integer ' + name)
return value
def chunked_iter(src, size, **kw):
"""Generates *size*-sized chunks from *src* iterable. Unless the
optional *fill* keyword argument is provided, iterables not evenly
@ -339,9 +346,7 @@ def chunked_iter(src, size, **kw):
# TODO: add count kwarg?
if not is_iterable(src):
raise TypeError('expected an iterable')
size = int(size)
if size <= 0:
raise ValueError('expected a positive integer chunk size')
size = _validate_positive_int(size, 'chunk size')
do_fill = True
try:
fill_val = kw.pop('fill')
@ -369,6 +374,56 @@ def chunked_iter(src, size, **kw):
return
def chunk_ranges(input_size, chunk_size, input_offset=0, overlap_size=0, align=False):
    """Generates *chunk_size*-sized chunk ranges for an input with length *input_size*.
    Optionally, a start of the input can be set via *input_offset*, and
    an overlap between the chunks may be specified via *overlap_size*.
    Also, if *align* is set to *True*, any items with *i % (chunk_size-overlap_size) == 0*
    are always at the beginning of the chunk.

    Returns an iterator of (start, end) tuples, one tuple per chunk.
    An empty input (*input_size* of 0) produces no chunks at all.

    Args:
        input_size (int): Length of the input, zero or greater.
        chunk_size (int): Maximum length of each chunk, at least 1.
        input_offset (int): Position at which the input starts, zero
            or greater.
        overlap_size (int): Number of positions shared by consecutive
            chunks, zero or greater and smaller than *chunk_size*.
        align (bool): Whether to emit a (possibly shorter) first chunk
            so that later chunks start on multiples of
            *chunk_size - overlap_size*.

    Raises:
        ValueError: If one of the integer arguments is out of range, or
            if *overlap_size* is not smaller than *chunk_size* (the
            chunks would never advance through the input). As this is a
            generator, the error surfaces when iteration starts.

    >>> list(chunk_ranges(input_offset=10, input_size=10, chunk_size=5))
    [(10, 15), (15, 20)]
    >>> list(chunk_ranges(input_offset=10, input_size=10, chunk_size=5, overlap_size=1))
    [(10, 15), (14, 19), (18, 20)]
    >>> list(chunk_ranges(input_offset=10, input_size=10, chunk_size=5, overlap_size=2))
    [(10, 15), (13, 18), (16, 20)]

    >>> list(chunk_ranges(input_offset=4, input_size=15, chunk_size=5, align=False))
    [(4, 9), (9, 14), (14, 19)]
    >>> list(chunk_ranges(input_offset=4, input_size=15, chunk_size=5, align=True))
    [(4, 5), (5, 10), (10, 15), (15, 19)]

    >>> list(chunk_ranges(input_offset=2, input_size=15, chunk_size=5, overlap_size=1, align=False))
    [(2, 7), (6, 11), (10, 15), (14, 17)]
    >>> list(chunk_ranges(input_offset=2, input_size=15, chunk_size=5, overlap_size=1, align=True))
    [(2, 5), (4, 9), (8, 13), (12, 17)]
    >>> list(chunk_ranges(input_offset=3, input_size=15, chunk_size=5, overlap_size=1, align=True))
    [(3, 5), (4, 9), (8, 13), (12, 17), (16, 18)]
    """
    input_size = _validate_positive_int(input_size, 'input_size', strictly_positive=False)
    chunk_size = _validate_positive_int(chunk_size, 'chunk_size')
    input_offset = _validate_positive_int(input_offset, 'input_offset', strictly_positive=False)
    overlap_size = _validate_positive_int(overlap_size, 'overlap_size', strictly_positive=False)

    # Distance between the starts of two consecutive chunks. It must be
    # positive: a zero step would divide by zero below (or make range()
    # fail), and a negative step would silently produce no chunks.
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError('expected overlap_size to be smaller than chunk_size')

    # Nothing to chunk; without this guard the aligned path would emit a
    # degenerate (offset, offset) range while the unaligned path emits none.
    if not input_size:
        return

    input_stop = input_offset + input_size
    start = input_offset

    if align:
        # Shorten the first chunk so that the following chunk starts on a
        # multiple of step. Because start % step < step, this first chunk
        # is always longer than overlap_size, so it is never redundant.
        first_stop = start + chunk_size - start % step
        yield (start, min(first_stop, input_stop))
        if first_stop >= input_stop:
            return
        start = first_stop - overlap_size

    for i in range(start, input_stop, step):
        yield (i, min(i + chunk_size, input_stop))

        # Stop as soon as a chunk reaches the end of the input; a further
        # chunk would lie entirely inside the overlap already emitted.
        if i + chunk_size >= input_stop:
            return
def pairwise(src):
"""Convenience function for calling :func:`windowed` on *src*, with
*size* set to 2.

View File

@ -18,6 +18,7 @@ present in the standard library.
.. autofunction:: chunked
.. autofunction:: chunked_iter
.. autofunction:: chunk_ranges
.. autofunction:: pairwise
.. autofunction:: pairwise_iter
.. autofunction:: windowed

View File

@ -511,6 +511,22 @@ def test_chunked_bytes():
assert chunked(b'123', 2) in (['12', '3'], [b'12', b'3'])
def test_chunk_ranges():
    """Compare chunk_ranges output with hand-computed ranges.

    Covers plain chunking, overlapping chunks, aligned chunks, and an
    aligned input that is shorter than a single chunk.
    """
    from boltons.iterutils import chunk_ranges

    # (keyword arguments, expected list of (start, end) tuples)
    cases = [
        (dict(input_offset=10, input_size=10, chunk_size=5),
         [(10, 15), (15, 20)]),
        (dict(input_offset=10, input_size=10, chunk_size=5, overlap_size=1),
         [(10, 15), (14, 19), (18, 20)]),
        (dict(input_offset=10, input_size=10, chunk_size=5, overlap_size=2),
         [(10, 15), (13, 18), (16, 20)]),
        (dict(input_offset=4, input_size=15, chunk_size=5, align=False),
         [(4, 9), (9, 14), (14, 19)]),
        (dict(input_offset=4, input_size=15, chunk_size=5, align=True),
         [(4, 5), (5, 10), (10, 15), (15, 19)]),
        (dict(input_offset=2, input_size=15, chunk_size=5, overlap_size=1, align=False),
         [(2, 7), (6, 11), (10, 15), (14, 17)]),
        (dict(input_offset=2, input_size=15, chunk_size=5, overlap_size=1, align=True),
         [(2, 5), (4, 9), (8, 13), (12, 17)]),
        (dict(input_offset=3, input_size=15, chunk_size=5, overlap_size=1, align=True),
         [(3, 5), (4, 9), (8, 13), (12, 17), (16, 18)]),
        (dict(input_offset=3, input_size=2, chunk_size=5, overlap_size=1, align=True),
         [(3, 5)]),
    ]

    for call_kwargs, expected_ranges in cases:
        assert list(chunk_ranges(**call_kwargs)) == expected_ranges
def test_lstrip():
from boltons.iterutils import lstrip