Add itershuffle utility function. Maybe belongs in thinc

2017-05-21 09:05:05 -05:00 · 2017-05-21 09:05:05 -05:00 · 0731971bfc
parent 3b7c108246
commit 0731971bfc
1 changed files with 26 additions and 0 deletions
--- a/spacy/util.py
+++ b/spacy/util.py
@ -9,6 +9,7 @@ import regex as re
 from pathlib import Path
 import sys
 import textwrap
 import random
 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
@ -172,6 +173,31 @@ def get_async(stream, numpy_array):
        array.set(numpy_array, stream=stream)
        return array
 def itershuffle(iterable, bufsize=1000):
    """Shuffle an iterator. This works by holding `bufsize` items back
    and yielding them sometime later. Obviously, this is not unbiased --
    but should be good enough for batching. Larger bufsize means less bias.
    From https://gist.github.com/andres-erbsen/1307752
    """
    iterable = iter(iterable)
    buf = []
    try:
        while True:
            for i in range(random.randint(1, bufsize-len(buf))):
                buf.append(iterable.next())
            random.shuffle(buf)
            for i in range(random.randint(1, bufsize)):
                if buf:
                    yield buf.pop()
                else:
                    break
    except StopIteration:
        random.shuffle(buf)
        while buf:
            yield buf.pop()
        raise StopIteration
 def env_opt(name, default=None):
    if type(default) is float: