bugfix of the bugfix

This commit is contained in:
svlandeg 2020-06-02 17:49:33 +02:00
parent fdfd822936
commit 5b350a6c99
1 changed file with 16 additions and 15 deletions

View File

@@ -674,25 +674,26 @@ def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_o
for example in examples: for example in examples:
n_words = count_words(example.doc) n_words = count_words(example.doc)
# if the current example exceeds the batch size, it is returned separately
# but only if discard_oversize=False.
if n_words > target_size:
if not discard_oversize:
yield [example]
# add the example to the current batch if it still fits # add the example to the current batch if it still fits
if (current_size + n_words) < (target_size + tol_size): elif (current_size + n_words) < (target_size + tol_size):
batch.append(example) batch.append(example)
current_size += n_words current_size += n_words
# yield the previous batch and start a new one
else: else:
# if the current example exceeds the batch size, it is returned separately yield batch
# but only if discard_oversize=False. target_size = next(size_)
if current_size > target_size: tol_size = target_size * tolerance
if not discard_oversize: # In theory it may happen that the current example now exceeds the new target_size,
yield [example] # but that seems like an unimportant edge case if batch sizes are variable anyway?
# yield the previous batch and start a new one batch = [example]
else: current_size = n_words
yield batch
target_size = next(size_)
tol_size = target_size * tolerance
# In theory it may happen that the current example now exceeds the new target_size,
# but that seems like an unimportant edge case if batch sizes are variable anyway?
batch = [example]
current_size = n_words
# yield the final batch # yield the final batch
if batch: if batch: