From 3e6aafe20993f5b3690c870bf88eed3d8ba53492 Mon Sep 17 00:00:00 2001 From: Raymond Hettinger Date: Tue, 5 Mar 2013 02:15:01 -0500 Subject: [PATCH] Issue #16098: Update heapq.nsmallest to use the same algorithm as nlargest. This removes the dependency on bisect and it bring the pure Python code in-sync with the C code. --- Lib/heapq.py | 84 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/Lib/heapq.py b/Lib/heapq.py index 6a4e0f4c31c..ca79db15295 100644 --- a/Lib/heapq.py +++ b/Lib/heapq.py @@ -129,9 +129,8 @@ __all__ = ['heappush', 'heappop', 'heapify', 'heapreplace', 'merge', 'nlargest', 'nsmallest', 'heappushpop'] -from itertools import islice, repeat, count, imap, izip, tee, chain +from itertools import islice, count, imap, izip, tee, chain from operator import itemgetter -import bisect def cmp_lt(x, y): # Use __lt__ if available; otherwise, try __le__. @@ -188,6 +187,19 @@ def heapify(x): for i in reversed(xrange(n//2)): _siftup(x, i) +def _heappushpop_max(heap, item): + """Maxheap version of a heappush followed by a heappop.""" + if heap and cmp_lt(item, heap[0]): + item, heap[0] = heap[0], item + _siftup_max(heap, 0) + return item + +def _heapify_max(x): + """Transform list into a maxheap, in-place, in O(len(x)) time.""" + n = len(x) + for i in reversed(range(n//2)): + _siftup_max(x, i) + def nlargest(n, iterable): """Find the n largest elements in a dataset. @@ -213,30 +225,16 @@ def nsmallest(n, iterable): """ if n < 0: return [] - if hasattr(iterable, '__len__') and n * 10 <= len(iterable): - # For smaller values of n, the bisect method is faster than a minheap. - # It is also memory efficient, consuming only n elements of space. - it = iter(iterable) - result = sorted(islice(it, 0, n)) - if not result: - return result - insort = bisect.insort - pop = result.pop - los = result[-1] # los --> Largest of the nsmallest - for elem in it: - if cmp_lt(elem, los): - insort(result, elem) - pop() - los = result[-1] + it = iter(iterable) + result = list(islice(it, n)) + if not result: return result - # An alternative approach manifests the whole iterable in memory but - # saves comparisons by heapifying all at once. Also, saves time - # over bisect.insort() which has O(n) data movement time for every - # insertion. Finding the n smallest of an m length iterable requires - # O(m) + O(n log m) comparisons. - h = list(iterable) - heapify(h) - return map(heappop, repeat(h, min(n, len(h)))) + _heapify_max(result) + _heappushpop = _heappushpop_max + for elem in it: + _heappushpop(result, elem) + result.sort() + return result # 'heap' is a heap at all indices >= startpos, except possibly for pos. pos # is the index of a leaf with a possibly out-of-order value. Restore the @@ -314,6 +312,42 @@ def _siftup(heap, pos): heap[pos] = newitem _siftdown(heap, startpos, pos) +def _siftdown_max(heap, startpos, pos): + 'Maxheap variant of _siftdown' + newitem = heap[pos] + # Follow the path to the root, moving parents down until finding a place + # newitem fits. + while pos > startpos: + parentpos = (pos - 1) >> 1 + parent = heap[parentpos] + if cmp_lt(parent, newitem): + heap[pos] = parent + pos = parentpos + continue + break + heap[pos] = newitem + +def _siftup_max(heap, pos): + 'Maxheap variant of _siftup' + endpos = len(heap) + startpos = pos + newitem = heap[pos] + # Bubble up the larger child until hitting a leaf. + childpos = 2*pos + 1 # leftmost child position + while childpos < endpos: + # Set childpos to index of larger child. + rightpos = childpos + 1 + if rightpos < endpos and not cmp_lt(heap[rightpos], heap[childpos]): + childpos = rightpos + # Move the larger child up. + heap[pos] = heap[childpos] + pos = childpos + childpos = 2*pos + 1 + # The leaf at pos is empty now. Put newitem there, and bubble it up + # to its final resting place (by sifting its parents down). + heap[pos] = newitem + _siftdown_max(heap, startpos, pos) + # If available, use C implementation try: from _heapq import *