updated ThresholdCounter with a bunch of functionality and docs

Mahmoud Hashemi 2015-06-30 23:08:52 -07:00
parent 6be76fddd6
commit e2e5094bf6
2 changed files with 69 additions and 13 deletions

boltons/cacheutils.py

@@ -23,8 +23,12 @@ statistics are:
 subset of misses, so this number is always less than or equal to
 ``miss_count``.
 
+Additionally, ``cacheutils`` provides the cache-like bounded counter,
+:class:`ThresholdCounter`.
+
 Learn more about `caching algorithms on Wikipedia
 <https://en.wikipedia.org/wiki/Cache_algorithms#Examples>`_.
 """
 # TODO: clarify soft_miss_count. is it for .get and .set_default or is
@@ -33,7 +37,7 @@ Learn more about `caching algorithms on Wikipedia
 # TODO: TimedLRI
 # TODO: support 0 max_size?
 
-__all__ = ['LRI', 'LRU', 'ThresholdCache']
+__all__ = ['LRI', 'LRU', 'cached', 'ThresholdCounter']
 
 import itertools
 from collections import deque
@@ -98,7 +102,7 @@ class LRU(dict):
     (3, 1, 1)
 
     Other than the size-limiting caching behavior and statistics,
-    ``LRU`` acts like its parent class, the built-in Python dict.
+    ``LRU`` acts like its parent class, the built-in Python :class:`dict`.
     """
     def __init__(self, max_size=DEFAULT_MAX_SIZE, values=None,
                  on_miss=None):
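
For orientation, a quick usage sketch of the ``LRU`` above; a hypothetical
session, with eviction order and the stat attributes assumed from the module
docstring's hit/miss/soft-miss discussion::

    from boltons.cacheutils import LRU

    cache = LRU(max_size=2)
    cache['a'] = 1
    cache['b'] = 2
    cache['a']               # hit: 'a' becomes the most recently used key
    cache['c'] = 3           # over max_size: least-recently-used 'b' evicted
    cache.get('b', 'gone')   # missing key via .get() counts as a soft miss
    print(cache.hit_count, cache.miss_count, cache.soft_miss_count)
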
@@ -444,7 +448,9 @@ class ThresholdCounter(object):
     ThresholdCounter automatically compacts after every (1 /
     *threshold*) additions, maintaining exact counts for any keys
     whose count represents at least a *threshold* ratio of the total
-    data.
+    data. In other words, if a particular key is not present in the
+    ThresholdCounter, its count represents less than *threshold* of
+    the total data.
 
     >>> tc = ThresholdCounter(threshold=0.1)
     >>> tc.add(1)
@@ -460,22 +466,27 @@ class ThresholdCounter(object):
     11
 
     As you can see above, the API is kept similar to
-    collections.Counter. The most notable feature omissions being that
-    counted items cannot be set directly, uncounted, or removed, as
-    this would disrupt the math.
+    :class:`collections.Counter`. The most notable feature omissions
+    are that counted items cannot be set directly, uncounted, or
+    removed, as this would disrupt the math.
 
     Use the ThresholdCounter when you need best-effort long-lived
     counts for dynamically-keyed data. Without a bounded data structure
     such as this one, the dynamic keys often represent a memory leak
     and can impact application reliability. The ThresholdCounter's
-    item replacement strategy can be thought of as *Amortized Least
-    Relevant*.
+    item replacement strategy is fully deterministic and can be
+    thought of as *Amortized Least Relevant*. The absolute upper bound
+    of keys it will store is *(2/threshold)*, but realistically
+    *(1/threshold)* is expected for uniformly random data streams, and
+    one or two orders of magnitude better for real-world data.
 
     This algorithm is an implementation of the Lossy Counting
     algorithm described in "Approximate Frequency Counts over Data
     Streams" by Manku & Motwani. Hat tip to Kurt Rose for discovery
     and initial implementation.
     """
+    # TODO: hit_count/miss_count?
     def __init__(self, threshold=0.001):
        if not 0 < threshold < 1:
            raise ValueError('expected threshold between 0 and 1, not: %r'
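
The *(2/threshold)* bound claimed above can be checked empirically; a
hypothetical sketch, relying only on ``add()`` and the ``items()`` method
that appears later in this diff::

    import random
    from boltons.cacheutils import ThresholdCounter

    tc = ThresholdCounter(threshold=0.01)
    for _ in range(100000):
        # stream far more distinct keys (10001) than 1/threshold (100)
        tc.add(random.randint(0, 10000))
    # tracked keys stay bounded regardless of stream cardinality
    assert len(tc.items()) <= 2 / 0.01
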
@@ -492,6 +503,11 @@ class ThresholdCounter(object):
         return self._threshold
 
     def add(self, key):
+        """Increment the count of *key* by 1, automatically adding it if it
+        does not exist.
+
+        Cache compaction is triggered every *1/threshold* additions.
+        """
         self.total += 1
         try:
             self._count_map[key][0] += 1
@@ -502,12 +518,19 @@ class ThresholdCounter(object):
             self._count_map = dict([(k, v) for k, v in self._count_map.items()
                                     if sum(v) > self._cur_bucket])
             self._cur_bucket += 1
+        return
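
To make the compaction cadence concrete, a hypothetical trace; the exact
culling outcome depends on the bucket arithmetic above::

    tc = ThresholdCounter(threshold=0.5)  # compacts every 1/0.5 = 2 additions
    tc.add('a')
    tc.add('a')    # 2nd addition triggers compaction; 'a' should survive,
                   # being well above a 0.5 ratio of the stream so far
    tc.add('b')
    tc.add('c')    # 4th addition: the singletons 'b' and 'c' may be culled
                   # as sub-threshold
    print(tc.items())
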
 
     def elements(self):
+        """Return an iterator of all the common elements tracked by the
+        counter. Yields each key as many times as it has been seen.
+        """
         repeaters = itertools.starmap(itertools.repeat, self.iteritems())
         return itertools.chain.from_iterable(repeaters)
 
     def most_common(self, n=None):
+        """Get the top *n* keys and counts as tuples. If *n* is omitted,
+        returns all the pairs.
+        """
         if n is not None and n <= 0:
             return []
         ret = sorted(self.iteritems(), key=lambda x: x[1], reverse=True)
@@ -515,6 +538,29 @@ class ThresholdCounter(object):
             return ret
         return ret[:n]
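
A hypothetical call pattern for the method above, using ``update()`` from
later in this diff; at this size and threshold no compaction occurs, so the
counts are exact::

    tc = ThresholdCounter(threshold=0.1)
    tc.update(['cat', 'dog', 'cat', 'bird', 'cat'])
    tc.most_common(2)   # [('cat', 3), ('dog', 1)], tie order among 1s unspecified
    tc.most_common()    # all tracked (key, count) pairs, most common first
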
 
+    def get_common_count(self):
+        """Get the sum of counts for keys exceeding the configured data
+        threshold.
+        """
+        return sum([count for count, _ in self._count_map.itervalues()])
+
+    def get_uncommon_count(self):
+        """Get the sum of counts for keys that were culled because the
+        associated counts represented less than the configured
+        threshold. The long-tail counts.
+        """
+        return self.total - self.get_common_count()
+
+    def get_commonality(self):
+        """Get a float representation of the effective count accuracy. The
+        higher the number, the less uniform the keys being added, and
+        the higher the accuracy and efficiency of the ThresholdCounter.
+
+        If a stronger measure of data cardinality is required,
+        consider using hyperloglog.
+        """
+        return float(self.get_common_count()) / self.total
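
How the three accessors above relate, in a hypothetical skewed stream::

    tc = ThresholdCounter(threshold=0.25)
    tc.update(['a'] * 9 + ['b'])
    tc.get_common_count()    # counts still held in the map (at least 'a''s 9)
    tc.get_uncommon_count()  # total minus the above: the culled long tail
    tc.get_commonality()     # common / total; approaches 1.0 for skewed data
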
 
     def __getitem__(self, key):
         return self._count_map[key][0]
@@ -547,15 +593,18 @@ class ThresholdCounter(object):
         return list(self.iteritems())
 
     def get(self, key, default=0):
+        "Get count for *key*, defaulting to 0."
         try:
             return self[key]
         except KeyError:
             return default
 
     def update(self, iterable, **kwargs):
-        """Like dict.update() but add counts instead of replacing them.
-
-        Source can be an iterable and a dictionary.
-        """
+        """Like dict.update() but add counts instead of replacing them, used
+        to add multiple items in one call.
+
+        Source can be an iterable of keys to add, or a mapping of keys
+        to integer counts.
+        """
         if iterable is not None:
             if callable(getattr(iterable, 'iteritems', None)):
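
The two accepted source types, per the docstring above; a hypothetical
session at the default threshold, so no culling occurs at these sizes::

    tc = ThresholdCounter()
    tc.update(['x', 'y', 'x'])     # iterable: each occurrence adds 1
    tc.update({'x': 2, 'z': 5})    # mapping: each key added count times
    tc.get('x')                    # 4
    tc.get('unseen')               # 0, the default
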

docs/cacheutils.rst

@@ -39,6 +39,13 @@ values: the :func:`cached` function decorator.
 .. autofunction:: boltons.cacheutils.cached
 
-Similar functionality can be found in Python 3.4's :mod:`functools`
-module, though it is made for cache pluggability and does not support
-sharing the cache object across multiple functions.
+Similar functionality can be found in Python 3.2+'s
+:func:`functools.lru_cache` decorator, but the functools approach does
+not support the same cache strategy modification or sharing the cache
+object across multiple functions.
+
+Threshold-bounded Counting
+--------------------------
+
+.. autoclass:: boltons.cacheutils.ThresholdCounter
+   :members:
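
A sketch of the cache-sharing point above; the decorator usage is assumed
from the :func:`cached` reference, and the stat attributes from the module
docstring::

    from boltons.cacheutils import LRU, cached

    shared_cache = LRU(max_size=128)

    @cached(shared_cache)
    def fib(n):
        return n if n < 2 else fib(n - 1) + fib(n - 2)

    @cached(shared_cache)
    def fact(n):
        return 1 if n < 2 else n * fact(n - 1)

    fib(30)
    fact(10)
    # both functions memoize into the single shared LRU instance
    print(shared_cache.hit_count, shared_cache.miss_count)
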