updated ThresholdCounter with a bunch of functionality and docs

This commit is contained in:
Mahmoud Hashemi 2015-06-30 23:08:52 -07:00
parent 6be76fddd6
commit e2e5094bf6
2 changed files with 69 additions and 13 deletions


@ -23,8 +23,12 @@ statistics are:
subset of misses, so this number is always less than or equal to
``miss_count``.
Additionally, ``cacheutils`` provides the cache-like bounded counter,
:class:`ThresholdCounter`.
Learn more about `caching algorithms on Wikipedia
<https://en.wikipedia.org/wiki/Cache_algorithms#Examples>`_.
"""
# TODO: clarify soft_miss_count. is it for .get and .set_default or is
@ -33,7 +37,7 @@ Learn more about `caching algorithms on Wikipedia
# TODO: TimedLRI
# TODO: support 0 max_size?
__all__ = ['LRI', 'LRU', 'ThresholdCache']
__all__ = ['LRI', 'LRU', 'cached', 'ThresholdCounter']
import itertools
from collections import deque
@ -98,7 +102,7 @@ class LRU(dict):
(3, 1, 1)
Other than the size-limiting caching behavior and statistics,
``LRU`` acts like its parent class, the built-in Python dict.
``LRU`` acts like its parent class, the built-in Python :class:`dict`.
"""
def __init__(self, max_size=DEFAULT_MAX_SIZE, values=None,
on_miss=None):
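To make the dict-like behavior concrete, here is a brief usage sketch of ``LRU``; it only uses the ``max_size`` parameter visible in the signature above and the ordinary :class:`dict` interface, and the specific values are illustrative rather than asserted output.

from boltons.cacheutils import LRU

cache = LRU(max_size=3)         # bounded, dict-like cache
for i in range(5):
    cache[i] = i * i            # inserting 5 items into a 3-slot cache
print(len(cache) <= 3)          # True: least-recently-used entries were evicted
print(cache.get(4, 'missing'))  # 16: the most recent insertions survive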
@ -444,7 +448,9 @@ class ThresholdCounter(object):
ThresholdCounter automatically compacts after every (1 /
*threshold*) additions, maintaining exact counts for any keys
whose count represents at least a *threshold* ratio of the total
data.
data. In other words, if a particular key is not present in the
ThresholdCounter, its count represents less than *threshold* of
the total data.
>>> tc = ThresholdCounter(threshold=0.1)
>>> tc.add(1)
@ -460,22 +466,27 @@ class ThresholdCounter(object):
11
As you can see above, the API is kept similar to
collections.Counter. The most notable feature omissions being that
counted items cannot be set directly, uncounted, or removed, as
this would disrupt the math.
:class:`collections.Counter`. The most notable feature omissions
being that counted items cannot be set directly, uncounted, or
removed, as this would disrupt the math.
Use the ThresholdCounter when you need best-effort long-lived
counts for dynamically-keyed data. Without a bounded data structure
such as this one, the dynamic keys often represent a memory leak
and can impact application reliability. The ThresholdCounter's
item replacement strategy can be thought of as *Amortized Least
Relevant*.
item replacement strategy is fully deterministic and can be
thought of as *Amortized Least Relevant*. The absolute upper bound
of keys it will store is *(2/threshold)*, but realistically
*(1/threshold)* is expected for uniformly random data streams, and
one or two orders of magnitude better for real-world data.
This algorithm is an implementation of the Lossy Counting
algorithm described in "Approximate Frequency Counts over Data
Streams" by Manku & Motwani. Hat tip to Kurt Rose for discovery
and initial implementation.
"""
# TODO: hit_count/miss_count?
def __init__(self, threshold=0.001):
if not 0 < threshold < 1:
raise ValueError('expected threshold between 0 and 1, not: %r'
@ -492,6 +503,11 @@ class ThresholdCounter(object):
return self._threshold
def add(self, key):
"""Increment the count of *key* by 1, automatically adding it if it
does not exist.
Cache compaction is triggered every *1/threshold* additions.
"""
self.total += 1
try:
self._count_map[key][0] += 1
@ -502,12 +518,19 @@ class ThresholdCounter(object):
self._count_map = dict([(k, v) for k, v in self._count_map.items()
if sum(v) > self._cur_bucket])
self._cur_bucket += 1
return
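For illustration, a small sketch of the culling behavior implemented above; the exact timing of compaction depends on internals not shown in this hunk, so the expected values in the comments are hedged rather than exact doctest output.

from boltons.cacheutils import ThresholdCounter

tc = ThresholdCounter(threshold=0.25)   # compaction roughly every 4 additions
for key in ['a', 'a', 'a', 'b']:        # 'b' appears only once in this window
    tc.add(key)

print(tc.get('a'))   # 3: 'a' clears the threshold, so its exact count is kept
print(tc.get('b'))   # expected 0: 'b' falls below the threshold and is culled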
def elements(self):
"""Return an iterator of all the common elements tracked by the
counter. Yields each key as many times as it has been seen.
"""
repeaters = itertools.starmap(itertools.repeat, self.iteritems())
return itertools.chain.from_iterable(repeaters)
def most_common(self, n=None):
"""Get the top *n* keys and counts as tuples. If *n* is omitted,
returns all the pairs.
"""
if n is not None and n <= 0:
return []
ret = sorted(self.iteritems(), key=lambda x: x[1][0], reverse=True)
@ -515,6 +538,29 @@ class ThresholdCounter(object):
return ret
return ret[:n]
def get_common_count(self):
"""Get the sum of counts for keys exceeding the configured data
threshold.
"""
return sum([count for count, _ in self._count_map.itervalues()])
def get_uncommon_count(self):
"""Get the sum of counts for keys that were culled because the
associated counts represented less than the configured
threshold. The long-tail counts.
"""
return self.total - self.get_common_count()
def get_commonality(self):
"""Get a float representation of the effective count accuracy. The
higher the number, the less uniform the keys being added, and
the higher the accuracy and efficiency of the ThresholdCounter.
If a stronger measure of data cardinality is required,
consider using hyperloglog.
"""
return float(self.get_common_count()) / self.total
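A brief usage sketch tying the three accounting helpers above together; the skewed stream is illustrative and the printed figures are not asserted outputs.

from boltons.cacheutils import ThresholdCounter

tc = ThresholdCounter(threshold=0.05)
for key in ['a'] * 50 + ['b'] * 30 + list(range(20)):   # skewed data with a long tail
    tc.add(key)

print(tc.get_common_count())     # counts retained for keys above the threshold
print(tc.get_uncommon_count())   # counts attributed to culled, long-tail keys
print(tc.get_commonality())      # common / total, a rough accuracy figure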
def __getitem__(self, key):
return self._count_map[key][0]
@ -547,15 +593,18 @@ class ThresholdCounter(object):
return list(self.iteritems())
def get(self, key, default=0):
"Get count for *key*, defaulting to 0."
try:
return self[key]
except KeyError:
return default
def update(self, iterable, **kwargs):
"""Like dict.update() but add counts instead of replacing them.
"""Like dict.update() but add counts instead of replacing them, used
to add multiple items in one call.
Source can be an iterable and a dictionary.
Source can be an iterable of keys to add, or a mapping of keys
to integer counts.
"""
if iterable is not None:
if callable(getattr(iterable, 'iteritems', None)):
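A short usage sketch of update() with both documented source types; the rest of the method body is elided by the hunk above, the mapping branch keys off iteritems() as shown, and the final count assumes no compaction has been triggered yet.

from boltons.cacheutils import ThresholdCounter

tc = ThresholdCounter(threshold=0.1)
tc.update(['a', 'b', 'a'])        # iterable of keys: each occurrence adds 1
tc.update({'a': 3, 'c': 2})       # mapping of keys to integer counts
print(tc.get('a'))                # 5, assuming no compaction has culled 'a'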


@ -39,6 +39,13 @@ values: the :func:`cached` function decorator.
.. autofunction:: boltons.cacheutils.cached
Similar functionality can be found in Python 3.4's :mod:`functools`
module, though it is made for cache pluggability and does not support
sharing the cache object across multiple functions.
Similar functionality can be found in Python 3.4's
:func:`functools.lru_cache` decorator, but the functools approach does
not support the same cache strategy modification or sharing the cache
object across multiple functions.
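As a sketch of the cache-sharing point above: the decorator is assumed to take the shared cache mapping as its first argument, and the function names and values here are hypothetical.

from boltons.cacheutils import LRU, cached

shared_cache = LRU(max_size=128)    # one bounded cache, visible to the caller

@cached(shared_cache)
def slow_square(n):
    print('computing %r squared' % n)    # printed only on a cache miss
    return n * n

@cached(shared_cache)
def slow_upper(text):
    print('uppercasing %r' % text)
    return text.upper()

slow_square(12)
slow_square(12)            # second call is served from shared_cache
slow_upper('boltons')
print(len(shared_cache))   # results from both functions live in the same LRU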
Threshold-bounded Counting
--------------------------
.. autoclass:: boltons.cacheutils.ThresholdCounter
:members: