mirror of https://github.com/mahmoud/boltons.git
updated ThresholdCounter with a bunch of functionality and docs
This commit is contained in:
parent 6be76fddd6
commit e2e5094bf6
--- a/boltons/cacheutils.py
+++ b/boltons/cacheutils.py
@@ -23,8 +23,12 @@ statistics are:
 subset of misses, so this number is always less than or equal to
 ``miss_count``.
+
+Additionally, ``cacheutils`` provides the cache-like bounded counter,
+:class:`ThresholdCounter`.
+
 Learn more about `caching algorithms on Wikipedia
 <https://en.wikipedia.org/wiki/Cache_algorithms#Examples>`_.
 
 """
 
 # TODO: clarify soft_miss_count. is it for .get and .set_default or is
@@ -33,7 +37,7 @@ Learn more about `caching algorithms on Wikipedia
 
 # TODO: TimedLRI
 # TODO: support 0 max_size?
-__all__ = ['LRI', 'LRU', 'ThresholdCache']
+__all__ = ['LRI', 'LRU', 'cached', 'ThresholdCache']
 
 import itertools
 from collections import deque
@@ -98,7 +102,7 @@ class LRU(dict):
     (3, 1, 1)
 
     Other than the size-limiting caching behavior and statistics,
-    ``LRU`` acts like its parent class, the built-in Python dict.
+    ``LRU`` acts like its parent class, the built-in Python :class:`dict`.
     """
     def __init__(self, max_size=DEFAULT_MAX_SIZE, values=None,
                  on_miss=None):
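For context on the ``LRU`` docstring touched here, a minimal usage sketch of the dict-like interface and the hit/miss statistics described in the module docstring; the printed values depend on the access pattern and are not taken from this commit:

```python
from boltons.cacheutils import LRU

# LRU acts like a dict, but evicts the least-recently-used key once
# max_size is exceeded.
cache = LRU(max_size=2)
cache['a'] = 'A'
cache['b'] = 'B'
cache['c'] = 'C'              # 'a' is least recently used, so it is evicted
print('a' in cache)           # False
print(cache['b'])             # 'B' -- counted as a hit

# the statistics described in the module docstring accumulate as the
# cache is used
print(cache.hit_count, cache.miss_count, cache.soft_miss_count)
```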
@@ -444,7 +448,9 @@ class ThresholdCounter(object):
     ThresholdCounter automatically compacts after every (1 /
     *threshold*) additions, maintaining exact counts for any keys
     whose count represents at least a *threshold* ratio of the total
-    data.
+    data. In other words, if a particular key is not present in the
+    ThresholdCounter, its count represents less than *threshold* of
+    the total data.
 
     >>> tc = ThresholdCounter(threshold=0.1)
     >>> tc.add(1)
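A small sketch of the guarantee the added sentences describe: keys that stay above the *threshold* ratio keep exact counts, while keys below it may be compacted away (the exact culling point follows the compaction logic shown later in this diff):

```python
from boltons.cacheutils import ThresholdCounter

tc = ThresholdCounter(threshold=0.1)   # compaction every 10 additions
tc.add('rare')
tc.update(['common'] * 50)

print(tc.get('common'))   # 50 -- always above 10% of the stream, kept exact
print(tc.get('rare'))     # 0  -- fell below the threshold and was culled
```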
@@ -460,22 +466,27 @@ class ThresholdCounter(object):
     11
 
     As you can see above, the API is kept similar to
-    collections.Counter. The most notable feature omissions being that
-    counted items cannot be set directly, uncounted, or removed, as
-    this would disrupt the math.
+    :class:`collections.Counter`. The most notable feature omissions
+    being that counted items cannot be set directly, uncounted, or
+    removed, as this would disrupt the math.
 
     Use the ThresholdCounter when you need best-effort long-lived
     counts for dynamically-keyed data. Without a bounded datastructure
     such as this one, the dynamic keys often represent a memory leak
     and can impact application reliability. The ThresholdCounter's
-    item replacement strategy can be thought of as *Amortized Least
-    Relevant*.
+    item replacement strategy is fully deterministic and can be
+    thought of as *Amortized Least Relevant*. The absolute upper bound
+    of keys it will store is *(2/threshold)*, but realistically
+    *(1/threshold)* is expected for uniformly random datastreams, and
+    one or two orders of magnitude better for real-world data.
 
     This algorithm is an implementation of the Lossy Counting
     algorithm described in "Approximate Frequency Counts over Data
     Streams" by Manku & Motwani. Hat tip to Kurt Rose for discovery
     and initial implementation.
+
     """
+    # TODO: hit_count/miss_count?
     def __init__(self, threshold=0.001):
         if not 0 < threshold < 1:
             raise ValueError('expected threshold between 0 and 1, not: %r'
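The Lossy Counting scheme cited in the docstring can be restated as a standalone sketch; this is a simplified, hypothetical `lossy_counts` helper that mirrors the bucket-and-prune compaction visible further down in this diff, not code from boltons:

```python
def lossy_counts(stream, threshold=0.001):
    """Simplified Lossy Counting (Manku & Motwani): prune at every bucket
    boundary so that at most roughly 2/threshold keys are retained, while
    any key whose frequency is >= threshold of the stream survives."""
    bucket_width = int(round(1 / threshold))
    counts = {}                      # key -> [count, bucket_at_insert - 1]
    cur_bucket = 1
    for n, key in enumerate(stream, 1):
        if key in counts:
            counts[key][0] += 1
        else:
            counts[key] = [1, cur_bucket - 1]
        if n % bucket_width == 0:    # end of a bucket: compact
            counts = dict((k, v) for k, v in counts.items()
                          if sum(v) > cur_bucket)
            cur_bucket += 1
    return dict((k, v[0]) for k, v in counts.items())
```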
@ -492,6 +503,11 @@ class ThresholdCounter(object):
|
||||||
return self._threshold
|
return self._threshold
|
||||||
|
|
||||||
def add(self, key):
|
def add(self, key):
|
||||||
|
"""Increment the count of *key* by 1, automatically adding it if it
|
||||||
|
does not exist.
|
||||||
|
|
||||||
|
Cache compaction is triggered every *1/threshold* additions.
|
||||||
|
"""
|
||||||
self.total += 1
|
self.total += 1
|
||||||
try:
|
try:
|
||||||
self._count_map[key][0] += 1
|
self._count_map[key][0] += 1
|
||||||
|
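A quick worked example of the *1/threshold* relationship stated in the new ``add`` docstring, using the default threshold from ``__init__`` above:

```python
threshold = 0.001                         # the default from __init__
compaction_interval = int(1 / threshold)  # compaction runs every 1000 additions
worst_case_keys = int(2 / threshold)      # at most ~2000 keys held between compactions
print(compaction_interval, worst_case_keys)  # 1000 2000
```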
@ -502,12 +518,19 @@ class ThresholdCounter(object):
|
||||||
self._count_map = dict([(k, v) for k, v in self._count_map.items()
|
self._count_map = dict([(k, v) for k, v in self._count_map.items()
|
||||||
if sum(v) > self._cur_bucket])
|
if sum(v) > self._cur_bucket])
|
||||||
self._cur_bucket += 1
|
self._cur_bucket += 1
|
||||||
|
return
|
||||||
|
|
||||||
def elements(self):
|
def elements(self):
|
||||||
|
"""Return an iterator of all the common elements tracked by the
|
||||||
|
counter. Yields each key as many times as it has been seen.
|
||||||
|
"""
|
||||||
repeaters = itertools.starmap(itertools.repeat, self.iteritems())
|
repeaters = itertools.starmap(itertools.repeat, self.iteritems())
|
||||||
return itertools.chain.from_iterable(repeaters)
|
return itertools.chain.from_iterable(repeaters)
|
||||||
|
|
||||||
def most_common(self, n=None):
|
def most_common(self, n=None):
|
||||||
|
"""Get the top *n* keys and counts as tuples. If *n* is omitted,
|
||||||
|
returns all the pairs.
|
||||||
|
"""
|
||||||
if n <= 0:
|
if n <= 0:
|
||||||
return []
|
return []
|
||||||
ret = sorted(self.iteritems(), key=lambda x: x[1][0], reverse=True)
|
ret = sorted(self.iteritems(), key=lambda x: x[1][0], reverse=True)
|
||||||
|
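The ``elements()`` implementation above leans on a compact itertools idiom; here is how it behaves on plain ``(key, count)`` pairs (assuming ``iteritems()`` yields such pairs):

```python
import itertools

items = [('a', 3), ('b', 1)]
# repeat('a', 3), repeat('b', 1), ... chained into one flat stream
repeaters = itertools.starmap(itertools.repeat, items)
print(list(itertools.chain.from_iterable(repeaters)))
# ['a', 'a', 'a', 'b']
```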
@@ -515,6 +538,29 @@ class ThresholdCounter(object):
             return ret
         return ret[:n]
 
+    def get_common_count(self):
+        """Get the sum of counts for keys exceeding the configured data
+        threshold.
+        """
+        return sum([count for count, _ in self._count_map.itervalues()])
+
+    def get_uncommon_count(self):
+        """Get the sum of counts for keys that were culled because the
+        associated counts represented less than the configured
+        threshold. The long-tail counts.
+        """
+        return self.total - self.get_common_count()
+
+    def get_commonality(self):
+        """Get a float representation of the effective count accuracy. The
+        higher the number, the less uniform the keys being added, and
+        the higher accuracy and efficiency of the ThresholdCounter.
+
+        If a stronger measure of data cardinality is required,
+        consider using hyperloglog.
+        """
+        return float(self.get_common_count()) / self.total
+
     def __getitem__(self, key):
         return self._count_map[key][0]
 
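The three new accessor methods partition the total count; a small sketch of how they relate (the specific numbers depend on what has been compacted):

```python
from boltons.cacheutils import ThresholdCounter

tc = ThresholdCounter(threshold=0.1)
tc.update(['a'] * 8 + list('bcdefghijk'))   # one heavy key plus a long tail

common = tc.get_common_count()      # counts still tracked in the counter
uncommon = tc.get_uncommon_count()  # counts lost to compaction
assert common + uncommon == tc.total
print(tc.get_commonality())         # common / total, as a float
```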
@@ -547,15 +593,18 @@ class ThresholdCounter(object):
         return list(self.iteritems())
 
     def get(self, key, default=0):
+        "Get count for *key*, defaulting to 0."
         try:
             return self[key]
         except KeyError:
             return default
 
     def update(self, iterable, **kwargs):
-        """Like dict.update() but add counts instead of replacing them.
+        """Like dict.update() but add counts instead of replacing them, used
+        to add multiple items in one call.
 
-        Source can be an iterable and a dictionary.
+        Source can be an iterable of keys to add, or a mapping of keys
+        to integer counts.
         """
         if iterable is not None:
             if callable(getattr(iterable, 'iteritems', None)):
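A sketch of the two input styles the reworded ``update`` docstring describes; the mapping branch relies on the Python 2 ``iteritems()`` check visible in the context line above:

```python
from boltons.cacheutils import ThresholdCounter

tc = ThresholdCounter(threshold=0.1)
tc.update(['x', 'y', 'x'])       # iterable of keys: each occurrence adds 1
tc.update({'x': 3})              # mapping of key -> count: adds 3 more to 'x'
print(tc.get('x'), tc.get('y'))  # 5 1 (under Python 2, where dicts have iteritems())
```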
--- a/docs/cacheutils.rst
+++ b/docs/cacheutils.rst
@@ -39,6 +39,13 @@ values: the :func:`cached` function decorator.
 
 .. autofunction:: boltons.cacheutils.cached
 
-Similar functionality can be found in Python 3.4's :mod:`functools`
-module, though it is made for cache pluggability and does not support
-sharing the cache object across multiple functions.
+Similar functionality can be found in Python 3.4's
+:func:`functools.lru_cache` decorator, but the functools approach does
+not support the same cache strategy modification or sharing the cache
+object across multiple functions.
+
+Threshold-bounded Counting
+--------------------------
+
+.. autoclass:: boltons.cacheutils.ThresholdCounter
+   :members:
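The reworded paragraph contrasts ``cached`` with :func:`functools.lru_cache` on cache sharing; a minimal sketch of the shared-cache usage, assuming the decorator form documented by the ``autofunction`` directive above:

```python
from boltons.cacheutils import LRU, cached

# One bounded cache object shared by two functions; functools.lru_cache
# would give each decorated function its own private cache instead.
shared_cache = LRU(max_size=128)

@cached(shared_cache)
def squared(x):
    return x * x

@cached(shared_cache)
def cubed(x):
    return x * x * x

squared(3)
cubed(3)
print(len(shared_cache))   # both memoized results live in the same LRU
```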