Cache hash codes (#426)

* First stab at implementing hashcode caching (#423)

Currently all existing tests pass but no cache_hash tests have yet
been added.

* Existing hash tests now pass on cache_hash classes

* Add towncrier change log

* Add documentation for cache_hash

* Fixes bug with check that init=True if cache_hash=True

* Fix long lines

* Fix documentation issues

* Add test for cache_hash requiring init

* Improve test coverage

* Remove now unnecessary 'pass'

* Add periods to the end of exception strings

* Add test docstrings for cache_hash tests

* Clarify documentation of cache_hash

* Recommend that hashable classes be frozen

* Fix test references for exception messages
This commit is contained in:
Ryan Gabbard 2018-08-20 00:46:52 -04:00 committed by Hynek Schlawack
parent 123df67041
commit 3e0ecbd891
9 changed files with 248 additions and 31 deletions

View File

@ -0,0 +1 @@
Added ``cache_hash`` option to ``@attr.s`` which causes the hash code to be computed once and stored on the object.

View File

@ -18,7 +18,7 @@ What follows is the API explanation, if you'd like a more hands-on introduction,
Core
----
.. autofunction:: attr.s(these=None, repr_ns=None, repr=True, cmp=True, hash=None, init=True, slots=False, frozen=False, str=False, auto_attribs=False)
.. autofunction:: attr.s(these=None, repr_ns=None, repr=True, cmp=True, hash=None, init=True, slots=False, frozen=False, str=False, auto_attribs=False, cache_hash=False)
.. note::

View File

@ -1,6 +1,9 @@
Hashing
=======
Hash Method Generation
----------------------
.. warning::
The overarching theme is to never set the ``@attr.s(hash=X)`` parameter yourself.
@ -18,7 +21,7 @@ The *hash* of an object is an integer that represents the contents of an object.
It can be obtained by calling :func:`hash` on an object and is implemented by writing a ``__hash__`` method for your class.
``attrs`` will happily write a ``__hash__`` method you [#fn1]_, however it will *not* do so by default.
Because according to the definition_ from the official Python docs, the returned hash has to fullfil certain constraints:
Because according to the definition_ from the official Python docs, the returned hash has to fulfill certain constraints:
#. Two objects that are equal, **must** have the same hash.
This means that if ``x == y``, it *must* follow that ``hash(x) == hash(y)``.
@ -50,6 +53,20 @@ Because according to the definition_ from the official Python docs, the returned
For a more thorough explanation of this topic, please refer to this blog post: `Python Hashes and Equality`_.
Hashing and Mutability
----------------------
Changing any field involved in hash code computation after the first call to `__hash__` (typically this would be after its insertion into a hash-based collection) can result in silent bugs.
Therefore, it is strongly recommended that hashable classes be ``frozen``.
Hash Code Caching
-----------------
Some objects have hash codes which are expensive to compute.
If such objects are to be stored in hash-based collections, it can be useful to compute the hash codes only once and then store the result on the object to make future hash code requests fast.
To enable caching of hash codes, pass ``cache_hash=True`` to ``@attrs``.
This may only be done if ``attrs`` is already generating a hash function for the object.
.. [#fn1] The hash is computed by hashing a tuple that consists of an unique id for the class plus all attribute values.
.. _definition: https://docs.python.org/3/glossary.html#term-hashable

View File

@ -348,6 +348,7 @@ If you need to set attributes on a frozen class, you'll have to resort to the :r
>>> Frozen(1)
Frozen(x=1, y=2)
Note that you *must not* access the hash code of the object in ``__attrs_post__init__`` if ``cache_hash=True``.
.. _`Wiki page`: https://github.com/python-attrs/attrs/wiki/Extensions-to-attrs
.. _`get confused`: https://github.com/python-attrs/attrs/issues/289

View File

@ -167,6 +167,7 @@ def attrs(
str: bool = ...,
auto_attribs: bool = ...,
kw_only: bool = ...,
cache_hash: bool = ...,
) -> _C: ...
@overload
def attrs(
@ -182,6 +183,7 @@ def attrs(
str: bool = ...,
auto_attribs: bool = ...,
kw_only: bool = ...,
cache_hash: bool = ...,
) -> Callable[[_C], _C]: ...
# TODO: add support for returning NamedTuple from the mypy plugin
@ -208,6 +210,7 @@ def make_class(
str: bool = ...,
auto_attribs: bool = ...,
kw_only: bool = ...,
cache_hash: bool = ...,
) -> type: ...
# _funcs --

View File

@ -33,6 +33,10 @@ _init_converter_pat = "__attr_converter_{}"
_init_factory_pat = "__attr_factory_{}"
_tuple_property_pat = " {attr_name} = property(itemgetter({index}))"
_classvar_prefixes = ("typing.ClassVar", "t.ClassVar", "ClassVar")
# we don't use a double-underscore prefix because that triggers
# name mangling when trying to create a slot for the field
# (when slots=True)
_hash_cache_field = "_attrs_cached_hash"
_empty_metadata_singleton = metadata_proxy({})
@ -446,12 +450,15 @@ class _ClassBuilder(object):
"_attr_names",
"_slots",
"_frozen",
"_cache_hash",
"_has_post_init",
"_delete_attribs",
"_super_attr_map",
)
def __init__(self, cls, these, slots, frozen, auto_attribs, kw_only):
def __init__(
self, cls, these, slots, frozen, auto_attribs, kw_only, cache_hash
):
attrs, super_attrs, super_map = _transform_attrs(
cls, these, auto_attribs, kw_only
)
@ -464,6 +471,7 @@ class _ClassBuilder(object):
self._attr_names = tuple(a.name for a in attrs)
self._slots = slots
self._frozen = frozen or _has_frozen_superclass(cls)
self._cache_hash = cache_hash
self._has_post_init = bool(getattr(cls, "__attrs_post_init__", False))
self._delete_attribs = not bool(these)
@ -522,9 +530,12 @@ class _ClassBuilder(object):
# We only add the names of attributes that aren't inherited.
# Settings __slots__ to inherited attributes wastes memory.
cd["__slots__"] = tuple(
slot_names = [
name for name in self._attr_names if name not in super_names
)
]
if self._cache_hash:
slot_names.append(_hash_cache_field)
cd["__slots__"] = tuple(slot_names)
qualname = getattr(self._cls, "__qualname__", None)
if qualname is not None:
@ -603,7 +614,9 @@ class _ClassBuilder(object):
def add_hash(self):
self._cls_dict["__hash__"] = self._add_method_dunders(
_make_hash(self._attrs)
_make_hash(
self._attrs, frozen=self._frozen, cache_hash=self._cache_hash
)
)
return self
@ -615,6 +628,7 @@ class _ClassBuilder(object):
self._has_post_init,
self._frozen,
self._slots,
self._cache_hash,
self._super_attr_map,
)
)
@ -664,6 +678,7 @@ def attrs(
str=False,
auto_attribs=False,
kw_only=False,
cache_hash=False,
):
r"""
A class decorator that adds `dunder
@ -764,6 +779,12 @@ def attrs(
:param bool kw_only: Make all attributes keyword-only (Python 3+)
in the generated ``__init__`` (if ``init`` is ``False``, this
parameter is ignored).
:param bool cache_hash: Ensure that the object's hash code is computed
only once and stored on the object. If this is set to ``True``,
hashing must be either explicitly or implicitly enabled for this
class. If the hash code is cached, then no attributes of this
class which participate in hash code computation may be mutated
after object creation.
.. versionadded:: 16.0.0 *slots*
@ -782,6 +803,7 @@ def attrs(
each other. ``__eq`` and ``__ne__`` never tried to compared subclasses
to each other.
.. versionadded:: 18.2.0 *kw_only*
.. versionadded:: 18.2.0 *cache_hash*
"""
def wrap(cls):
@ -789,7 +811,7 @@ def attrs(
raise TypeError("attrs only works with new-style classes.")
builder = _ClassBuilder(
cls, these, slots, frozen, auto_attribs, kw_only
cls, these, slots, frozen, auto_attribs, kw_only, cache_hash
)
if repr is True:
@ -805,14 +827,31 @@ def attrs(
"Invalid value for hash. Must be True, False, or None."
)
elif hash is False or (hash is None and cmp is False):
pass
if cache_hash:
raise TypeError(
"Invalid value for cache_hash. To use hash caching,"
" hashing must be either explicitly or implicitly "
"enabled."
)
elif hash is True or (hash is None and cmp is True and frozen is True):
builder.add_hash()
else:
if cache_hash:
raise TypeError(
"Invalid value for cache_hash. To use hash caching,"
" hashing must be either explicitly or implicitly "
"enabled."
)
builder.make_unhashable()
if init is True:
builder.add_init()
else:
if cache_hash:
raise TypeError(
"Invalid value for cache_hash. To use hash caching,"
" init must be True."
)
return builder.build_class()
@ -862,29 +901,54 @@ def _attrs_to_tuple(obj, attrs):
return tuple(getattr(obj, a.name) for a in attrs)
def _make_hash(attrs):
def _make_hash(attrs, frozen, cache_hash):
attrs = tuple(
a
for a in attrs
if a.hash is True or (a.hash is None and a.cmp is True)
)
tab = " "
# We cache the generated hash methods for the same kinds of attributes.
sha1 = hashlib.sha1()
sha1.update(repr(attrs).encode("utf-8"))
unique_filename = "<attrs generated hash %s>" % (sha1.hexdigest(),)
type_hash = hash(unique_filename)
lines = [
"def __hash__(self):",
" return hash((",
" %d," % (type_hash,),
]
for a in attrs:
lines.append(" self.%s," % (a.name))
lines.append(" ))")
method_lines = ["def __hash__(self):"]
script = "\n".join(lines)
def append_hash_computation_lines(prefix, indent):
"""
Generate the code for actually computing the hash code.
Below this will either be returned directly or used to compute
a value which is then cached, depending on the value of cache_hash
"""
method_lines.extend(
[indent + prefix + "hash((", indent + " %d," % (type_hash,)]
)
for a in attrs:
method_lines.append(indent + " self.%s," % a.name)
method_lines.append(indent + " ))")
if cache_hash:
method_lines.append(tab + "if self.%s is None:" % _hash_cache_field)
if frozen:
append_hash_computation_lines(
"object.__setattr__(self, '%s', " % _hash_cache_field, tab * 2
)
method_lines.append(tab * 2 + ")") # close __setattr__
else:
append_hash_computation_lines(
"self.%s = " % _hash_cache_field, tab * 2
)
method_lines.append(tab + "return self.%s" % _hash_cache_field)
else:
append_hash_computation_lines("return ", tab)
script = "\n".join(method_lines)
globs = {}
locs = {}
bytecode = compile(script, unique_filename, "exec")
@ -906,7 +970,7 @@ def _add_hash(cls, attrs):
"""
Add a hash method to *cls*.
"""
cls.__hash__ = _make_hash(attrs)
cls.__hash__ = _make_hash(attrs, frozen=False, cache_hash=False)
return cls
@ -1108,7 +1172,7 @@ def _add_repr(cls, ns=None, attrs=None):
return cls
def _make_init(attrs, post_init, frozen, slots, super_attr_map):
def _make_init(attrs, post_init, frozen, slots, cache_hash, super_attr_map):
attrs = [a for a in attrs if a.init or a.default is not NOTHING]
# We cache the generated init methods for the same kinds of attributes.
@ -1117,7 +1181,7 @@ def _make_init(attrs, post_init, frozen, slots, super_attr_map):
unique_filename = "<attrs generated init {0}>".format(sha1.hexdigest())
script, globs, annotations = _attrs_to_init_script(
attrs, frozen, slots, post_init, super_attr_map
attrs, frozen, slots, post_init, cache_hash, super_attr_map
)
locs = {}
bytecode = compile(script, unique_filename, "exec")
@ -1152,7 +1216,8 @@ def _add_init(cls, frozen):
getattr(cls, "__attrs_post_init__", False),
frozen,
_is_slot_cls(cls),
{},
cache_hash=False,
super_attr_map={},
)
return cls
@ -1241,7 +1306,9 @@ def _is_slot_attr(a_name, super_attr_map):
return a_name in super_attr_map and _is_slot_cls(super_attr_map[a_name])
def _attrs_to_init_script(attrs, frozen, slots, post_init, super_attr_map):
def _attrs_to_init_script(
attrs, frozen, slots, post_init, cache_hash, super_attr_map
):
"""
Return a script of an initializer for *attrs* and a dict of globals.
@ -1259,6 +1326,7 @@ def _attrs_to_init_script(attrs, frozen, slots, post_init, super_attr_map):
lines.append(
# Circumvent the __setattr__ descriptor to save one lookup per
# assignment.
# Note _setattr will be used again below if cache_hash is True
"_setattr = _cached_setattr.__get__(self, self.__class__)"
)
@ -1280,6 +1348,7 @@ def _attrs_to_init_script(attrs, frozen, slots, post_init, super_attr_map):
# Dict frozen classes assign directly to __dict__.
# But only if the attribute doesn't come from an ancestor slot
# class.
# Note _inst_dict will be used again below if cache_hash is True
lines.append("_inst_dict = self.__dict__")
if any_slot_ancestors:
lines.append(
@ -1470,6 +1539,23 @@ def _attrs_to_init_script(attrs, frozen, slots, post_init, super_attr_map):
if post_init:
lines.append("self.__attrs_post_init__()")
# because this is set only after __attrs_post_init is called, a crash
# will result if post-init tries to access the hash code. This seemed
# preferable to setting this beforehand, in which case alteration to
# field values during post-init combined with post-init accessing the
# hash code would result in silent bugs.
if cache_hash:
if frozen:
if slots:
# if frozen and slots, then _setattr defined above
init_hash_cache = "_setattr('%s', %s)"
else:
# if frozen and not slots, then _inst_dict defined above
init_hash_cache = "_inst_dict['%s'] = %s"
else:
init_hash_cache = "self.%s = %s"
lines.append(init_hash_cache % (_hash_cache_field, "None"))
args = ", ".join(args)
if kw_only_args:
if PY2:

View File

@ -33,9 +33,19 @@ ReprC = simple_class(repr=True)
ReprCSlots = simple_class(repr=True, slots=True)
# HashC is hashable by explicit definition while HashCSlots is hashable
# implicitly.
# implicitly. The "Cached" versions are the same, except with hash code
# caching enabled
HashC = simple_class(hash=True)
HashCSlots = simple_class(hash=None, cmp=True, frozen=True, slots=True)
HashCCached = simple_class(hash=True, cache_hash=True)
HashCSlotsCached = simple_class(
hash=None, cmp=True, frozen=True, slots=True, cache_hash=True
)
# the cached hash code is stored slightly differently in this case
# so it needs to be tested separately
HashCFrozenNotSlotsCached = simple_class(
frozen=True, slots=False, hash=True, cache_hash=True
)
class InitC(object):
@ -283,8 +293,42 @@ class TestAddHash(object):
assert exc_args == e.value.args
@given(booleans())
def test_hash_attribute(self, slots):
def test_enforce_no_cache_hash_without_hash(self):
"""
Ensure exception is thrown if caching the hash code is requested
but attrs is not requested to generate `__hash__`.
"""
exc_args = (
"Invalid value for cache_hash. To use hash caching,"
" hashing must be either explicitly or implicitly "
"enabled.",
)
with pytest.raises(TypeError) as e:
make_class("C", {}, hash=False, cache_hash=True)
assert exc_args == e.value.args
# unhashable case
with pytest.raises(TypeError) as e:
make_class(
"C", {}, hash=None, cmp=True, frozen=False, cache_hash=True
)
assert exc_args == e.value.args
def test_enforce_no_cached_hash_without_init(self):
"""
Ensure exception is thrown if caching the hash code is requested
but attrs is not requested to generate `__init__`.
"""
exc_args = (
"Invalid value for cache_hash. To use hash caching,"
" init must be True.",
)
with pytest.raises(TypeError) as e:
make_class("C", {}, init=False, hash=True, cache_hash=True)
assert exc_args == e.value.args
@given(booleans(), booleans())
def test_hash_attribute(self, slots, cache_hash):
"""
If `hash` is False on an attribute, ignore that attribute.
"""
@ -293,6 +337,7 @@ class TestAddHash(object):
{"a": attr.ib(hash=False), "b": attr.ib()},
slots=slots,
hash=True,
cache_hash=cache_hash,
)
assert hash(C(1, 2)) == hash(C(2, 2))
@ -331,12 +376,26 @@ class TestAddHash(object):
assert C(1) != C(1)
assert hash(C(1)) != hash(C(1))
@pytest.mark.parametrize("cls", [HashC, HashCSlots])
@pytest.mark.parametrize(
"cls",
[
HashC,
HashCSlots,
HashCCached,
HashCSlotsCached,
HashCFrozenNotSlotsCached,
],
)
def test_hash_works(self, cls):
"""
__hash__ returns different hashes for different values.
"""
assert hash(cls(1, 2)) != hash(cls(1, 1))
a = cls(1, 2)
b = cls(1, 1)
assert hash(a) != hash(b)
# perform the test again to test the pre-cached path through
# __hash__ for the cached-hash versions
assert hash(a) != hash(b)
def test_hash_default(self):
"""
@ -352,6 +411,48 @@ class TestAddHash(object):
"unhashable type: 'C'", # CPython
)
def test_cache_hashing(self):
"""
Ensure that hash computation if cached if and only if requested
"""
class HashCounter:
"""
A class for testing which counts how many times its hash
has been requested
"""
def __init__(self):
self.times_hash_called = 0
def __hash__(self):
self.times_hash_called += 1
return 12345
Uncached = make_class(
"Uncached",
{"hash_counter": attr.ib(factory=HashCounter)},
hash=True,
cache_hash=False,
)
Cached = make_class(
"Cached",
{"hash_counter": attr.ib(factory=HashCounter)},
hash=True,
cache_hash=True,
)
uncached_instance = Uncached()
cached_instance = Cached()
hash(uncached_instance)
hash(uncached_instance)
hash(cached_instance)
hash(cached_instance)
assert 2 == uncached_instance.hash_counter.times_hash_called
assert 1 == cached_instance.hash_counter.times_hash_called
class TestAddInit(object):
"""

View File

@ -1368,7 +1368,7 @@ class TestClassBuilder(object):
class C(object):
pass
b = _ClassBuilder(C, None, True, True, False, False)
b = _ClassBuilder(C, None, True, True, False, False, False)
assert "<_ClassBuilder(cls=C)>" == repr(b)
@ -1380,7 +1380,7 @@ class TestClassBuilder(object):
class C(object):
x = attr.ib()
b = _ClassBuilder(C, None, True, True, False, False)
b = _ClassBuilder(C, None, True, True, False, False, False)
cls = (
b.add_cmp()
@ -1443,6 +1443,7 @@ class TestClassBuilder(object):
frozen=False,
auto_attribs=False,
kw_only=False,
cache_hash=False,
)
b._cls = {} # no __module__; no __qualname__

View File

@ -9,7 +9,13 @@ from attr._make import NOTHING, make_class
def simple_class(
cmp=False, repr=False, hash=False, str=False, slots=False, frozen=False
cmp=False,
repr=False,
hash=False,
str=False,
slots=False,
frozen=False,
cache_hash=False,
):
"""
Return a new simple class.
@ -24,6 +30,7 @@ def simple_class(
slots=slots,
str=str,
frozen=frozen,
cache_hash=cache_hash,
)