From 3e0ecbd891532e8f85f90ebbbd56da78cedb9688 Mon Sep 17 00:00:00 2001 From: Ryan Gabbard Date: Mon, 20 Aug 2018 00:46:52 -0400 Subject: [PATCH] Cache hash codes (#426) * First stab at implementing hashcode caching (#423) Currently all existing tests pass but no cache_hash tests have yet been added. * Existing hash tests now pass on cache_hash classes * Add towncrier change log * Add documentation for cache_hash * Fixes bug with check that init=True if cache_hash=True * Fix long lines * Fix documentation issues * Add test for cache_hash requiring init * Improve test coverage * Remove now unnecessary 'pass' * Add periods to the end of exception strings * Add test docstrings for cache_hash tests * Clarify documentation of cache_hash * Recommend that hashable classes be frozen * Fix test references for exception messages --- changelog.d/425.change.rst | 1 + docs/api.rst | 2 +- docs/hashing.rst | 19 +++++- docs/init.rst | 1 + src/attr/__init__.pyi | 3 + src/attr/_make.py | 128 +++++++++++++++++++++++++++++++------ tests/test_dunders.py | 111 ++++++++++++++++++++++++++++++-- tests/test_make.py | 5 +- tests/utils.py | 9 ++- 9 files changed, 248 insertions(+), 31 deletions(-) create mode 100644 changelog.d/425.change.rst diff --git a/changelog.d/425.change.rst b/changelog.d/425.change.rst new file mode 100644 index 00000000..c23168c8 --- /dev/null +++ b/changelog.d/425.change.rst @@ -0,0 +1 @@ +Added ``cache_hash`` option to ``@attr.s`` which causes the hash code to be computed once and stored on the object. diff --git a/docs/api.rst b/docs/api.rst index ba3bc18f..cfda359b 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -18,7 +18,7 @@ What follows is the API explanation, if you'd like a more hands-on introduction, Core ---- -.. autofunction:: attr.s(these=None, repr_ns=None, repr=True, cmp=True, hash=None, init=True, slots=False, frozen=False, str=False, auto_attribs=False) +.. autofunction:: attr.s(these=None, repr_ns=None, repr=True, cmp=True, hash=None, init=True, slots=False, frozen=False, str=False, auto_attribs=False, cache_hash=False) .. note:: diff --git a/docs/hashing.rst b/docs/hashing.rst index 8bfd0413..ba18cc83 100644 --- a/docs/hashing.rst +++ b/docs/hashing.rst @@ -1,6 +1,9 @@ Hashing ======= +Hash Method Generation +---------------------- + .. warning:: The overarching theme is to never set the ``@attr.s(hash=X)`` parameter yourself. @@ -18,7 +21,7 @@ The *hash* of an object is an integer that represents the contents of an object. It can be obtained by calling :func:`hash` on an object and is implemented by writing a ``__hash__`` method for your class. ``attrs`` will happily write a ``__hash__`` method you [#fn1]_, however it will *not* do so by default. -Because according to the definition_ from the official Python docs, the returned hash has to fullfil certain constraints: +Because according to the definition_ from the official Python docs, the returned hash has to fulfill certain constraints: #. Two objects that are equal, **must** have the same hash. This means that if ``x == y``, it *must* follow that ``hash(x) == hash(y)``. @@ -50,6 +53,20 @@ Because according to the definition_ from the official Python docs, the returned For a more thorough explanation of this topic, please refer to this blog post: `Python Hashes and Equality`_. +Hashing and Mutability +---------------------- +Changing any field involved in hash code computation after the first call to `__hash__` (typically this would be after its insertion into a hash-based collection) can result in silent bugs. +Therefore, it is strongly recommended that hashable classes be ``frozen``. + + +Hash Code Caching +----------------- + +Some objects have hash codes which are expensive to compute. +If such objects are to be stored in hash-based collections, it can be useful to compute the hash codes only once and then store the result on the object to make future hash code requests fast. +To enable caching of hash codes, pass ``cache_hash=True`` to ``@attrs``. +This may only be done if ``attrs`` is already generating a hash function for the object. + .. [#fn1] The hash is computed by hashing a tuple that consists of an unique id for the class plus all attribute values. .. _definition: https://docs.python.org/3/glossary.html#term-hashable diff --git a/docs/init.rst b/docs/init.rst index e2cdd039..41a86ca7 100644 --- a/docs/init.rst +++ b/docs/init.rst @@ -348,6 +348,7 @@ If you need to set attributes on a frozen class, you'll have to resort to the :r >>> Frozen(1) Frozen(x=1, y=2) +Note that you *must not* access the hash code of the object in ``__attrs_post__init__`` if ``cache_hash=True``. .. _`Wiki page`: https://github.com/python-attrs/attrs/wiki/Extensions-to-attrs .. _`get confused`: https://github.com/python-attrs/attrs/issues/289 diff --git a/src/attr/__init__.pyi b/src/attr/__init__.pyi index 34376a2c..0c2b346c 100644 --- a/src/attr/__init__.pyi +++ b/src/attr/__init__.pyi @@ -167,6 +167,7 @@ def attrs( str: bool = ..., auto_attribs: bool = ..., kw_only: bool = ..., + cache_hash: bool = ..., ) -> _C: ... @overload def attrs( @@ -182,6 +183,7 @@ def attrs( str: bool = ..., auto_attribs: bool = ..., kw_only: bool = ..., + cache_hash: bool = ..., ) -> Callable[[_C], _C]: ... # TODO: add support for returning NamedTuple from the mypy plugin @@ -208,6 +210,7 @@ def make_class( str: bool = ..., auto_attribs: bool = ..., kw_only: bool = ..., + cache_hash: bool = ..., ) -> type: ... # _funcs -- diff --git a/src/attr/_make.py b/src/attr/_make.py index 71c0f23b..ecb668ba 100644 --- a/src/attr/_make.py +++ b/src/attr/_make.py @@ -33,6 +33,10 @@ _init_converter_pat = "__attr_converter_{}" _init_factory_pat = "__attr_factory_{}" _tuple_property_pat = " {attr_name} = property(itemgetter({index}))" _classvar_prefixes = ("typing.ClassVar", "t.ClassVar", "ClassVar") +# we don't use a double-underscore prefix because that triggers +# name mangling when trying to create a slot for the field +# (when slots=True) +_hash_cache_field = "_attrs_cached_hash" _empty_metadata_singleton = metadata_proxy({}) @@ -446,12 +450,15 @@ class _ClassBuilder(object): "_attr_names", "_slots", "_frozen", + "_cache_hash", "_has_post_init", "_delete_attribs", "_super_attr_map", ) - def __init__(self, cls, these, slots, frozen, auto_attribs, kw_only): + def __init__( + self, cls, these, slots, frozen, auto_attribs, kw_only, cache_hash + ): attrs, super_attrs, super_map = _transform_attrs( cls, these, auto_attribs, kw_only ) @@ -464,6 +471,7 @@ class _ClassBuilder(object): self._attr_names = tuple(a.name for a in attrs) self._slots = slots self._frozen = frozen or _has_frozen_superclass(cls) + self._cache_hash = cache_hash self._has_post_init = bool(getattr(cls, "__attrs_post_init__", False)) self._delete_attribs = not bool(these) @@ -522,9 +530,12 @@ class _ClassBuilder(object): # We only add the names of attributes that aren't inherited. # Settings __slots__ to inherited attributes wastes memory. - cd["__slots__"] = tuple( + slot_names = [ name for name in self._attr_names if name not in super_names - ) + ] + if self._cache_hash: + slot_names.append(_hash_cache_field) + cd["__slots__"] = tuple(slot_names) qualname = getattr(self._cls, "__qualname__", None) if qualname is not None: @@ -603,7 +614,9 @@ class _ClassBuilder(object): def add_hash(self): self._cls_dict["__hash__"] = self._add_method_dunders( - _make_hash(self._attrs) + _make_hash( + self._attrs, frozen=self._frozen, cache_hash=self._cache_hash + ) ) return self @@ -615,6 +628,7 @@ class _ClassBuilder(object): self._has_post_init, self._frozen, self._slots, + self._cache_hash, self._super_attr_map, ) ) @@ -664,6 +678,7 @@ def attrs( str=False, auto_attribs=False, kw_only=False, + cache_hash=False, ): r""" A class decorator that adds `dunder @@ -764,6 +779,12 @@ def attrs( :param bool kw_only: Make all attributes keyword-only (Python 3+) in the generated ``__init__`` (if ``init`` is ``False``, this parameter is ignored). + :param bool cache_hash: Ensure that the object's hash code is computed + only once and stored on the object. If this is set to ``True``, + hashing must be either explicitly or implicitly enabled for this + class. If the hash code is cached, then no attributes of this + class which participate in hash code computation may be mutated + after object creation. .. versionadded:: 16.0.0 *slots* @@ -782,6 +803,7 @@ def attrs( each other. ``__eq`` and ``__ne__`` never tried to compared subclasses to each other. .. versionadded:: 18.2.0 *kw_only* + .. versionadded:: 18.2.0 *cache_hash* """ def wrap(cls): @@ -789,7 +811,7 @@ def attrs( raise TypeError("attrs only works with new-style classes.") builder = _ClassBuilder( - cls, these, slots, frozen, auto_attribs, kw_only + cls, these, slots, frozen, auto_attribs, kw_only, cache_hash ) if repr is True: @@ -805,14 +827,31 @@ def attrs( "Invalid value for hash. Must be True, False, or None." ) elif hash is False or (hash is None and cmp is False): - pass + if cache_hash: + raise TypeError( + "Invalid value for cache_hash. To use hash caching," + " hashing must be either explicitly or implicitly " + "enabled." + ) elif hash is True or (hash is None and cmp is True and frozen is True): builder.add_hash() else: + if cache_hash: + raise TypeError( + "Invalid value for cache_hash. To use hash caching," + " hashing must be either explicitly or implicitly " + "enabled." + ) builder.make_unhashable() if init is True: builder.add_init() + else: + if cache_hash: + raise TypeError( + "Invalid value for cache_hash. To use hash caching," + " init must be True." + ) return builder.build_class() @@ -862,29 +901,54 @@ def _attrs_to_tuple(obj, attrs): return tuple(getattr(obj, a.name) for a in attrs) -def _make_hash(attrs): +def _make_hash(attrs, frozen, cache_hash): attrs = tuple( a for a in attrs if a.hash is True or (a.hash is None and a.cmp is True) ) + tab = " " + # We cache the generated hash methods for the same kinds of attributes. sha1 = hashlib.sha1() sha1.update(repr(attrs).encode("utf-8")) unique_filename = "" % (sha1.hexdigest(),) type_hash = hash(unique_filename) - lines = [ - "def __hash__(self):", - " return hash((", - " %d," % (type_hash,), - ] - for a in attrs: - lines.append(" self.%s," % (a.name)) - lines.append(" ))") + method_lines = ["def __hash__(self):"] - script = "\n".join(lines) + def append_hash_computation_lines(prefix, indent): + """ + Generate the code for actually computing the hash code. + Below this will either be returned directly or used to compute + a value which is then cached, depending on the value of cache_hash + """ + method_lines.extend( + [indent + prefix + "hash((", indent + " %d," % (type_hash,)] + ) + + for a in attrs: + method_lines.append(indent + " self.%s," % a.name) + + method_lines.append(indent + " ))") + + if cache_hash: + method_lines.append(tab + "if self.%s is None:" % _hash_cache_field) + if frozen: + append_hash_computation_lines( + "object.__setattr__(self, '%s', " % _hash_cache_field, tab * 2 + ) + method_lines.append(tab * 2 + ")") # close __setattr__ + else: + append_hash_computation_lines( + "self.%s = " % _hash_cache_field, tab * 2 + ) + method_lines.append(tab + "return self.%s" % _hash_cache_field) + else: + append_hash_computation_lines("return ", tab) + + script = "\n".join(method_lines) globs = {} locs = {} bytecode = compile(script, unique_filename, "exec") @@ -906,7 +970,7 @@ def _add_hash(cls, attrs): """ Add a hash method to *cls*. """ - cls.__hash__ = _make_hash(attrs) + cls.__hash__ = _make_hash(attrs, frozen=False, cache_hash=False) return cls @@ -1108,7 +1172,7 @@ def _add_repr(cls, ns=None, attrs=None): return cls -def _make_init(attrs, post_init, frozen, slots, super_attr_map): +def _make_init(attrs, post_init, frozen, slots, cache_hash, super_attr_map): attrs = [a for a in attrs if a.init or a.default is not NOTHING] # We cache the generated init methods for the same kinds of attributes. @@ -1117,7 +1181,7 @@ def _make_init(attrs, post_init, frozen, slots, super_attr_map): unique_filename = "".format(sha1.hexdigest()) script, globs, annotations = _attrs_to_init_script( - attrs, frozen, slots, post_init, super_attr_map + attrs, frozen, slots, post_init, cache_hash, super_attr_map ) locs = {} bytecode = compile(script, unique_filename, "exec") @@ -1152,7 +1216,8 @@ def _add_init(cls, frozen): getattr(cls, "__attrs_post_init__", False), frozen, _is_slot_cls(cls), - {}, + cache_hash=False, + super_attr_map={}, ) return cls @@ -1241,7 +1306,9 @@ def _is_slot_attr(a_name, super_attr_map): return a_name in super_attr_map and _is_slot_cls(super_attr_map[a_name]) -def _attrs_to_init_script(attrs, frozen, slots, post_init, super_attr_map): +def _attrs_to_init_script( + attrs, frozen, slots, post_init, cache_hash, super_attr_map +): """ Return a script of an initializer for *attrs* and a dict of globals. @@ -1259,6 +1326,7 @@ def _attrs_to_init_script(attrs, frozen, slots, post_init, super_attr_map): lines.append( # Circumvent the __setattr__ descriptor to save one lookup per # assignment. + # Note _setattr will be used again below if cache_hash is True "_setattr = _cached_setattr.__get__(self, self.__class__)" ) @@ -1280,6 +1348,7 @@ def _attrs_to_init_script(attrs, frozen, slots, post_init, super_attr_map): # Dict frozen classes assign directly to __dict__. # But only if the attribute doesn't come from an ancestor slot # class. + # Note _inst_dict will be used again below if cache_hash is True lines.append("_inst_dict = self.__dict__") if any_slot_ancestors: lines.append( @@ -1470,6 +1539,23 @@ def _attrs_to_init_script(attrs, frozen, slots, post_init, super_attr_map): if post_init: lines.append("self.__attrs_post_init__()") + # because this is set only after __attrs_post_init is called, a crash + # will result if post-init tries to access the hash code. This seemed + # preferable to setting this beforehand, in which case alteration to + # field values during post-init combined with post-init accessing the + # hash code would result in silent bugs. + if cache_hash: + if frozen: + if slots: + # if frozen and slots, then _setattr defined above + init_hash_cache = "_setattr('%s', %s)" + else: + # if frozen and not slots, then _inst_dict defined above + init_hash_cache = "_inst_dict['%s'] = %s" + else: + init_hash_cache = "self.%s = %s" + lines.append(init_hash_cache % (_hash_cache_field, "None")) + args = ", ".join(args) if kw_only_args: if PY2: diff --git a/tests/test_dunders.py b/tests/test_dunders.py index 25690611..59d5764a 100644 --- a/tests/test_dunders.py +++ b/tests/test_dunders.py @@ -33,9 +33,19 @@ ReprC = simple_class(repr=True) ReprCSlots = simple_class(repr=True, slots=True) # HashC is hashable by explicit definition while HashCSlots is hashable -# implicitly. +# implicitly. The "Cached" versions are the same, except with hash code +# caching enabled HashC = simple_class(hash=True) HashCSlots = simple_class(hash=None, cmp=True, frozen=True, slots=True) +HashCCached = simple_class(hash=True, cache_hash=True) +HashCSlotsCached = simple_class( + hash=None, cmp=True, frozen=True, slots=True, cache_hash=True +) +# the cached hash code is stored slightly differently in this case +# so it needs to be tested separately +HashCFrozenNotSlotsCached = simple_class( + frozen=True, slots=False, hash=True, cache_hash=True +) class InitC(object): @@ -283,8 +293,42 @@ class TestAddHash(object): assert exc_args == e.value.args - @given(booleans()) - def test_hash_attribute(self, slots): + def test_enforce_no_cache_hash_without_hash(self): + """ + Ensure exception is thrown if caching the hash code is requested + but attrs is not requested to generate `__hash__`. + """ + exc_args = ( + "Invalid value for cache_hash. To use hash caching," + " hashing must be either explicitly or implicitly " + "enabled.", + ) + with pytest.raises(TypeError) as e: + make_class("C", {}, hash=False, cache_hash=True) + assert exc_args == e.value.args + + # unhashable case + with pytest.raises(TypeError) as e: + make_class( + "C", {}, hash=None, cmp=True, frozen=False, cache_hash=True + ) + assert exc_args == e.value.args + + def test_enforce_no_cached_hash_without_init(self): + """ + Ensure exception is thrown if caching the hash code is requested + but attrs is not requested to generate `__init__`. + """ + exc_args = ( + "Invalid value for cache_hash. To use hash caching," + " init must be True.", + ) + with pytest.raises(TypeError) as e: + make_class("C", {}, init=False, hash=True, cache_hash=True) + assert exc_args == e.value.args + + @given(booleans(), booleans()) + def test_hash_attribute(self, slots, cache_hash): """ If `hash` is False on an attribute, ignore that attribute. """ @@ -293,6 +337,7 @@ class TestAddHash(object): {"a": attr.ib(hash=False), "b": attr.ib()}, slots=slots, hash=True, + cache_hash=cache_hash, ) assert hash(C(1, 2)) == hash(C(2, 2)) @@ -331,12 +376,26 @@ class TestAddHash(object): assert C(1) != C(1) assert hash(C(1)) != hash(C(1)) - @pytest.mark.parametrize("cls", [HashC, HashCSlots]) + @pytest.mark.parametrize( + "cls", + [ + HashC, + HashCSlots, + HashCCached, + HashCSlotsCached, + HashCFrozenNotSlotsCached, + ], + ) def test_hash_works(self, cls): """ __hash__ returns different hashes for different values. """ - assert hash(cls(1, 2)) != hash(cls(1, 1)) + a = cls(1, 2) + b = cls(1, 1) + assert hash(a) != hash(b) + # perform the test again to test the pre-cached path through + # __hash__ for the cached-hash versions + assert hash(a) != hash(b) def test_hash_default(self): """ @@ -352,6 +411,48 @@ class TestAddHash(object): "unhashable type: 'C'", # CPython ) + def test_cache_hashing(self): + """ + Ensure that hash computation if cached if and only if requested + """ + + class HashCounter: + """ + A class for testing which counts how many times its hash + has been requested + """ + + def __init__(self): + self.times_hash_called = 0 + + def __hash__(self): + self.times_hash_called += 1 + return 12345 + + Uncached = make_class( + "Uncached", + {"hash_counter": attr.ib(factory=HashCounter)}, + hash=True, + cache_hash=False, + ) + Cached = make_class( + "Cached", + {"hash_counter": attr.ib(factory=HashCounter)}, + hash=True, + cache_hash=True, + ) + + uncached_instance = Uncached() + cached_instance = Cached() + + hash(uncached_instance) + hash(uncached_instance) + hash(cached_instance) + hash(cached_instance) + + assert 2 == uncached_instance.hash_counter.times_hash_called + assert 1 == cached_instance.hash_counter.times_hash_called + class TestAddInit(object): """ diff --git a/tests/test_make.py b/tests/test_make.py index 0302cb5f..5e7565b6 100644 --- a/tests/test_make.py +++ b/tests/test_make.py @@ -1368,7 +1368,7 @@ class TestClassBuilder(object): class C(object): pass - b = _ClassBuilder(C, None, True, True, False, False) + b = _ClassBuilder(C, None, True, True, False, False, False) assert "<_ClassBuilder(cls=C)>" == repr(b) @@ -1380,7 +1380,7 @@ class TestClassBuilder(object): class C(object): x = attr.ib() - b = _ClassBuilder(C, None, True, True, False, False) + b = _ClassBuilder(C, None, True, True, False, False, False) cls = ( b.add_cmp() @@ -1443,6 +1443,7 @@ class TestClassBuilder(object): frozen=False, auto_attribs=False, kw_only=False, + cache_hash=False, ) b._cls = {} # no __module__; no __qualname__ diff --git a/tests/utils.py b/tests/utils.py index 230726c9..5a5cf073 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -9,7 +9,13 @@ from attr._make import NOTHING, make_class def simple_class( - cmp=False, repr=False, hash=False, str=False, slots=False, frozen=False + cmp=False, + repr=False, + hash=False, + str=False, + slots=False, + frozen=False, + cache_hash=False, ): """ Return a new simple class. @@ -24,6 +30,7 @@ def simple_class( slots=slots, str=str, frozen=frozen, + cache_hash=cache_hash, )