From d2be76c964aea3b0648d547c930f9633f577b257 Mon Sep 17 00:00:00 2001 From: Ryan Gabbard Date: Sat, 2 Feb 2019 08:32:01 -0500 Subject: [PATCH] Clear cache hash on de-serialization (#489) * Don't cache hash codes across deserialization. Because the hash code cache field gets serialized and deserialized by Pickle, previously when you deserialize a cache_hash=True attrs object, the hashcode will be the hashcode the object had at serialization-time. However, if your object had fields with hash codes which were not deterministic between interpreter runs, then on a new interpreter run your deserialized object would have a hash code which differs from a newly created identical object. This commit fixes that by clearing the cache on deserialization. It needs to override the __setstate__ method to do so, so this commit also forbids using a custom __setstate__ on a cache_hash=True object. Closes #482 . * Improve exception type * Minor tweaks to comments formatting * Fix test for Python 2.7 * Fix error in comment * Make nomenclature consistent for slotted/dict classes * Remove hasattr() * Improve comments and error message for custom __setstate__ + cache_hash=True * Drop use of for test classes for cache hash serialization * Make changelog note this is a breaking change * Make exception message point to tracking issue * Fix test classes for Python 2.7 --- changelog.d/482.breaking.rst | 4 ++ src/attr/_make.py | 29 ++++++++++++++ tests/test_dunders.py | 78 ++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+) create mode 100644 changelog.d/482.breaking.rst diff --git a/changelog.d/482.breaking.rst b/changelog.d/482.breaking.rst new file mode 100644 index 000000000..0eb5cb4fc --- /dev/null +++ b/changelog.d/482.breaking.rst @@ -0,0 +1,4 @@ +Fixed a bug where deserialized objects with ``cache_hash=True`` could have incorrect hash code values. +This change breaks classes with ``cache_hash=True`` when a custom ``__setstate__`` is present. +An exception will be thrown when applying the ``attrs`` annotation to such a class. +This limitation is tracked in issue https://github.com/python-attrs/attrs/issues/494 . diff --git a/src/attr/_make.py b/src/attr/_make.py index 6d72df4af..1373536af 100644 --- a/src/attr/_make.py +++ b/src/attr/_make.py @@ -529,6 +529,26 @@ def _patch_original_class(self): for name, value in self._cls_dict.items(): setattr(cls, name, value) + # Attach __setstate__. This is necessary to clear the hash code + # cache on deserialization. See issue + # https://github.com/python-attrs/attrs/issues/482 . + # Note that this code only handles setstate for dict classes. + # For slotted classes, see similar code in _create_slots_class . + if self._cache_hash: + existing_set_state_method = getattr(cls, "__setstate__", None) + if existing_set_state_method: + raise NotImplementedError( + "Currently you cannot use hash caching if " + "you specify your own __setstate__ method." + "See https://github.com/python-attrs/attrs/issues/494 ." + ) + + def cache_hash_set_state(chss_self, _): + # clear hash code cache + setattr(chss_self, _hash_cache_field, None) + + setattr(cls, "__setstate__", cache_hash_set_state) + return cls def _create_slots_class(self): @@ -581,6 +601,8 @@ def slots_getstate(self): """ return tuple(getattr(self, name) for name in state_attr_names) + hash_caching_enabled = self._cache_hash + def slots_setstate(self, state): """ Automatically created by attrs. @@ -588,6 +610,13 @@ def slots_setstate(self, state): __bound_setattr = _obj_setattr.__get__(self, Attribute) for name, value in zip(state_attr_names, state): __bound_setattr(name, value) + # Clearing the hash code cache on deserialization is needed + # because hash codes can change from run to run. See issue + # https://github.com/python-attrs/attrs/issues/482 . + # Note that this code only handles setstate for slotted classes. + # For dict classes, see similar code in _patch_original_class . + if hash_caching_enabled: + __bound_setattr(_hash_cache_field, None) # slots and frozen require __getstate__/__setstate__ to work cd["__getstate__"] = slots_getstate diff --git a/tests/test_dunders.py b/tests/test_dunders.py index 59d5764a1..468a56982 100644 --- a/tests/test_dunders.py +++ b/tests/test_dunders.py @@ -5,6 +5,7 @@ from __future__ import absolute_import, division, print_function import copy +import pickle import pytest @@ -453,6 +454,83 @@ def __hash__(self): assert 2 == uncached_instance.hash_counter.times_hash_called assert 1 == cached_instance.hash_counter.times_hash_called + def test_cache_hash_serialization(self): + """ + Tests that the hash cache is cleared on deserialization to fix + https://github.com/python-attrs/attrs/issues/482 . + """ + + # First, check that our fix didn't break serialization without + # hash caching. + # We don't care about the result of this; we just want to make sure we + # can do it without exceptions. + hash(pickle.loads(pickle.dumps(HashCacheSerializationTestUncached))) + + def assert_hash_code_not_cached_across_serialization(original): + # Now check our fix for #482 for when hash caching is enabled. + original_hash = hash(original) + round_tripped = pickle.loads(pickle.dumps(original)) + # What we want to guard against is having a stale hash code + # when a field's hash code differs in a new interpreter after + # deserialization. This is tricky to test because we are, + # of course, still running in the same interpreter. So + # after deserialization we reach in and change the value of + # a field to simulate the field changing its hash code. We then + # check that the object's hash code changes, indicating that we + # don't have a stale hash code. + # This could fail in two ways: (1) pickle.loads could get the hash + # code of the deserialized value (triggering it to cache) before + # we alter the field value. This doesn't happen in our tested + # Python versions. (2) "foo" and "something different" could + # have a hash collision on this interpreter run. But this is + # extremely improbable and would just result in one buggy test run. + round_tripped.foo_string = "something different" + assert original_hash != hash(round_tripped) + + # Slots and non-slots classes implement __setstate__ differently, + # so we need to test both cases. + assert_hash_code_not_cached_across_serialization( + HashCacheSerializationTestCached() + ) + assert_hash_code_not_cached_across_serialization( + HashCacheSerializationTestCachedSlots() + ) + + # Test that a custom __setstate__ is disallowed on a + # cache_hash=True object. + # This is needed because we handle clearing the cache after + # deserialization with a custom __setstate__. It is possible + # to make both work, but it requires some thought about how to go + # about it, so it has not yet been implemented. + with pytest.raises( + NotImplementedError, + message="Currently you cannot use hash caching if you " + "specify your own __setstate__ method. This is tracked" + "by https://github.com/python-attrs/attrs/issues/494", + ): + + @attr.attrs(hash=True, cache_hash=True) + class NoCacheHashAndCustomSetState(object): + def __setstate__(self, state): + pass + + +# these are for use in TestAddHash.test_cache_hash_serialization +# they need to be out here so they can be un-pickled +@attr.attrs(hash=True, cache_hash=False) +class HashCacheSerializationTestUncached(object): + foo_string = attr.ib(default="foo") + + +@attr.attrs(hash=True, cache_hash=True) +class HashCacheSerializationTestCached(object): + foo_string = attr.ib(default="foo") + + +@attr.attrs(slots=True, hash=True, cache_hash=True) +class HashCacheSerializationTestCachedSlots(object): + foo_string = attr.ib(default="foo") + class TestAddInit(object): """