Skip to content

Commit

Permalink
Clear cache hash on de-serialization (python-attrs#489)
Browse files Browse the repository at this point in the history
* Don't cache hash codes across deserialization.

 Because the hash code cache field gets serialized and deserialized by
 Pickle, previously when you deserialize a cache_hash=True attrs object, the
 hashcode will be the hashcode the object had at serialization-time.
 However, if your object had fields with hash codes which were not
 deterministic between interpreter runs, then on a new interpreter run
 your deserialized object would have a hash code which differs from a
 newly created identical object.

 This commit fixes that by clearing the cache on deserialization. It
 needs to override the __setstate__ method to do so, so this commit
 also forbids using a custom __setstate__ on a cache_hash=True object.

Closes python-attrs#482 .

* Improve exception type

* Minor tweaks to comments formatting

* Fix test for Python 2.7

* Fix error in comment

* Make nomenclature consistent for slotted/dict classes

* Remove hasattr()

* Improve comments and error message for custom __setstate__ + cache_hash=True

* Drop use of ``attr.dataclass`` for test classes for cache hash serialization

* Make changelog note this is a breaking change

* Make exception message point to tracking issue

* Fix test classes for Python 2.7
  • Loading branch information
Ryan Gabbard authored and hynek committed Feb 2, 2019
1 parent 1a90857 commit d2be76c
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 0 deletions.
4 changes: 4 additions & 0 deletions changelog.d/482.breaking.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fixed a bug where deserialized objects with ``cache_hash=True`` could have incorrect hash code values.
This change breaks classes with ``cache_hash=True`` when a custom ``__setstate__`` is present.
An exception will be raised when applying the ``attrs`` annotation to such a class.
This limitation is tracked in issue https://github.com/python-attrs/attrs/issues/494 .
29 changes: 29 additions & 0 deletions src/attr/_make.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,26 @@ def _patch_original_class(self):
for name, value in self._cls_dict.items():
setattr(cls, name, value)

# Attach __setstate__. This is necessary to clear the hash code
# cache on deserialization. See issue
# https://github.com/python-attrs/attrs/issues/482 .
# Note that this code only handles setstate for dict classes.
# For slotted classes, see similar code in _create_slots_class .
if self._cache_hash:
existing_set_state_method = getattr(cls, "__setstate__", None)
if existing_set_state_method:
raise NotImplementedError(
"Currently you cannot use hash caching if "
"you specify your own __setstate__ method."
"See https://github.com/python-attrs/attrs/issues/494 ."
)

def cache_hash_set_state(chss_self, _):
# clear hash code cache
setattr(chss_self, _hash_cache_field, None)

setattr(cls, "__setstate__", cache_hash_set_state)

return cls

def _create_slots_class(self):
Expand Down Expand Up @@ -581,13 +601,22 @@ def slots_getstate(self):
"""
return tuple(getattr(self, name) for name in state_attr_names)

hash_caching_enabled = self._cache_hash

def slots_setstate(self, state):
"""
Automatically created by attrs.
"""
__bound_setattr = _obj_setattr.__get__(self, Attribute)
for name, value in zip(state_attr_names, state):
__bound_setattr(name, value)
# Clearing the hash code cache on deserialization is needed
# because hash codes can change from run to run. See issue
# https://github.com/python-attrs/attrs/issues/482 .
# Note that this code only handles setstate for slotted classes.
# For dict classes, see similar code in _patch_original_class .
if hash_caching_enabled:
__bound_setattr(_hash_cache_field, None)

# slots and frozen require __getstate__/__setstate__ to work
cd["__getstate__"] = slots_getstate
Expand Down
78 changes: 78 additions & 0 deletions tests/test_dunders.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import absolute_import, division, print_function

import copy
import pickle

import pytest

Expand Down Expand Up @@ -453,6 +454,83 @@ def __hash__(self):
assert 2 == uncached_instance.hash_counter.times_hash_called
assert 1 == cached_instance.hash_counter.times_hash_called

def test_cache_hash_serialization(self):
    """
    The hash code cache is cleared on deserialization so that a stale
    cached hash is never reused after a pickle round-trip.

    Regression test for
    https://github.com/python-attrs/attrs/issues/482 .
    """
    # First, check that our fix didn't break serialization without
    # hash caching.  We only care that this round-trip does not raise.
    # FIX: the original pickled the *class* object instead of an
    # instance, which exercised nothing about the generated __hash__.
    hash(pickle.loads(pickle.dumps(HashCacheSerializationTestUncached())))

    def assert_hash_code_not_cached_across_serialization(original):
        # Check the fix for #482 when hash caching is enabled.
        original_hash = hash(original)
        round_tripped = pickle.loads(pickle.dumps(original))
        # What we want to guard against is having a stale hash code
        # when a field's hash code differs in a new interpreter after
        # deserialization.  This is tricky to test because we are,
        # of course, still running in the same interpreter.  So
        # after deserialization we reach in and change the value of
        # a field to simulate the field changing its hash code.  We then
        # check that the object's hash code changes, indicating that we
        # don't have a stale hash code.
        # This could fail in two ways: (1) pickle.loads could get the
        # hash code of the deserialized value (triggering it to cache)
        # before we alter the field value.  This doesn't happen in our
        # tested Python versions.  (2) "foo" and "something different"
        # could have a hash collision on this interpreter run.  But this
        # is extremely improbable and would just result in one buggy
        # test run.
        round_tripped.foo_string = "something different"
        assert original_hash != hash(round_tripped)

    # Slotted and dict classes implement __setstate__ differently,
    # so we need to test both cases.
    assert_hash_code_not_cached_across_serialization(
        HashCacheSerializationTestCached()
    )
    assert_hash_code_not_cached_across_serialization(
        HashCacheSerializationTestCachedSlots()
    )

    # A custom __setstate__ is disallowed on a cache_hash=True class
    # because attrs itself needs __setstate__ to clear the cache after
    # deserialization.  It is possible to make both work, but it
    # requires some thought about how to go about it, so it has not yet
    # been implemented.
    # FIX: pytest.raises(message=...) was a deprecated kwarg (removed
    # in pytest 5.0) that never asserted on the exception text; use
    # match= so the error message is actually checked.
    with pytest.raises(
        NotImplementedError,
        match="hash caching",
    ):

        @attr.attrs(hash=True, cache_hash=True)
        class NoCacheHashAndCustomSetState(object):
            def __setstate__(self, state):
                pass


# These classes are used by TestAddHash.test_cache_hash_serialization.
# They must live at module level so that pickle can locate them by
# qualified name when un-pickling.
@attr.s(hash=True, cache_hash=False)
class HashCacheSerializationTestUncached(object):
    """Baseline class without hash caching, for the pickle sanity check."""

    foo_string = attr.attrib(default="foo")


@attr.s(hash=True, cache_hash=True)
class HashCacheSerializationTestCached(object):
    """Dict (non-slotted) class with hash caching enabled."""

    foo_string = attr.attrib(default="foo")


@attr.s(slots=True, hash=True, cache_hash=True)
class HashCacheSerializationTestCachedSlots(object):
    """Slotted class with hash caching enabled."""

    foo_string = attr.attrib(default="foo")


class TestAddInit(object):
"""
Expand Down

0 comments on commit d2be76c

Please sign in to comment.