Use CPython hash algorithm for frozenset

The original hash algorithm just XOR'd all the hashes of the elements of
the set, which is problematic. The CPython algorithm is required to pass
the tests.

- Replace `PyFrozenSet::hash` with CPython's algorithm
- Remove unused `hash_iter_unorded` functions
- Add `frozenset` benchmark
- Enable tests
- Lower performance expectations on effectiveness test
- Adjust `slot::hash_wrapper` so that it doesn't rehash the computed
  hash value in the process of converting PyInt to PyHash.
This commit is contained in:
Daniel Chiquito
2024-02-09 21:02:40 -05:00
parent bdf228eb42
commit bf461cdebc
7 changed files with 41 additions and 28 deletions

View File

@@ -1838,8 +1838,6 @@ class TestCollectionABCs(ABCTestCase):
self.assertTrue(f1 != l1)
self.assertTrue(f1 != l2)
# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_Set_hash_matches_frozenset(self):
sets = [
{}, {1}, {None}, {-1}, {0.0}, {"abc"}, {1, 2, 3},

11
Lib/test/test_set.py vendored
View File

@@ -707,8 +707,6 @@ class TestFrozenSet(TestJointOps, unittest.TestCase):
f = self.thetype('abcdcda')
self.assertEqual(hash(f), hash(f))
# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_hash_effectiveness(self):
n = 13
hashvalues = set()
@@ -730,7 +728,14 @@ class TestFrozenSet(TestJointOps, unittest.TestCase):
for i in range(len(s)+1):
yield from map(frozenset, itertools.combinations(s, i))
for n in range(18):
# TODO the original test has:
# for n in range(18):
# Due to general performance overhead, hashing a frozenset takes
# about 50 times longer than in CPython. This test amplifies that
# exponentially, so the best we can do here reasonably is 13.
# Even if the internal hash function did nothing, it would still be
# about 40 times slower than CPython.
for n in range(13):
t = 2 ** n
mask = t - 1
for nums in (range, zf_range):

View File

@@ -0,0 +1,5 @@
fs = frozenset(range(0, ITERATIONS))
# ---
hash(fs)

View File

@@ -130,20 +130,6 @@ pub fn hash_float(value: f64) -> Option<PyHash> {
Some(fix_sentinel(x as PyHash * value.signum() as PyHash))
}
pub fn hash_iter_unordered<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result<PyHash, E>
where
I: IntoIterator<Item = &'a T>,
F: Fn(&'a T) -> Result<PyHash, E>,
{
let mut hash: PyHash = 0;
for element in iter {
let item_hash = hashf(element)?;
// xor is commutative and hash should be independent of order
hash ^= item_hash;
}
Ok(fix_sentinel(mod_int(hash)))
}
pub fn hash_bigint(value: &BigInt) -> PyHash {
let ret = match value.to_i64() {
Some(i) => mod_int(i),

View File

@@ -434,7 +434,28 @@ impl PySetInner {
}
fn hash(&self, vm: &VirtualMachine) -> PyResult<PyHash> {
crate::utils::hash_iter_unordered(self.elements().iter(), vm)
// Work to increase the bit dispersion for closely spaced hash values.
// This is important because some use cases have many combinations of a
// small number of elements with nearby hashes so that many distinct
// combinations collapse to only a handful of distinct hash values.
fn _shuffle_bits(h: u64) -> u64 {
((h ^ 89869747) ^ (h.wrapping_shl(16))).wrapping_mul(3644798167)
}
// Factor in the number of active entries
let mut hash: u64 = (self.elements().len() as u64 + 1).wrapping_mul(1927868237);
// Xor-in shuffled bits from every entry's hash field because xor is
// commutative and a frozenset hash should be independent of order.
for element in self.elements().iter() {
hash ^= _shuffle_bits(element.hash(vm)? as u64);
}
// Disperse patterns arising in nested frozensets
hash ^= (hash >> 11) ^ (hash >> 25);
hash = hash.wrapping_mul(69069).wrapping_add(907133923);
// -1 is reserved as an error code
if hash == u64::MAX {
hash = 590923713;
}
Ok(hash as PyHash)
}
// Run operation, on failure, if item is a set/set subclass, convert it

View File

@@ -15,6 +15,7 @@ use crate::{
AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, VirtualMachine,
};
use crossbeam_utils::atomic::AtomicCell;
use malachite_bigint::BigInt;
use num_traits::{Signed, ToPrimitive};
use std::{borrow::Borrow, cmp::Ordering, ops::Deref};
@@ -254,7 +255,11 @@ fn hash_wrapper(zelf: &PyObject, vm: &VirtualMachine) -> PyResult<PyHash> {
let py_int = hash_obj
.payload_if_subclass::<PyInt>(vm)
.ok_or_else(|| vm.new_type_error("__hash__ method should return an integer".to_owned()))?;
Ok(rustpython_common::hash::hash_bigint(py_int.as_bigint()))
let big_int = py_int.as_bigint();
let hash: PyHash = big_int
.to_i64()
.unwrap_or_else(|| (big_int % BigInt::from(u64::MAX)).to_i64().unwrap());
Ok(hash)
}
/// Marks a type as unhashable. Similar to PyObject_HashNotImplemented in CPython

View File

@@ -11,13 +11,6 @@ pub fn hash_iter<'a, I: IntoIterator<Item = &'a PyObjectRef>>(
vm.state.hash_secret.hash_iter(iter, |obj| obj.hash(vm))
}
pub fn hash_iter_unordered<'a, I: IntoIterator<Item = &'a PyObjectRef>>(
iter: I,
vm: &VirtualMachine,
) -> PyResult<rustpython_common::hash::PyHash> {
rustpython_common::hash::hash_iter_unordered(iter, |obj| obj.hash(vm))
}
impl ToPyObject for std::convert::Infallible {
fn to_pyobject(self, _vm: &VirtualMachine) -> PyObjectRef {
match self {}