From bc8a0b9f6e7ba15708fcce7be1870ad39d91a77a Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Fri, 31 Jul 2020 03:37:59 +0900 Subject: [PATCH] DictKey for str shares same hash with PyString --- common/src/hash.rs | 37 +++++++++++++++-------- vm/src/dictdatatype.rs | 66 +++++++++++++++++++----------------------- vm/src/obj/objint.rs | 2 +- vm/src/obj/objstr.rs | 2 +- vm/src/vm.rs | 8 ++--- 5 files changed, 61 insertions(+), 54 deletions(-) diff --git a/common/src/hash.rs b/common/src/hash.rs index 65a11f224..58c554f43 100644 --- a/common/src/hash.rs +++ b/common/src/hash.rs @@ -23,6 +23,17 @@ pub const SEED_BITS: usize = std::mem::size_of::() * 2 * 8; // pub const CUTOFF: usize = 7; +#[inline] +pub fn mod_int(value: i64) -> PyHash { + value % MODULUS as i64 +} + +pub fn hash_value(data: &T) -> PyHash { + let mut hasher = DefaultHasher::new(); + data.hash(&mut hasher); + mod_int(hasher.finish() as PyHash) +} + pub fn hash_float(value: f64) -> PyHash { // cpython _Py_HashDouble if !value.is_finite() { @@ -68,12 +79,6 @@ pub fn hash_float(value: f64) -> PyHash { x as PyHash * value.signum() as PyHash } -pub fn hash_value(data: &T) -> PyHash { - let mut hasher = DefaultHasher::new(); - data.hash(&mut hasher); - hasher.finish() as PyHash -} - pub fn hash_iter<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result where I: IntoIterator, @@ -84,7 +89,7 @@ where let item_hash = hashf(element)?; item_hash.hash(&mut hasher); } - Ok(hasher.finish() as PyHash) + Ok(mod_int(hasher.finish() as PyHash)) } pub fn hash_iter_unordered<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result @@ -98,12 +103,20 @@ where // xor is commutative and hash should be independent of order hash ^= item_hash; } - Ok(hash) + Ok(mod_int(hash)) } pub fn hash_bigint(value: &BigInt) -> PyHash { - match value.to_i64() { - Some(i64_value) => (i64_value % MODULUS as i64), - None => (value % MODULUS).to_i64().unwrap(), - } + value.to_i64().map_or_else( + || { + (value % MODULUS).to_i64().unwrap_or_else(|| + // guaranteed to be safe by mod + unsafe { std::hint::unreachable_unchecked() }) + }, + mod_int, + ) +} + +pub fn hash_str(value: &str) -> PyHash { + hash_value(value) } diff --git a/vm/src/dictdatatype.rs b/vm/src/dictdatatype.rs index 912455410..5a64c875f 100644 --- a/vm/src/dictdatatype.rs +++ b/vm/src/dictdatatype.rs @@ -1,17 +1,18 @@ -use crate::common::cell::{PyRwLock, PyRwLockReadGuard, PyRwLockWriteGuard}; -use crate::obj::objstr::{PyString, PyStringRef}; -use crate::pyobject::{IdProtocol, IntoPyObject, PyObjectRef, PyResult}; -use crate::vm::VirtualMachine; -use num_bigint::ToBigInt; -use rustpython_common::hash; /// Ordered dictionary implementation. /// Inspired by: https://morepypy.blogspot.com/2015/01/faster-more-memory-efficient-and-more.html /// And: https://www.youtube.com/watch?v=p33CVV29OG8 /// And: http://code.activestate.com/recipes/578375/ -use std::collections::{hash_map::DefaultHasher, HashMap}; -use std::hash::{Hash, Hasher}; +use crate::common::cell::{PyRwLock, PyRwLockReadGuard, PyRwLockWriteGuard}; +use crate::obj::objstr::{PyString, PyStringRef}; +use crate::pyobject::{IdProtocol, IntoPyObject, PyObjectRef, PyResult}; +use crate::vm::VirtualMachine; +use rustpython_common::hash; +use std::collections::HashMap; use std::mem::size_of; +// HashIndex is intended to be same size with hash::PyHash +// but it doesn't mean the values are compatible with actual pyhash value + /// hash value of an object returned by __hash__ type HashValue = hash::PyHash; /// index calculated by resolving collision @@ -303,7 +304,7 @@ impl Dict { /// Lookup the index for the given key. #[cfg_attr(feature = "flame-it", flame("Dict"))] fn lookup(&self, vm: &VirtualMachine, key: &K) -> PyResult { - let hash_value = key.do_hash(vm)?; + let hash_value = key.key_hash(vm)?; let perturb = hash_value; let mut hash_index: HashIndex = hash_value; 'outer: loop { @@ -314,7 +315,7 @@ impl Dict { let index = inner.indices[&hash_index]; if let Some(entry) = &inner.entries[index] { // Okay, we have an entry at this place - if key.do_is(&entry.key) { + if key.key_is(&entry.key) { // Literally the same object break 'outer Ok(LookupResult::Existing(index)); } else if entry.hash == hash_value { @@ -337,7 +338,7 @@ impl Dict { // warn!("Perturb value: {}", i); }; // This comparison needs to be done outside the lock. - if key.do_eq(vm, &entry.key)? { + if key.key_eq(vm, &entry.key)? { break Ok(LookupResult::Existing(index)); } else { // entry mismatch. @@ -412,40 +413,37 @@ enum LookupResult { /// - PyObjectRef -> arbitrary python type used as key /// - str -> string reference used as key, this is often used internally pub trait DictKey: IntoPyObject { - fn do_hash(&self, vm: &VirtualMachine) -> PyResult; - fn do_is(&self, other: &PyObjectRef) -> bool; - fn do_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult; + fn key_hash(&self, vm: &VirtualMachine) -> PyResult; + fn key_is(&self, other: &PyObjectRef) -> bool; + fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult; } /// Implement trait for PyObjectRef such that we can use python objects /// to index dictionaries. impl DictKey for PyObjectRef { - fn do_hash(&self, vm: &VirtualMachine) -> PyResult { - let raw_hash = vm._hash(self)?; - let mut hasher = DefaultHasher::new(); - raw_hash.hash(&mut hasher); - Ok(hasher.finish() as HashValue) + fn key_hash(&self, vm: &VirtualMachine) -> PyResult { + vm._hash(self) } - fn do_is(&self, other: &PyObjectRef) -> bool { + fn key_is(&self, other: &PyObjectRef) -> bool { self.is(other) } - fn do_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult { + fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult { vm.identical_or_equal(self, other_key) } } impl DictKey for PyStringRef { - fn do_hash(&self, _vm: &VirtualMachine) -> PyResult { + fn key_hash(&self, _vm: &VirtualMachine) -> PyResult { Ok(self.hash()) } - fn do_is(&self, other: &PyObjectRef) -> bool { + fn key_is(&self, other: &PyObjectRef) -> bool { self.is(other) } - fn do_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult { + fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult { if self.is(other_key) { Ok(true) } else if let Some(py_str_value) = other_key.payload::() { @@ -461,28 +459,24 @@ impl DictKey for PyStringRef { /// Implement trait for the str type, so that we can use strings /// to index dictionaries. impl DictKey for &str { - fn do_hash(&self, _vm: &VirtualMachine) -> PyResult { + fn key_hash(&self, _vm: &VirtualMachine) -> PyResult { // follow a similar route as the hashing of PyStringRef - let raw_hash = hash::hash_value(*self).to_bigint().unwrap(); - let raw_hash = hash::hash_bigint(&raw_hash); - let mut hasher = DefaultHasher::new(); - raw_hash.hash(&mut hasher); - Ok(hasher.finish() as HashValue) + Ok(hash::hash_str(*self)) } - fn do_is(&self, _other: &PyObjectRef) -> bool { + fn key_is(&self, _other: &PyObjectRef) -> bool { // No matter who the other pyobject is, we are never the same thing, since // we are a str, not a pyobject. false } - fn do_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult { + fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult { if let Some(py_str_value) = other_key.payload::() { Ok(py_str_value.as_str() == *self) } else { - // Fall back to PyString implementation. + // Fall back to PyObjectRef implementation. let s = vm.ctx.new_str(*self); - s.do_eq(vm, other_key) + s.key_eq(vm, other_key) } } } @@ -545,8 +539,8 @@ mod tests { let value1 = text; let value2 = vm.new_str(value1.to_owned()); - let hash1 = value1.do_hash(&vm).expect("Hash should not fail."); - let hash2 = value2.do_hash(&vm).expect("Hash should not fail."); + let hash1 = value1.key_hash(&vm).expect("Hash should not fail."); + let hash2 = value2.key_hash(&vm).expect("Hash should not fail."); assert_eq!(hash1, hash2); } } diff --git a/vm/src/obj/objint.rs b/vm/src/obj/objint.rs index 97be9bd6f..376979ea8 100644 --- a/vm/src/obj/objint.rs +++ b/vm/src/obj/objint.rs @@ -419,7 +419,7 @@ impl PyInt { } #[pymethod(name = "__hash__")] - pub fn hash(&self) -> hash::PyHash { + fn hash(&self) -> hash::PyHash { hash::hash_bigint(&self.value) } diff --git a/vm/src/obj/objstr.rs b/vm/src/obj/objstr.rs index cf71dccb2..8e35d3a44 100644 --- a/vm/src/obj/objstr.rs +++ b/vm/src/obj/objstr.rs @@ -283,7 +283,7 @@ impl PyString { #[pymethod(name = "__hash__")] pub(crate) fn hash(&self) -> hash::PyHash { self.hash.load().unwrap_or_else(|| { - let hash = hash::hash_value(&self.value); + let hash = hash::hash_str(&self.value); self.hash.store(Some(hash)); hash }) diff --git a/vm/src/vm.rs b/vm/src/vm.rs index 99234ff4d..a7f235ae2 100644 --- a/vm/src/vm.rs +++ b/vm/src/vm.rs @@ -1407,11 +1407,11 @@ impl VirtualMachine { } pub fn _hash(&self, obj: &PyObjectRef) -> PyResult { + // TODO: tp_hash let hash_obj = self.call_method(obj, "__hash__", vec![])?; - if let Some(hash_value) = hash_obj.payload_if_subclass::(self) { - Ok(hash_value.hash()) - } else { - Err(self.new_type_error("__hash__ method should return an integer".to_owned())) + match hash_obj.payload_if_subclass::(self) { + Some(py_int) => Ok(rustpython_common::hash::hash_bigint(py_int.as_bigint())), + None => Err(self.new_type_error("__hash__ method should return an integer".to_owned())), } }