Merge pull request #2036 from youknowone/dictkey-hash

DictKey for str shares same hash with PyString
This commit is contained in:
Jeong YunWon
2020-08-06 17:47:27 +09:00
committed by GitHub
5 changed files with 61 additions and 54 deletions

View File

@@ -23,6 +23,17 @@ pub const SEED_BITS: usize = std::mem::size_of::<PyHash>() * 2 * 8;
// pub const CUTOFF: usize = 7;
#[inline]
pub fn mod_int(value: i64) -> PyHash {
value % MODULUS as i64
}
pub fn hash_value<T: Hash + ?Sized>(data: &T) -> PyHash {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
mod_int(hasher.finish() as PyHash)
}
pub fn hash_float(value: f64) -> PyHash {
// cpython _Py_HashDouble
if !value.is_finite() {
@@ -68,12 +79,6 @@ pub fn hash_float(value: f64) -> PyHash {
x as PyHash * value.signum() as PyHash
}
pub fn hash_value<T: Hash + ?Sized>(data: &T) -> PyHash {
let mut hasher = DefaultHasher::new();
data.hash(&mut hasher);
hasher.finish() as PyHash
}
pub fn hash_iter<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result<PyHash, E>
where
I: IntoIterator<Item = &'a T>,
@@ -84,7 +89,7 @@ where
let item_hash = hashf(element)?;
item_hash.hash(&mut hasher);
}
Ok(hasher.finish() as PyHash)
Ok(mod_int(hasher.finish() as PyHash))
}
pub fn hash_iter_unordered<'a, T: 'a, I, F, E>(iter: I, hashf: F) -> Result<PyHash, E>
@@ -98,12 +103,20 @@ where
// xor is commutative and hash should be independent of order
hash ^= item_hash;
}
Ok(hash)
Ok(mod_int(hash))
}
pub fn hash_bigint(value: &BigInt) -> PyHash {
match value.to_i64() {
Some(i64_value) => (i64_value % MODULUS as i64),
None => (value % MODULUS).to_i64().unwrap(),
}
value.to_i64().map_or_else(
|| {
(value % MODULUS).to_i64().unwrap_or_else(||
// guaranteed to be safe by mod
unsafe { std::hint::unreachable_unchecked() })
},
mod_int,
)
}
pub fn hash_str(value: &str) -> PyHash {
hash_value(value)
}

View File

@@ -1,17 +1,18 @@
use crate::common::cell::{PyRwLock, PyRwLockReadGuard, PyRwLockWriteGuard};
use crate::obj::objstr::{PyString, PyStringRef};
use crate::pyobject::{IdProtocol, IntoPyObject, PyObjectRef, PyResult};
use crate::vm::VirtualMachine;
use num_bigint::ToBigInt;
use rustpython_common::hash;
/// Ordered dictionary implementation.
/// Inspired by: https://morepypy.blogspot.com/2015/01/faster-more-memory-efficient-and-more.html
/// And: https://www.youtube.com/watch?v=p33CVV29OG8
/// And: http://code.activestate.com/recipes/578375/
use std::collections::{hash_map::DefaultHasher, HashMap};
use std::hash::{Hash, Hasher};
use crate::common::cell::{PyRwLock, PyRwLockReadGuard, PyRwLockWriteGuard};
use crate::obj::objstr::{PyString, PyStringRef};
use crate::pyobject::{IdProtocol, IntoPyObject, PyObjectRef, PyResult};
use crate::vm::VirtualMachine;
use rustpython_common::hash;
use std::collections::HashMap;
use std::mem::size_of;
// HashIndex is intended to be same size with hash::PyHash
// but it doesn't mean the values are compatible with actual pyhash value
/// hash value of an object returned by __hash__
type HashValue = hash::PyHash;
/// index calculated by resolving collision
@@ -303,7 +304,7 @@ impl<T: Clone> Dict<T> {
/// Lookup the index for the given key.
#[cfg_attr(feature = "flame-it", flame("Dict"))]
fn lookup<K: DictKey>(&self, vm: &VirtualMachine, key: &K) -> PyResult<LookupResult> {
let hash_value = key.do_hash(vm)?;
let hash_value = key.key_hash(vm)?;
let perturb = hash_value;
let mut hash_index: HashIndex = hash_value;
'outer: loop {
@@ -314,7 +315,7 @@ impl<T: Clone> Dict<T> {
let index = inner.indices[&hash_index];
if let Some(entry) = &inner.entries[index] {
// Okay, we have an entry at this place
if key.do_is(&entry.key) {
if key.key_is(&entry.key) {
// Literally the same object
break 'outer Ok(LookupResult::Existing(index));
} else if entry.hash == hash_value {
@@ -337,7 +338,7 @@ impl<T: Clone> Dict<T> {
// warn!("Perturb value: {}", i);
};
// This comparison needs to be done outside the lock.
if key.do_eq(vm, &entry.key)? {
if key.key_eq(vm, &entry.key)? {
break Ok(LookupResult::Existing(index));
} else {
// entry mismatch.
@@ -412,40 +413,37 @@ enum LookupResult {
/// - PyObjectRef -> arbitrary python type used as key
/// - str -> string reference used as key, this is often used internally
pub trait DictKey: IntoPyObject {
fn do_hash(&self, vm: &VirtualMachine) -> PyResult<HashValue>;
fn do_is(&self, other: &PyObjectRef) -> bool;
fn do_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult<bool>;
fn key_hash(&self, vm: &VirtualMachine) -> PyResult<HashValue>;
fn key_is(&self, other: &PyObjectRef) -> bool;
fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult<bool>;
}
/// Implement trait for PyObjectRef such that we can use python objects
/// to index dictionaries.
impl DictKey for PyObjectRef {
fn do_hash(&self, vm: &VirtualMachine) -> PyResult<HashValue> {
let raw_hash = vm._hash(self)?;
let mut hasher = DefaultHasher::new();
raw_hash.hash(&mut hasher);
Ok(hasher.finish() as HashValue)
fn key_hash(&self, vm: &VirtualMachine) -> PyResult<HashValue> {
vm._hash(self)
}
fn do_is(&self, other: &PyObjectRef) -> bool {
fn key_is(&self, other: &PyObjectRef) -> bool {
self.is(other)
}
fn do_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult<bool> {
fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult<bool> {
vm.identical_or_equal(self, other_key)
}
}
impl DictKey for PyStringRef {
fn do_hash(&self, _vm: &VirtualMachine) -> PyResult<HashValue> {
fn key_hash(&self, _vm: &VirtualMachine) -> PyResult<HashValue> {
Ok(self.hash())
}
fn do_is(&self, other: &PyObjectRef) -> bool {
fn key_is(&self, other: &PyObjectRef) -> bool {
self.is(other)
}
fn do_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult<bool> {
fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult<bool> {
if self.is(other_key) {
Ok(true)
} else if let Some(py_str_value) = other_key.payload::<PyString>() {
@@ -461,28 +459,24 @@ impl DictKey for PyStringRef {
/// Implement trait for the str type, so that we can use strings
/// to index dictionaries.
impl DictKey for &str {
fn do_hash(&self, _vm: &VirtualMachine) -> PyResult<HashValue> {
fn key_hash(&self, _vm: &VirtualMachine) -> PyResult<HashValue> {
// follow a similar route as the hashing of PyStringRef
let raw_hash = hash::hash_value(*self).to_bigint().unwrap();
let raw_hash = hash::hash_bigint(&raw_hash);
let mut hasher = DefaultHasher::new();
raw_hash.hash(&mut hasher);
Ok(hasher.finish() as HashValue)
Ok(hash::hash_str(*self))
}
fn do_is(&self, _other: &PyObjectRef) -> bool {
fn key_is(&self, _other: &PyObjectRef) -> bool {
// No matter who the other pyobject is, we are never the same thing, since
// we are a str, not a pyobject.
false
}
fn do_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult<bool> {
fn key_eq(&self, vm: &VirtualMachine, other_key: &PyObjectRef) -> PyResult<bool> {
if let Some(py_str_value) = other_key.payload::<PyString>() {
Ok(py_str_value.as_str() == *self)
} else {
// Fall back to PyString implementation.
// Fall back to PyObjectRef implementation.
let s = vm.ctx.new_str(*self);
s.do_eq(vm, other_key)
s.key_eq(vm, other_key)
}
}
}
@@ -545,8 +539,8 @@ mod tests {
let value1 = text;
let value2 = vm.new_str(value1.to_owned());
let hash1 = value1.do_hash(&vm).expect("Hash should not fail.");
let hash2 = value2.do_hash(&vm).expect("Hash should not fail.");
let hash1 = value1.key_hash(&vm).expect("Hash should not fail.");
let hash2 = value2.key_hash(&vm).expect("Hash should not fail.");
assert_eq!(hash1, hash2);
}
}

View File

@@ -419,7 +419,7 @@ impl PyInt {
}
#[pymethod(name = "__hash__")]
pub fn hash(&self) -> hash::PyHash {
fn hash(&self) -> hash::PyHash {
hash::hash_bigint(&self.value)
}

View File

@@ -283,7 +283,7 @@ impl PyString {
#[pymethod(name = "__hash__")]
pub(crate) fn hash(&self) -> hash::PyHash {
self.hash.load().unwrap_or_else(|| {
let hash = hash::hash_value(&self.value);
let hash = hash::hash_str(&self.value);
self.hash.store(Some(hash));
hash
})

View File

@@ -1407,11 +1407,11 @@ impl VirtualMachine {
}
pub fn _hash(&self, obj: &PyObjectRef) -> PyResult<rustpython_common::hash::PyHash> {
// TODO: tp_hash
let hash_obj = self.call_method(obj, "__hash__", vec![])?;
if let Some(hash_value) = hash_obj.payload_if_subclass::<PyInt>(self) {
Ok(hash_value.hash())
} else {
Err(self.new_type_error("__hash__ method should return an integer".to_owned()))
match hash_obj.payload_if_subclass::<PyInt>(self) {
Some(py_int) => Ok(rustpython_common::hash::hash_bigint(py_int.as_bigint())),
None => Err(self.new_type_error("__hash__ method should return an integer".to_owned())),
}
}