|
"""Cython wrapper for MurmurHash3 non-cryptographic hash function. |
|
|
|
MurmurHash is an extensively tested and very fast hash function that has |
|
good distribution properties suitable for machine learning use cases |
|
such as feature hashing and random projections. |
|
|
|
The original C++ code by Austin Appleby is released the public domain |
|
and can be found here: |
|
|
|
https://code.google.com/p/smhasher/ |
|
|
|
""" |
|
|
|
|
|
|
|
from ..utils._typedefs cimport int32_t, uint32_t |
|
|
|
import numpy as np |
|
|
|
cdef extern from "src/MurmurHash3.h": |
|
void MurmurHash3_x86_32(void *key, int len, uint32_t seed, void *out) |
|
void MurmurHash3_x86_128(void *key, int len, uint32_t seed, void *out) |
|
void MurmurHash3_x64_128 (void *key, int len, uint32_t seed, void *out) |
|
|
|
|
|
cpdef uint32_t murmurhash3_int_u32(int key, unsigned int seed): |
|
"""Compute the 32bit murmurhash3 of a int key at seed.""" |
|
cdef uint32_t out |
|
MurmurHash3_x86_32(&key, sizeof(int), seed, &out) |
|
return out |
|
|
|
|
|
cpdef int32_t murmurhash3_int_s32(int key, unsigned int seed): |
|
"""Compute the 32bit murmurhash3 of a int key at seed.""" |
|
cdef int32_t out |
|
MurmurHash3_x86_32(&key, sizeof(int), seed, &out) |
|
return out |
|
|
|
|
|
cpdef uint32_t murmurhash3_bytes_u32(bytes key, unsigned int seed): |
|
"""Compute the 32bit murmurhash3 of a bytes key at seed.""" |
|
cdef uint32_t out |
|
MurmurHash3_x86_32(<char*> key, len(key), seed, &out) |
|
return out |
|
|
|
|
|
cpdef int32_t murmurhash3_bytes_s32(bytes key, unsigned int seed): |
|
"""Compute the 32bit murmurhash3 of a bytes key at seed.""" |
|
cdef int32_t out |
|
MurmurHash3_x86_32(<char*> key, len(key), seed, &out) |
|
return out |
|
|
|
|
|
def _murmurhash3_bytes_array_u32( |
|
const int32_t[:] key, |
|
unsigned int seed, |
|
): |
|
"""Compute 32bit murmurhash3 hashes of a key int array at seed.""" |
|
|
|
cdef: |
|
uint32_t[:] out = np.zeros(key.size, np.uint32) |
|
Py_ssize_t i |
|
for i in range(key.shape[0]): |
|
out[i] = murmurhash3_int_u32(key[i], seed) |
|
return np.asarray(out) |
|
|
|
|
|
def _murmurhash3_bytes_array_s32( |
|
const int32_t[:] key, |
|
unsigned int seed, |
|
): |
|
"""Compute 32bit murmurhash3 hashes of a key int array at seed.""" |
|
|
|
cdef: |
|
int32_t[:] out = np.zeros(key.size, np.int32) |
|
Py_ssize_t i |
|
for i in range(key.shape[0]): |
|
out[i] = murmurhash3_int_s32(key[i], seed) |
|
return np.asarray(out) |
|
|
|
|
|
def murmurhash3_32(key, seed=0, positive=False): |
|
"""Compute the 32bit murmurhash3 of key at seed. |
|
|
|
The underlying implementation is MurmurHash3_x86_32 generating low |
|
latency 32bits hash suitable for implementing lookup tables, Bloom |
|
filters, count min sketch or feature hashing. |
|
|
|
Parameters |
|
---------- |
|
key : np.int32, bytes, unicode or ndarray of dtype=np.int32 |
|
The physical object to hash. |
|
|
|
seed : int, default=0 |
|
Integer seed for the hashing algorithm. |
|
|
|
positive : bool, default=False |
|
True: the results is casted to an unsigned int |
|
from 0 to 2 ** 32 - 1 |
|
False: the results is casted to a signed int |
|
from -(2 ** 31) to 2 ** 31 - 1 |
|
|
|
Examples |
|
-------- |
|
>>> from sklearn.utils import murmurhash3_32 |
|
>>> murmurhash3_32(b"Hello World!", seed=42) |
|
3565178 |
|
""" |
|
if isinstance(key, bytes): |
|
if positive: |
|
return murmurhash3_bytes_u32(key, seed) |
|
else: |
|
return murmurhash3_bytes_s32(key, seed) |
|
elif isinstance(key, unicode): |
|
if positive: |
|
return murmurhash3_bytes_u32(key.encode('utf-8'), seed) |
|
else: |
|
return murmurhash3_bytes_s32(key.encode('utf-8'), seed) |
|
elif isinstance(key, int) or isinstance(key, np.int32): |
|
if positive: |
|
return murmurhash3_int_u32(<int32_t>key, seed) |
|
else: |
|
return murmurhash3_int_s32(<int32_t>key, seed) |
|
elif isinstance(key, np.ndarray): |
|
if key.dtype != np.int32: |
|
raise TypeError( |
|
"key.dtype should be int32, got %s" % key.dtype) |
|
if positive: |
|
return _murmurhash3_bytes_array_u32(key.ravel(), seed).reshape(key.shape) |
|
else: |
|
return _murmurhash3_bytes_array_s32(key.ravel(), seed).reshape(key.shape) |
|
else: |
|
raise TypeError( |
|
"key %r with type %s is not supported. " |
|
"Explicit conversion to bytes is required" % (key, type(key))) |
|
|