Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,051 Bytes
f29eac5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>
namespace cg = cooperative_groups;
#include "hilbert.h"
// Expands a 10-bit integer into 30 bits by inserting 2 zeros after each bit.
static __device__ uint32_t expandBits(uint32_t v)
{
v = (v * 0x00010001u) & 0xFF0000FFu;
v = (v * 0x00000101u) & 0x0F00F00Fu;
v = (v * 0x00000011u) & 0xC30C30C3u;
v = (v * 0x00000005u) & 0x49249249u;
return v;
}
// Removes 2 zeros after each bit in a 30-bit integer.
static __device__ uint32_t extractBits(uint32_t v)
{
v = v & 0x49249249;
v = (v ^ (v >> 2)) & 0x030C30C3u;
v = (v ^ (v >> 4)) & 0x0300F00Fu;
v = (v ^ (v >> 8)) & 0x030000FFu;
v = (v ^ (v >> 16)) & 0x000003FFu;
return v;
}
__global__ void hilbert_encode_cuda(
size_t N,
const uint32_t* x,
const uint32_t* y,
const uint32_t* z,
uint32_t* codes
) {
size_t thread_id = cg::this_grid().thread_rank();
if (thread_id >= N) return;
uint32_t point[3] = {x[thread_id], y[thread_id], z[thread_id]};
uint32_t m = 1 << 9, q, p, t;
// Inverse undo excess work
q = m;
while (q > 1) {
p = q - 1;
for (int i = 0; i < 3; i++) {
if (point[i] & q) {
point[0] ^= p; // invert
} else {
t = (point[0] ^ point[i]) & p;
point[0] ^= t;
point[i] ^= t;
}
}
q >>= 1;
}
// Gray encode
for (int i = 1; i < 3; i++) {
point[i] ^= point[i - 1];
}
t = 0;
q = m;
while (q > 1) {
if (point[2] & q) {
t ^= q - 1;
}
q >>= 1;
}
for (int i = 0; i < 3; i++) {
point[i] ^= t;
}
// Convert to 3D Hilbert code
uint32_t xx = expandBits(point[0]);
uint32_t yy = expandBits(point[1]);
uint32_t zz = expandBits(point[2]);
codes[thread_id] = xx * 4 + yy * 2 + zz;
}
__global__ void hilbert_decode_cuda(
size_t N,
const uint32_t* codes,
uint32_t* x,
uint32_t* y,
uint32_t* z
) {
size_t thread_id = cg::this_grid().thread_rank();
if (thread_id >= N) return;
uint32_t point[3];
point[0] = extractBits(codes[thread_id] >> 2);
point[1] = extractBits(codes[thread_id] >> 1);
point[2] = extractBits(codes[thread_id]);
uint32_t m = 2 << 9, q, p, t;
// Gray decode by H ^ (H/2)
t = point[2] >> 1;
for (int i = 2; i > 0; i--) {
point[i] ^= point[i - 1];
}
point[0] ^= t;
// Undo excess work
q = 2;
while (q != m) {
p = q - 1;
for (int i = 2; i >= 0; i--) {
if (point[i] & q) {
point[0] ^= p;
} else {
t = (point[0] ^ point[i]) & p;
point[0] ^= t;
point[i] ^= t;
}
}
q <<= 1;
}
x[thread_id] = point[0];
y[thread_id] = point[1];
z[thread_id] = point[2];
}
|