File size: 2,166 Bytes
b664585
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_control_flow_attributes : require

// Push-constant block, accessed through the instance name `p`.
//
// Field naming as it is used by the index helpers in this file:
//   neXY - extent (element count) of dimension Y of tensor X
//   nbXY - stride of dimension Y of tensor X, multiplied by the coordinate
//          of that dimension when forming a buffer index
// with X = 0 for src0, X = 1 for src1 and X = 2 for dst, and Y = 0 being the
// fastest-varying dimension in get_indices().
// NOTE(review): strides are presumably expressed in elements of the respective
// buffer rather than bytes - confirm against the host code that fills this block.
//
// The member order and types define the push-constant layout and must stay in
// sync with whatever the host side uploads.
layout (push_constant) uniform parameter
{
    // Not referenced in this part of the file; presumably the total number of
    // elements to process - TODO confirm.
    uint ne;
    // src0: ne00..ne02 drive the index decomposition in get_indices(),
    // nb00..nb03 are used by src0_idx(). ne03 is not referenced here.
    uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03;
    // src1: ne10..ne13 are the wrap-around moduli and nb10..nb13 the strides
    // used by src1_idx().
    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
    // dst: nb20..nb23 are used by dst_idx(). ne20..ne23 are not referenced here.
    uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
    // Not referenced in this part of the file; presumably an offset applied to
    // the destination buffer - TODO confirm.
    uint d_offset;
    // Operation-specific parameters; their meaning is defined by the shader
    // that uses this block, not by the code visible here.
    float param1; float param2; int param3;
} p;

// Storage buffers: two read-only inputs at bindings 0 and 1, and one
// write-only output at binding 2. Each is declared as an unsized array.
// A_TYPE, B_TYPE and D_TYPE are not defined in this part of the file;
// presumably they are supplied as preprocessor definitions when the shader
// is compiled - TODO confirm.
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};

// Specialization constant (constant_id = 0); defaults to false.
// true if src0/src1 are the same shape and the indices can be reused without additional modulus
// When true, src1_idx() skips the per-dimension fastmod() wrap of the coordinates.
layout(constant_id = 0) const bool norepeat = false;

// Flattens the 3D global invocation ID into one linear index:
//   idx = z * 262144 + y * 512 + x   (262144 == 512 * 512)
// Written in nested form; the result is identical under uint wrap-around.
uint get_idx() {
    const uint x = gl_GlobalInvocationID.x;
    const uint y = gl_GlobalInvocationID.y;
    const uint z = gl_GlobalInvocationID.z;
    return (z * 512u + y) * 512u + x;
}

// Integer mod and div are expensive, and coordinates/dimensions are often a
// power of 2 or equal to 1, so cheaper paths are tried before falling back to
// the real operators.
//
// Returns a % b. If b has at most one bit set, the remainder is taken with a
// bit mask instead of the modulo operator. b == 0 also passes that test; the
// mask is then all ones and a is returned unchanged.
uint fastmod(uint a, uint b) {
    const uint mask = b - 1u;
    if ((b & mask) != 0u) {
        // b is not a power of two: use the general modulo
        return a % b;
    }
    return a & mask;
}

// Returns a / b. When a < b the quotient is known to be 0, so the division
// is skipped in that case.
uint fastdiv(uint a, uint b) {
    if (a < b) {
        return 0u;
    }
    return a / b;
}

// Splits the linear index idx into four coordinates using the extents
// p.ne00, p.ne01 and p.ne02, with i00 varying fastest and i03 slowest:
//   idx == ((i03 * ne02 + i02) * ne01 + i01) * ne00 + i00
// The two outer quotients go through fastdiv(), which returns 0 without
// dividing when the dividend is smaller than the divisor.
void get_indices(uint idx, out uint i00, out uint i01, out uint i02, out uint i03) {
    const uint row_len   = p.ne00;
    const uint plane_len = p.ne01 * p.ne00;
    const uint cube_len  = p.ne02 * p.ne01 * p.ne00;

    // remainder of idx after the slower dimensions have been peeled off
    uint rest = idx;

    i03 = fastdiv(rest, cube_len);
    rest -= i03 * cube_len;

    i02 = fastdiv(rest, plane_len);
    rest -= i02 * plane_len;

    i01 = rest / row_len;
    i00 = rest - i01 * row_len;
}

// Combines the four coordinates with the src0 strides p.nb00..p.nb03 into a
// single index: the sum of coordinate * stride over all four dimensions.
uint src0_idx(uint i00, uint i01, uint i02, uint i03) {
    uint offset = i00 * p.nb00;
    offset += i01 * p.nb01;
    offset += i02 * p.nb02;
    offset += i03 * p.nb03;
    return offset;
}

// Combines the four coordinates with the src1 strides p.nb10..p.nb13 into a
// single index. Unless the specialization constant norepeat is set, each
// coordinate is first wrapped modulo the matching src1 extent
// (p.ne10..p.ne13) via fastmod().
uint src1_idx(uint i00, uint i01, uint i02, uint i03) {
    uint c0 = i00;
    uint c1 = i01;
    uint c2 = i02;
    uint c3 = i03;

    if (!norepeat) {
        c0 = fastmod(i00, p.ne10);
        c1 = fastmod(i01, p.ne11);
        c2 = fastmod(i02, p.ne12);
        c3 = fastmod(i03, p.ne13);
    }

    return c3*p.nb13 + c2*p.nb12 + c1*p.nb11 + c0*p.nb10;
}

// Combines the four coordinates with the dst strides p.nb20..p.nb23 into a
// single index: the sum of coordinate * stride over all four dimensions.
uint dst_idx(uint i00, uint i01, uint i02, uint i03) {
    uint offset = i00 * p.nb20;
    offset += i01 * p.nb21;
    offset += i02 * p.nb22;
    offset += i03 * p.nb23;
    return offset;
}