File size: 1,785 Bytes
57e3690
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#version 450

#include "types.comp"

#extension GL_EXT_shader_16bit_storage : require

// Push-constant parameters for the pooling kernel, accessed as `p.<member>`.
// NOTE(review): member order and types presumably mirror a host-side struct
// that is not visible in this file -- confirm before reordering or retyping.
layout(push_constant) uniform parameter {
    uint IW; uint IH;   // input plane width / height; IW is the row stride used when indexing data_a
    uint OW; uint OH;   // output plane width / height; OW * OH is the size of one output plane
    uint OC;            // not referenced by this shader; presumably a channel count -- TODO confirm
    uint pelements;     // invocations whose global x index is >= pelements exit without writing
    uint op;            // pooling mode, compared against OP_POOL_MAX / OP_POOL_AVG in main()
    int k0; int k1;     // window extent: main() applies k0 along height and k1 along width
    int s0; int s1;     // stride: main() applies s0 along height and s1 along width
    int p0; int p1;     // leading padding: main() subtracts p0 along height and p1 along width
} p;

// Number of invocations per workgroup along x (used for local_size_x below).
#define BLOCK_SIZE 512
// Largest finite 32-bit float; main() negates it to seed the max reduction.
#define FLT_MAX 3.402823466e+38F
// Values of p.op that main() handles; any other value makes main() return
// without writing an output element.
#define OP_POOL_MAX 0u
#define OP_POOL_AVG 1u

// One-dimensional workgroup: BLOCK_SIZE x 1 x 1 invocations.
layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

// Input elements (read-only) and output elements (write-only).
// A_TYPE and D_TYPE are not defined in this file -- presumably supplied by
// types.comp or by the build; confirm.
layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};

// 2D pooling: each invocation produces exactly one output element.
//
// The global x index `idx` is split into a plane index `nc` and an output
// position (cur_oh, cur_ow) inside an OW x OH plane. The pooling window for
// that position is clipped to the input plane, reduced with either an average
// (sum of element * 1/(k0*k1), so clipped-away padding counts as zero) or a
// maximum, and the result is stored at data_d[idx].
//
// Invocations with idx >= p.pelements, or with an unrecognized p.op, return
// without writing anything.
void main() {
    const uint idx = gl_GlobalInvocationID.x;
    if (idx >= p.pelements) {
        return;
    }

    // Decide the reduction once instead of re-testing p.op for every element.
    const bool is_avg = (p.op == OP_POOL_AVG);
    if (!is_avg && p.op != OP_POOL_MAX) {
        return;
    }

    const uint O_HW = p.OW * p.OH;

    const uint nc     = idx / O_HW;
    const uint in_pl  = idx % O_HW;      // offset of this element inside its output plane
    const uint cur_oh = in_pl / p.OW;
    const uint cur_ow = in_pl % p.OW;

    // Clip the window to [0, IH) x [0, IW) in SIGNED arithmetic first.
    // Mixing int and uint in min() converts the int operand to uint, so a
    // window that ends at a negative coordinate (entirely inside the leading
    // padding) would wrap to a huge value, be clamped to IH / IW, and make the
    // loop read the whole column / row instead of nothing.
    // Assumes IH and IW fit in a signed 32-bit int.
    const int start_h = int(cur_oh) * p.s0 - p.p0;
    const int end_h   = min(start_h + p.k0, int(p.IH));
    const uint bh = uint(max(start_h, 0));
    const uint eh = uint(max(end_h, 0));

    const int start_w = int(cur_ow) * p.s1 - p.p1;
    const int end_w   = min(start_w + p.k1, int(p.IW));
    const uint bw = uint(max(start_w, 0));
    const uint ew = uint(max(end_w, 0));

    // Average divides by the full window size, not by the clipped element count.
    const float scale = 1.0 / float(p.k0 * p.k1);

    // An empty window leaves the seed value: 0 for average, -FLT_MAX for max.
    float res = is_avg ? 0.0 : -FLT_MAX;

    // First element of input plane `nc`.
    const uint plane_base = nc * p.IH * p.IW;

    for (uint i = bh; i < eh; i++) {
        const uint row_base = plane_base + i * p.IW;

        for (uint j = bw; j < ew; j++) {
            const float cur = D_TYPE(data_a[row_base + j]);

            if (is_avg) {
                res += cur * scale;
            } else {
                res = max(res, cur);
            }
        }
    }

    // nc * O_HW + cur_oh * OW + cur_ow is exactly idx by construction.
    data_d[idx] = res;
}