#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#pragma OPENCL EXTENSION cl_khr_subgroups : enable

#ifdef cl_qcom_reqd_sub_group_size
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#endif
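
// Note on the attribute above (values per Qualcomm's extension): "half"
// pins the kernel to the half wave size, i.e. 64 fibers, which is what
// the REQD_SUBGROUP_SIZE_64 name encodes.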

// assumed compile-time constants: Q4_0 block size and number of waves (subgroups) per work group
#define QK4_0       32
#define N_SIMDGROUP 4
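
// Background (assuming the standard ggml Q4_0 scheme): each block packs
// QK4_0 = 32 weights as 4-bit quants q plus one fp16 scale d, and
// dequantizes as w = (q - 8) * d. Here the quants and scales live in
// separate buffers (src0_q / src0_d in the kernel below), which is why
// every accumulate in the macros below has the shape
// ((nibble) - 8) * scale * y.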

#define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \
    float shared_y; \
    shared_y = sub_group_broadcast(y.s0, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 0); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 0); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 0); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 0); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 0); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 0); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 0); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 1); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 1); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 1); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 1); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 1); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 1); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 1); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y;
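
// The _1_hi/_1_lo pair splits one 32-wide block of y across sub-group
// lanes: _1_hi consumes y broadcast from lanes 0 and 1, _1_lo from lanes
// 2 and 3. _1_lo reuses the shared_y declared by _1_hi, so _1_hi must be
// expanded first in the same scope.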

#define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y.s0, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 2); \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 2); \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 2); \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 2); \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 2); \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 2); \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 2); \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s0, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s1, 3); \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s2, 3); \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s3, 3); \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s4, 3); \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s5, 3); \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s6, 3); \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \
    shared_y = sub_group_broadcast(y.s7, 3); \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y;
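
// Vectorized variants: one float8 broadcast replaces eight scalar
// broadcasts per lane. This assumes the platform's sub_group_broadcast
// accepts vector types (core cl_khr_subgroups only mandates scalars),
// which is why their use in the kernel below is gated behind the
// VECTOR_SUB_GROUP_BROADCAT build option.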

#define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \
    float8 shared_y; \
    shared_y = sub_group_broadcast(y, 0); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 1); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7;

#define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \
    shared_y = sub_group_broadcast(y, 2); \
    total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \
    shared_y = sub_group_broadcast(y, 3); \
    total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \
    total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \
    total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \
    total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \
    total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \
    total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \
    total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \
    total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \
    total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \
    total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \
    total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \
    total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \
    total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \
    total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \
    total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \
    total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7;
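
// As with the scalar pair, _8_lo reuses the float8 shared_y declared by
// _8_hi, so _8_hi must be expanded first in the same scope.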

#ifdef ADRENO_GPU
REQD_SUBGROUP_SIZE_64
#endif
__kernel void kernel_gemv_noshuffle(
        __read_only image1d_buffer_t src0_q,  // quantized A
        global half2 * src0_d,                // A scales
        __read_only image1d_buffer_t src1,    // B
        ulong offset1,                        // offset to B (0)
        global float * dst,                   // C
        ulong offsetd,                        // offset to C (0)
        int ne00,                             // K
        int ne01,                             // M
        int ne02,                             // 1
        int ne10,                             // K
        int ne12,                             // 1
        int ne0,                              // M
        int ne1,                              // N
        int r2,                               // 1
        int r3)
{
    uint groupId = get_local_id(1);
    uint gid     = get_global_id(0);
    ushort slid  = get_sub_group_local_id();

    uint K = ne00;
    uint M = ne01;

    uint LINE_STRIDE_A  = M / 2;
    uint BLOCK_STRIDE_A = N_SIMDGROUP * M;

    __private uint4  regA;
    __private half2  regS;
    __private float8 regB;

    __private float2 totalSum = (float2)(0.0f);

    // loop along K in block granularity, skipping 4 blocks every iteration
    for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) {
        // each fiber loads the scales of its two rows for this block
        regS = src0_d[gid + k * LINE_STRIDE_A];
        // the first 4 fibers in each wave load 8 B values each into their private scope
        if (slid < 4) {
            regB.s0123 = read_imagef(src1, (slid * 2 + k * 8));
            regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8));
        }

        // load half of the weights for two blocks in consecutive rows
        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT

        // load the remaining half of the weights
        regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x;
        regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x;
        regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x;
        regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x;
#ifdef VECTOR_SUB_GROUP_BROADCAT
        dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB);
#else
        dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB);
#endif // VECTOR_SUB_GROUP_BROADCAT
    }

    // reduction in local memory, assumes #waves = 4
    __local float2 reduceLM[SIMDGROUP_WIDTH * 3];
    if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum;
    if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum;
    if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum;
    barrier(CLK_LOCAL_MEM_FENCE);
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid];
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid];
    if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid];

    // 2 outputs per fiber in wave 0
    if (groupId == 0) {
        dst = (global float*)((global char*)dst + offsetd);
        vstore2(totalSum, 0, &(dst[gid * 2]));
    }
}
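
// Expected dispatch geometry (inferred from the indexing above, not
// prescribed by this file): local size (SIMDGROUP_WIDTH, N_SIMDGROUP),
// i.e. one wave wide and four waves deep, and global size
// (M / 2, N_SIMDGROUP), since each fiber of wave 0 writes two consecutive
// outputs of dst via vstore2. SIMDGROUP_WIDTH is expected as a build-time
// define matching the wave size (64 on the Adreno path).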