|
#define CL_TARGET_OPENCL_VERSION 220 |
|
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS |
|
|
|
|
|
#pragma GCC diagnostic ignored "-Woverlength-strings" |
|
#ifdef __clang__ |
|
#pragma GCC diagnostic ignored "-Wgnu-anonymous-struct" |
|
#endif |
|
|
|
#include "ggml-opencl.h" |
|
#include "ggml-backend.h" |
|
#include "ggml-impl.h" |
|
#include "ggml-backend-impl.h" |
|
#include "ggml.h" |
|
|
|
#include <CL/cl.h> |
|
|
|
#include <string.h> |
|
|
|
#include <cstddef> |
|
#include <cstdint> |
|
#include <atomic> |
|
#include <fstream> |
|
#include <limits> |
|
#include <vector> |
|
#include <string> |
|
#include <cmath> |
|
|
|
#undef MIN |
|
#undef MAX |
|
#define MIN(a, b) ((a) < (b) ? (a) : (b)) |
|
#define MAX(a, b) ((a) > (b) ? (a) : (b)) |
|
|
|
#define UNUSED(x) (void)(x) |
|
|
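// Check an OpenCL status code and abort (via GGML_ASSERT) on failure, logging the
// failing expression, the error code, and the source location.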
|
#define CL_CHECK(err) \ |
|
do { \ |
|
cl_int err_ = (err); \ |
|
if (err_ != CL_SUCCESS) { \ |
|
GGML_LOG_ERROR("ggml_opencl: %s error %d at %s:%d\n", \ |
|
#err, err_, __FILE__, __LINE__); \ |
|
GGML_ASSERT(0); \ |
|
} \ |
|
} while (0) |
|
|
|
|
|
|
|
|
|
|
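// Forward declaration of the per-op dispatch routine invoked from graph_compute below.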
|
bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor); |
|
|
|
enum GPU_FAMILY { |
|
ADRENO, |
|
INTEL, |
|
UNKNOWN, |
|
}; |
|
|
|
enum ADRENO_GPU_GEN { |
|
ADRENO_UNKNOWN, |
|
A7X, |
|
A8X, |
|
X1E, |
|
}; |
|
|
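// Map an Adreno device name to a coarse GPU generation by matching known model-number
// substrings; unrecognized names fall back to ADRENO_UNKNOWN.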
|
static ADRENO_GPU_GEN get_adreno_gpu_gen(const char *device_name) { |
|
if (strstr(device_name, "730") || |
|
strstr(device_name, "740") || |
|
strstr(device_name, "750")) { |
|
return ADRENO_GPU_GEN::A7X; |
|
} |
|
|
|
if (strstr(device_name, "830")) { |
|
return ADRENO_GPU_GEN::A8X; |
|
} |
|
|
|
if (strstr(device_name, "X1")) { |
|
return ADRENO_GPU_GEN::X1E; |
|
} |
|
|
|
return ADRENO_GPU_GEN::ADRENO_UNKNOWN; |
|
} |
|
|
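// Extract the two-digit major version of the Adreno OpenCL compiler from the driver
// version string. The version field is located via the "E031" marker (or the older "DX"
// marker); returns -1 if neither marker is present.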
|
static int get_adreno_cl_compiler_version(const char *driver_version) { |
|
std::string driver_ver_str(driver_version); |
|
size_t compiler_ver_pos = driver_ver_str.find("E031"); |
|
size_t compiler_ver_len = 13; |
|
size_t compiler_ver_offset = 5; |
|
|
|
if (compiler_ver_pos == std::string::npos) { |
|
compiler_ver_pos = driver_ver_str.find("DX"); |
|
if (compiler_ver_pos == std::string::npos) { |
|
return -1; |
|
} |
|
compiler_ver_len = 11; |
|
compiler_ver_offset = 3; |
|
} |
|
|
|
std::string compiler_ver_str = driver_ver_str.substr(compiler_ver_pos, compiler_ver_len); |
|
std::string major_ver_str = compiler_ver_str.substr(compiler_ver_offset, 2); |
|
return std::atoi(major_ver_str.c_str()); |
|
} |
|
|
|
|
|
struct ggml_backend_opencl_device_context { |
|
cl_platform_id platform; |
|
std::string platform_name; |
|
|
|
cl_device_id device; |
|
std::string device_name; |
|
}; |
|
|
|
|
|
struct ggml_backend_opencl_context { |
|
cl_device_id device; |
|
std::string device_name; |
|
|
|
std::string driver_version; |
|
|
|
GPU_FAMILY gpu_family; |
|
ADRENO_GPU_GEN adreno_gen; |
|
|
|
    cl_uint alignment;
|
size_t max_alloc_size; |
|
bool fp16_support; |
|
|
|
int adreno_wave_size; |
|
|
|
cl_context context; |
|
cl_command_queue queue; |
|
|
|
cl_program program; |
|
cl_program program_1; |
|
cl_program program_2; |
|
|
|
cl_kernel kernel_add, kernel_add_row; |
|
cl_kernel kernel_mul, kernel_mul_row; |
|
cl_kernel kernel_scale; |
|
cl_kernel kernel_silu, kernel_silu_4; |
|
cl_kernel kernel_gelu, kernel_gelu_4; |
|
cl_kernel kernel_relu; |
|
cl_kernel kernel_clamp; |
|
cl_kernel kernel_norm; |
|
cl_kernel kernel_rms_norm; |
|
cl_kernel kernel_diag_mask_inf, kernel_diag_mask_inf_8; |
|
cl_kernel kernel_soft_max, kernel_soft_max_4; |
|
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0; |
|
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16; |
|
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32; |
|
cl_kernel kernel_mul_mat_f32_f32; |
|
cl_kernel kernel_mul_mat_f16_f16; |
|
cl_kernel kernel_mul_mat_f16_f32_1row; |
|
cl_kernel kernel_mul_mat_f16_f32; |
|
cl_kernel kernel_mul_mat_f16_f32_l4; |
|
cl_kernel kernel_mul_mat_q4_0_f32, kernel_mul_mat_q4_0_f32_v; |
|
cl_kernel kernel_convert_block_q4_0, kernel_restore_block_q4_0, kernel_mul_mat_q4_0_f32_flat; |
|
cl_kernel kernel_mul_mat_q4_0_f32_8x_flat; |
|
cl_kernel kernel_convert_block_q4_0_noshuffle, kernel_mul_mat_q4_0_f32_flat_v0, |
|
kernel_mul_mat_q4_0_f32_flat_img_v0; |
|
cl_kernel kernel_mul_mat_q4_0_f32_1d_8x_flat, kernel_mul_mat_q4_0_f32_1d_16x_flat; |
|
cl_kernel kernel_mul_mv_q6_K_f32; |
|
|
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
|
|
|
cl_program program_transpose_32; |
|
cl_program program_transpose_32_16; |
|
cl_program program_transpose_16; |
|
cl_kernel kernel_transpose_32; |
|
cl_kernel kernel_transpose_32_16; |
|
cl_kernel kernel_transpose_16; |
|
|
|
cl_mem A_s_d_max; |
|
cl_mem A_q_d_max; |
|
cl_mem B_d_max; |
|
|
|
|
|
cl_program program_CL_gemm; |
|
cl_program program_CL_gemv_general; |
|
cl_program program_CL_gemv_4096_1_11008; |
|
cl_program program_CL_gemv_4096_1_4096; |
|
cl_program program_CL_gemv_11008_1_4096; |
|
cl_program program_CL_gemv_32000_1_4096; |
|
cl_kernel CL_mul_mat_Ab_Bi_8x4; |
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general; |
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008; |
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096; |
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; |
|
cl_kernel CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; |
|
#endif |
|
}; |
|
|
|
static ggml_backend_device g_ggml_backend_opencl_device; |
|
static ggml_backend_opencl_device_context g_ggml_ctx_dev_main { |
|
nullptr, |
|
"", |
|
nullptr, |
|
"", |
|
}; |
|
|
|
static int ggml_backend_opencl_n_devices = 0; |
|
|
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
struct ProfilingInfo { |
|
std::string op_name; |
|
std::string kernel_name; |
|
|
|
cl_ulong duration_ns; |
|
|
|
size_t global_size[3]; |
|
size_t local_size[3]; |
|
|
|
size_t output_size[4]; |
|
}; |
|
|
|
std::vector<ProfilingInfo> g_profiling_info; |
|
#endif |
|
|
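// Read an entire text file into a string; returns an empty string if the file cannot be opened.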
|
inline std::string read_file(const std::string &path) { |
|
std::ifstream ifs(path); |
|
if (!ifs) { |
|
return ""; |
|
} |
|
std::string text; |
|
ifs.seekg(0, std::ios::end); |
|
text.resize(ifs.tellg()); |
|
ifs.seekg(0, std::ios::beg); |
|
ifs.read(&text[0], text.size()); |
|
return text; |
|
} |
|
|
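// Compile an OpenCL C source string for the given device. On failure the build log is
// printed and the process exits, since the backend cannot operate without its kernels.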
|
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer, const std::string &compile_opts) { |
|
cl_program p; |
|
char *program_log; |
|
size_t program_size; |
|
size_t log_size; |
|
int err; |
|
|
|
program_size = strlen(program_buffer); |
|
|
|
p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); |
|
if(err < 0) { |
|
GGML_LOG_ERROR("OpenCL error creating program"); |
|
exit(1); |
|
} |
|
|
|
err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); |
|
if(err < 0) { |
|
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); |
|
program_log = (char*) malloc(log_size + 1); |
|
program_log[log_size] = '\0'; |
|
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); |
|
GGML_LOG_ERROR("ggml_opencl: kernel compile error:\n\n%s\n", program_log); |
|
free(program_log); |
|
exit(1); |
|
} |
|
|
|
return p; |
|
} |
|
|
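// One-time backend initialization: enumerate OpenCL platforms and devices, select one
// (honoring the GGML_OPENCL_PLATFORM and GGML_OPENCL_DEVICE environment variables),
// query its capabilities, create the context and command queue, and compile all kernels.
// The result is cached in a static pointer, so subsequent calls return the same context.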
|
static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) { |
|
static bool initialized = false; |
|
static ggml_backend_opencl_context *backend_ctx = nullptr; |
|
|
|
if (initialized) { |
|
return backend_ctx; |
|
} |
|
|
|
ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *)dev->context; |
|
GGML_ASSERT(dev_ctx); |
|
GGML_ASSERT(dev_ctx->platform == nullptr); |
|
GGML_ASSERT(dev_ctx->device == nullptr); |
|
GGML_ASSERT(backend_ctx == nullptr); |
|
|
|
initialized = true; |
|
backend_ctx = new ggml_backend_opencl_context(); |
|
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; |
|
|
|
cl_int err; |
|
|
|
#ifdef GGML_OPENCL_PROFILING
|
GGML_LOG_INFO("ggml_opencl: OpenCL profiling enabled\n"); |
|
#endif |
|
|
|
struct cl_device; |
|
struct cl_platform { |
|
cl_platform_id id; |
|
unsigned number; |
|
char name[128]; |
|
char vendor[128]; |
|
struct cl_device * devices; |
|
unsigned n_devices; |
|
struct cl_device * default_device; |
|
}; |
|
|
|
struct cl_device { |
|
struct cl_platform * platform; |
|
cl_device_id id; |
|
unsigned number; |
|
cl_device_type type; |
|
char name[128]; |
|
}; |
|
|
|
enum { NPLAT = 16, NDEV = 16 }; |
|
|
|
struct cl_platform platforms[NPLAT]; |
|
unsigned n_platforms = 0; |
|
struct cl_device devices[NDEV]; |
|
unsigned n_devices = 0; |
|
struct cl_device * default_device = NULL; |
|
|
|
cl_platform_id platform_ids[NPLAT]; |
|
if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) { |
|
GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n"); |
|
return backend_ctx; |
|
} |
|
|
|
for (unsigned i = 0; i < n_platforms; i++) { |
|
struct cl_platform * p = &platforms[i]; |
|
p->number = i; |
|
p->id = platform_ids[i]; |
|
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); |
|
CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); |
|
|
|
cl_device_id device_ids[NDEV]; |
|
cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); |
|
if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { |
|
p->n_devices = 0; |
|
} else { |
|
CL_CHECK(clGetDeviceIDsError); |
|
} |
|
p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL; |
|
p->default_device = NULL; |
|
|
|
for (unsigned j = 0; j < p->n_devices; j++) { |
|
struct cl_device * d = &devices[n_devices]; |
|
d->number = n_devices++; |
|
d->id = device_ids[j]; |
|
d->platform = p; |
|
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); |
|
CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); |
|
|
|
if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { |
|
p->default_device = d; |
|
} |
|
} |
|
|
|
if (default_device == NULL && p->default_device != NULL) { |
|
default_device = p->default_device; |
|
} |
|
} |
|
|
|
if (n_devices == 0) { |
|
GGML_LOG_ERROR("ggml_opencl: could find any OpenCL devices.\n"); |
|
return backend_ctx; |
|
} |
|
|
|
char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); |
|
char * user_device_string = getenv("GGML_OPENCL_DEVICE"); |
|
int user_platform_number = -1; |
|
int user_device_number = -1; |
|
|
|
unsigned n; |
|
if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { |
|
user_platform_number = (int)n; |
|
} |
|
if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { |
|
user_device_number = (int)n; |
|
} |
|
if (user_platform_number != -1 && user_device_number != -1) { |
|
cl_platform* platform = &platforms[user_platform_number]; |
|
if ((unsigned)user_device_number >= platform->n_devices) { |
|
GGML_LOG_ERROR("ggml_opencl: invalid device number %d\n", user_device_number); |
|
exit(1); |
|
} |
|
default_device = &platform->devices[user_device_number]; |
|
} else { |
|
|
|
struct cl_device * selected_devices = devices; |
|
unsigned n_selected_devices = n_devices; |
|
|
|
if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { |
|
for (unsigned i = 0; i < n_platforms; i++) { |
|
struct cl_platform * p = &platforms[i]; |
|
if (strstr(p->name, user_platform_string) != NULL || |
|
strstr(p->vendor, user_platform_string) != NULL) { |
|
user_platform_number = (int)i; |
|
break; |
|
} |
|
} |
|
if (user_platform_number == -1) { |
|
GGML_LOG_ERROR("ggml_opencl: no platform matching '%s' was found.\n", user_platform_string); |
|
exit(1); |
|
} |
|
} |
|
if (user_platform_number != -1) { |
|
struct cl_platform * p = &platforms[user_platform_number]; |
|
selected_devices = p->devices; |
|
n_selected_devices = p->n_devices; |
|
default_device = p->default_device; |
|
if (n_selected_devices == 0) { |
|
GGML_LOG_ERROR("ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); |
|
exit(1); |
|
} |
|
} |
|
|
|
if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { |
|
for (unsigned i = 0; i < n_selected_devices; i++) { |
|
struct cl_device * d = &selected_devices[i]; |
|
if (strstr(d->name, user_device_string) != NULL) { |
|
user_device_number = d->number; |
|
break; |
|
} |
|
} |
|
if (user_device_number == -1) { |
|
GGML_LOG_ERROR("ggml_opencl: no device matching '%s' was found.\n", user_device_string); |
|
exit(1); |
|
} |
|
} |
|
if (user_device_number != -1) { |
|
selected_devices = &devices[user_device_number]; |
|
n_selected_devices = 1; |
|
default_device = &selected_devices[0]; |
|
} |
|
|
|
GGML_ASSERT(n_selected_devices > 0); |
|
|
|
if (default_device == NULL) { |
|
default_device = &selected_devices[0]; |
|
} |
|
} |
|
|
|
GGML_LOG_INFO("ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); |
|
GGML_LOG_INFO("ggml_opencl: selecting device: '%s'\n", default_device->name); |
|
if (default_device->type != CL_DEVICE_TYPE_GPU) { |
|
GGML_LOG_WARN("ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); |
|
} |
|
|
|
dev_ctx->platform = default_device->platform->id; |
|
dev_ctx->device = default_device->id; |
|
backend_ctx->device = default_device->id; |
|
|
|
if (strstr(default_device->name, "Adreno")) { |
|
backend_ctx->gpu_family = GPU_FAMILY::ADRENO; |
|
backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name); |
|
|
|
|
|
if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) { |
|
backend_ctx->adreno_wave_size = 64; |
|
} else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X || |
|
backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) { |
|
backend_ctx->adreno_wave_size = 128; |
|
} else { |
|
backend_ctx->adreno_wave_size = 128; |
|
GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, " |
|
"using wave size %d, " |
|
"may not work as expected\n", |
|
backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size); |
|
} |
|
} else if (strstr(default_device->name, "Intel")) { |
|
backend_ctx->gpu_family = GPU_FAMILY::INTEL; |
|
} else { |
|
GGML_LOG_ERROR("Unsupported GPU: %s\n", default_device->name); |
|
backend_ctx->gpu_family = GPU_FAMILY::UNKNOWN; |
|
return backend_ctx; |
|
} |
|
|
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
|
if (backend_ctx->gpu_family != GPU_FAMILY::ADRENO) { |
|
GGML_LOG_ERROR("ggml_opencl: Adreno-specific kernels should not be enabled for non-Adreno GPUs; " |
|
"run on an Adreno GPU or recompile with CMake option `-DGGML_OPENCL_USE_ADRENO_KERNELS=OFF`\n"); |
|
return backend_ctx; |
|
} |
|
#endif |
|
|
|
|
|
dev_ctx->platform_name = default_device->platform->name; |
|
dev_ctx->device_name = default_device->name; |
|
backend_ctx->device_name = default_device->name; |
|
|
|
|
|
cl_device_id device = backend_ctx->device; |
|
|
|
|
|
size_t device_ver_str_size; |
|
clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, &device_ver_str_size); |
|
char *device_ver_buffer = (char *)alloca(device_ver_str_size + 1); |
|
clGetDeviceInfo(device, CL_DEVICE_VERSION, device_ver_str_size, device_ver_buffer, NULL); |
|
device_ver_buffer[device_ver_str_size] = '\0'; |
|
GGML_LOG_INFO("ggml_opencl: device OpenCL version: %s\n", device_ver_buffer); |
|
|
|
if (strstr(device_ver_buffer, "OpenCL 2") == NULL && |
|
strstr(device_ver_buffer, "OpenCL 3") == NULL) { |
|
GGML_LOG_ERROR("ggml_opencl: OpenCL 2.0 or above is required\n"); |
|
return backend_ctx; |
|
} |
|
|
|
|
|
size_t driver_version_str_size; |
|
clGetDeviceInfo(device, CL_DRIVER_VERSION, 0, NULL, &driver_version_str_size); |
|
char *driver_version = (char *)alloca(driver_version_str_size + 1); |
|
clGetDeviceInfo(device, CL_DRIVER_VERSION, driver_version_str_size, driver_version, NULL); |
|
driver_version[driver_version_str_size] = '\0'; |
|
GGML_LOG_INFO("ggml_opencl: OpenCL driver: %s\n", driver_version); |
|
backend_ctx->driver_version = driver_version; |
|
|
|
int adreno_cl_compiler_version = get_adreno_cl_compiler_version(driver_version); |
|
bool has_vector_subgroup_broadcast = |
|
adreno_cl_compiler_version >= 47 || adreno_cl_compiler_version == 17; |
|
GGML_LOG_INFO("ggml_opencl: vector subgroup broadcast support: %s\n", |
|
has_vector_subgroup_broadcast ? "true" : "false"); |
|
|
|
size_t ext_str_size; |
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); |
|
char *ext_buffer = (char *)alloca(ext_str_size + 1); |
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); |
|
ext_buffer[ext_str_size] = '\0'; |
|
|
|
backend_ctx->fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL; |
|
GGML_LOG_INFO("ggml_opencl: device FP16 support: %s\n", backend_ctx->fp16_support ? "true" : "false"); |
|
|
|
|
|
if (!backend_ctx->fp16_support) { |
|
GGML_LOG_ERROR("ggml_opencl: device does not support FP16\n"); |
|
return backend_ctx; |
|
} |
|
|
|
|
|
|
|
if (strstr(device_ver_buffer, "OpenCL 3") && |
|
strstr(ext_buffer, "cl_khr_subgroups") == NULL && |
|
strstr(ext_buffer, "cl_intel_subgroups") == NULL) { |
|
GGML_LOG_ERROR("ggml_opencl: device does not support subgroups (cl_khr_subgroups or cl_intel_subgroups) " |
|
"(note that subgroups is an optional feature in OpenCL 3.0)\n"); |
|
return backend_ctx; |
|
} |
|
|
|
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &backend_ctx->alignment, NULL)); |
|
GGML_LOG_INFO("ggml_opencl: mem base addr align: %u\n", backend_ctx->alignment); |
|
|
|
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &backend_ctx->max_alloc_size, NULL); |
|
GGML_LOG_INFO("ggml_opencl: max mem alloc size: %zu MB\n", backend_ctx->max_alloc_size/1024/1024); |
|
|
|
|
|
cl_device_svm_capabilities svm_caps; |
|
CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_SVM_CAPABILITIES, sizeof(cl_device_svm_capabilities), &svm_caps, 0)); |
|
GGML_LOG_INFO("ggml_opencl: SVM coarse grain buffer support: %s\n", |
|
svm_caps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER ? "true" : "false"); |
|
GGML_LOG_INFO("ggml_opencl: SVM fine grain buffer support: %s\n", |
|
svm_caps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER ? "true" : "false"); |
|
GGML_LOG_INFO("ggml_opencl: SVM fine grain system support: %s\n", |
|
svm_caps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM ? "true" : "false"); |
|
GGML_LOG_INFO("ggml_opencl: SVM atomics support: %s\n", |
|
svm_caps & CL_DEVICE_SVM_ATOMICS ? "true" : "false"); |
|
|
|
|
|
#ifdef GGML_OPENCL_SOA_Q |
|
GGML_LOG_INFO("ggml_opencl: flattening quantized weights representation as struct of arrays (GGML_OPENCL_SOA_Q)\n"); |
|
#endif |
|
|
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
|
GGML_LOG_INFO("ggml_opencl: using kernels optimized for Adreno (GGML_OPENCL_USE_ADRENO_KERNELS)\n"); |
|
#endif |
|
|
|
cl_context_properties properties[] = { |
|
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)dev_ctx->platform, 0 |
|
}; |
|
|
|
CL_CHECK((backend_ctx->context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); |
|
|
|
|
|
cl_context context = backend_ctx->context; |
|
|
|
|
|
|
|
|
|
|
|
cl_command_queue_properties command_queue_props = 0; |
|
#ifdef GGML_OPENCL_PROFILING |
|
command_queue_props |= CL_QUEUE_PROFILING_ENABLE; |
|
#endif |
|
CL_CHECK((backend_ctx->queue = clCreateCommandQueue(context, device, command_queue_props, &err), err)); |
|
|
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string kernel_src { |
|
#include "ggml-opencl.cl.h" |
|
}; |
|
#else |
|
const std::string kernel_src = read_file("ggml-opencl.cl"); |
|
#endif |
|
|
|
std::string compile_opts = |
|
"-cl-std=CL2.0 -cl-mad-enable -cl-unsafe-math-optimizations " |
|
"-cl-finite-math-only -cl-fast-relaxed-math "; |
|
backend_ctx->program = build_program_from_source(context, device, kernel_src.c_str(), compile_opts); |
|
|
|
|
|
CL_CHECK((backend_ctx->kernel_get_rows_f32 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f32", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_get_rows_f16 = clCreateKernel(backend_ctx->program, "kernel_get_rows_f16", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_get_rows_q4_0 = clCreateKernel(backend_ctx->program, "kernel_get_rows_q4_0", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_add = clCreateKernel(backend_ctx->program, "kernel_add", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_add_row = clCreateKernel(backend_ctx->program, "kernel_add_row", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul = clCreateKernel(backend_ctx->program, "kernel_mul", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_row = clCreateKernel(backend_ctx->program, "kernel_mul_row", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_scale = clCreateKernel(backend_ctx->program, "kernel_scale", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_silu = clCreateKernel(backend_ctx->program, "kernel_silu", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_silu_4 = clCreateKernel(backend_ctx->program, "kernel_silu_4", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_gelu = clCreateKernel(backend_ctx->program, "kernel_gelu", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_gelu_4 = clCreateKernel(backend_ctx->program, "kernel_gelu_4", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_relu = clCreateKernel(backend_ctx->program, "kernel_relu", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_clamp = clCreateKernel(backend_ctx->program, "kernel_clamp", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_norm = clCreateKernel(backend_ctx->program, "kernel_norm", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_rms_norm = clCreateKernel(backend_ctx->program, "kernel_rms_norm", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_diag_mask_inf = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_diag_mask_inf_8 = clCreateKernel(backend_ctx->program, "kernel_diag_mask_inf_8", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_soft_max = clCreateKernel(backend_ctx->program, "kernel_soft_max", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_soft_max_4 = clCreateKernel(backend_ctx->program, "kernel_soft_max_4", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_rope_norm_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f32", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_rope_norm_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_norm_f16", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_rope_neox_f32 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f32", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_rope_neox_f16 = clCreateKernel(backend_ctx->program, "kernel_rope_neox_f16", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_cpy_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f16", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_cpy_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f16_f32", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_cpy_f32_f16 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f16", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_cpy_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_cpy_f32_f32", &err), err)); |
|
|
|
|
|
CL_CHECK((backend_ctx->kernel_mul_mat_f32_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f32_f32", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f16 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f16", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_1row = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_1row", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_f16_f32_l4", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32 = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_v = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_v", &err), err)); |
|
|
|
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_flat", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_convert_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_convert_block_q4_0", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_restore_block_q4_0 = clCreateKernel(backend_ctx->program, "kernel_restore_block_q4_0", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program, "kernel_mul_mat_q4_0_f32_8x_flat", &err), err)); |
|
|
|
|
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string kernel_src_1 { |
|
#include "ggml-opencl_mm.cl.h" |
|
}; |
|
#else |
|
const std::string kernel_src_1 = read_file("ggml-opencl_mm.cl"); |
|
#endif |
|
backend_ctx->program_1 = build_program_from_source(context, device, kernel_src_1.c_str(), compile_opts); |
|
|
|
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_8x_flat", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_1d_16x_flat", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mv_q6_K_f32 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mv_q6_K_f32", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_v0", &err), err)); |
|
CL_CHECK((backend_ctx->kernel_mul_mat_q4_0_f32_flat_img_v0 = clCreateKernel(backend_ctx->program_1, "kernel_mul_mat_q4_0_f32_flat_img_v0", &err), err)); |
|
|
|
|
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string kernel_src_2 { |
|
#include "ggml-opencl_cvt.cl.h" |
|
}; |
|
#else |
|
const std::string kernel_src_2 = read_file("ggml-opencl_cvt.cl"); |
|
#endif |
|
backend_ctx->program_2 = build_program_from_source(context, device, kernel_src_2.c_str(), compile_opts); |
|
|
|
CL_CHECK((backend_ctx->kernel_convert_block_q4_0_noshuffle = clCreateKernel(backend_ctx->program_2, "kernel_convert_block_q4_0_noshuffle", &err), err)); |
|
|
|
|
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string transpose_32_src { |
|
#include "ggml-opencl_transpose_32.cl.h" |
|
}; |
|
#else |
|
const std::string transpose_32_src = read_file("ggml-opencl_transpose_32.cl"); |
|
#endif |
|
backend_ctx->program_transpose_32 = build_program_from_source(context, device, transpose_32_src.c_str(), compile_opts); |
|
CL_CHECK((backend_ctx->kernel_transpose_32 = clCreateKernel(backend_ctx->program_transpose_32, "kernel_transpose_32", &err), err)); |
|
|
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string transpose_32_16_src { |
|
#include "ggml-opencl_transpose_32_16.cl.h" |
|
}; |
|
#else |
|
const std::string transpose_32_16_src = read_file("ggml-opencl_transpose_32_16.cl"); |
|
#endif |
|
backend_ctx->program_transpose_32_16 = build_program_from_source(context, device, transpose_32_16_src.c_str(), compile_opts); |
|
CL_CHECK((backend_ctx->kernel_transpose_32_16 = clCreateKernel(backend_ctx->program_transpose_32_16, "kernel_transpose_32_16", &err), err)); |
|
|
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string transpose_16_src { |
|
#include "ggml-opencl_transpose_16.cl.h" |
|
}; |
|
#else |
|
const std::string transpose_16_src = read_file("ggml-opencl_transpose_16.cl"); |
|
#endif |
|
backend_ctx->program_transpose_16 = build_program_from_source(context, device, transpose_16_src.c_str(), compile_opts); |
|
CL_CHECK((backend_ctx->kernel_transpose_16 = clCreateKernel(backend_ctx->program_transpose_16, "kernel_transpose_16", &err), err)); |
|
|
|
|
|
std::string CL_gemv_compile_opts = |
|
" -cl-std=CL2.0 " |
|
" -cl-mad-enable " |
|
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); |
|
if (has_vector_subgroup_broadcast) { |
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
|
} |
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string kernel_src_CL_gemv_general { |
|
#include "ggml-opencl_gemv_noshuffle_general.cl.h" |
|
}; |
|
#else |
|
const std::string kernel_src_CL_gemv_general = read_file("ggml-opencl_gemv_noshuffle_general.cl"); |
|
#endif |
|
|
|
backend_ctx->program_CL_gemv_general = build_program_from_source( |
|
context, device, kernel_src_CL_gemv_general.c_str(), CL_gemv_compile_opts); |
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general = clCreateKernel(backend_ctx->program_CL_gemv_general, "kernel_gemv_noshuffle", &err), err)); |
|
|
|
|
|
CL_gemv_compile_opts = |
|
" -cl-std=CL2.0 " |
|
" -cl-mad-enable " |
|
" -DLINE_STRIDE_A=2048 " |
|
" -DBLOCK_STRIDE_A=16384 " |
|
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); |
|
if (has_vector_subgroup_broadcast) { |
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
|
} |
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string kernel_src_CL_gemv { |
|
#include "ggml-opencl_gemv_noshuffle.cl.h" |
|
}; |
|
#else |
|
const std::string kernel_src_CL_gemv = read_file("ggml-opencl_gemv_noshuffle.cl"); |
|
#endif |
|
|
|
backend_ctx->program_CL_gemv_4096_1_4096 = build_program_from_source( |
|
context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); |
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_4096, "kernel_gemv_noshuffle", &err), err)); |
|
|
|
|
|
CL_gemv_compile_opts = |
|
" -cl-std=CL2.0 " |
|
" -cl-mad-enable " |
|
" -DLINE_STRIDE_A=2048 " |
|
" -DBLOCK_STRIDE_A=16384 " |
|
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); |
|
if (has_vector_subgroup_broadcast) { |
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
|
} |
|
|
|
backend_ctx->program_CL_gemv_4096_1_11008 = build_program_from_source( |
|
context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); |
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008 = clCreateKernel(backend_ctx->program_CL_gemv_4096_1_11008, "kernel_gemv_noshuffle", &err), err)); |
|
|
|
|
|
CL_gemv_compile_opts = |
|
" -cl-std=CL2.0 " |
|
" -cl-mad-enable " |
|
" -DLINE_STRIDE_A=5504 " |
|
" -DBLOCK_STRIDE_A=44032 " |
|
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); |
|
if (has_vector_subgroup_broadcast) { |
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
|
} |
|
|
|
backend_ctx->program_CL_gemv_11008_1_4096 = build_program_from_source( |
|
context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); |
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_11008_1_4096, "kernel_gemv_noshuffle", &err), err)); |
|
|
|
|
|
CL_gemv_compile_opts = |
|
" -cl-std=CL2.0 " |
|
" -cl-mad-enable " |
|
" -DLINE_STRIDE_A=16000 " |
|
" -DBLOCK_STRIDE_A=128000 " |
|
" -DSIMDGROUP_WIDTH=" + std::to_string(backend_ctx->adreno_wave_size); |
|
if (has_vector_subgroup_broadcast) { |
|
CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAT "; |
|
} |
|
|
|
backend_ctx->program_CL_gemv_32000_1_4096 = build_program_from_source(context, device, kernel_src_CL_gemv.c_str(), CL_gemv_compile_opts); |
|
CL_CHECK((backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096 = clCreateKernel(backend_ctx->program_CL_gemv_32000_1_4096, "kernel_gemv_noshuffle", &err), err)); |
|
|
|
|
|
#ifdef GGML_OPENCL_EMBED_KERNELS |
|
const std::string kernel_src_CL_gemm { |
|
#include "ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h" |
|
}; |
|
#else |
|
const std::string kernel_src_CL_gemm = read_file("ggml-opencl_mul_mat_Ab_Bi_8x4.cl"); |
|
#endif |
|
backend_ctx->program_CL_gemm = build_program_from_source(context, device, kernel_src_CL_gemm.c_str(), compile_opts); |
|
CL_CHECK((backend_ctx->CL_mul_mat_Ab_Bi_8x4 = clCreateKernel(backend_ctx->program_CL_gemm, "kernel_mul_mat_Ab_Bi_8x4", &err), err)); |
|
|
|
|
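    // Preallocate scratch buffers for the Adreno GEMM/GEMV path: transposed quantized
    // weights (A_q), their scales (A_s), and activations (B). The byte sizes are fixed
    // upper bounds for the largest shapes the tuned kernels above are specialized for.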
|
size_t max_A_q_d_bytes = 311164928; |
|
size_t max_A_s_d_bytes = 38895616; |
|
size_t max_B_d_bytes = 45088768; |
|
|
|
CL_CHECK((backend_ctx->A_q_d_max = clCreateBuffer(context, 0, max_A_q_d_bytes, NULL, &err), err)); |
|
CL_CHECK((backend_ctx->A_s_d_max = clCreateBuffer(context, 0, max_A_s_d_bytes, NULL, &err), err)); |
|
CL_CHECK((backend_ctx->B_d_max = clCreateBuffer(context, 0, max_B_d_bytes, NULL, &err), err)); |
|
#endif |
|
|
|
|
|
ggml_backend_opencl_n_devices = 1; |
|
|
|
return backend_ctx; |
|
} |
|
|
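// Backend tear-down hook. Currently it only flushes the collected profiling records to
// cl_profiling.csv when GGML_OPENCL_PROFILING is enabled; OpenCL objects are not released here.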
|
static void ggml_cl2_free(void) { |
|
#ifdef GGML_OPENCL_PROFILING |
|
FILE * fperf = fopen("cl_profiling.csv", "w"); |
|
if (!fperf) { |
|
GGML_LOG_ERROR("Failed to open cl_profiling.csv\n"); |
|
return; |
|
} |
|
|
|
float total_kernel_time = 0; |
|
fprintf(fperf, "op name, kernel name, duration (ms), global size, local size, output size\n"); |
|
for (const ProfilingInfo & info : g_profiling_info) { |
|
total_kernel_time += info.duration_ns/1.e6f; |
|
fprintf(fperf, "%s,%s,%f,%zux%zux%zu,%zux%zux%zu,%zux%zux%zux%zu\n", |
|
info.op_name.c_str(), info.kernel_name.c_str(), info.duration_ns/1.e6f, |
|
info.global_size[0], info.global_size[1], info.global_size[2], |
|
            info.local_size[0], info.local_size[1], info.local_size[2],
|
info.output_size[0], info.output_size[1], info.output_size[2], info.output_size[3]); |
|
} |
|
fclose(fperf); |
|
|
|
GGML_LOG_INFO("ggml_opencl: total kernel time: %f\n", total_kernel_time); |
|
#endif |
|
} |
|
|
|
|
|
|
|
|
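// Per-tensor bookkeeping for tensors stored in an OpenCL buffer: the cl_mem that backs
// the tensor, the byte offset of the tensor within it, and the size of the allocation.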
|
struct ggml_tensor_extra_cl { |
|
|
|
cl_mem data_device; |
|
|
|
|
|
|
|
|
|
cl_ulong offset; |
|
|
|
|
|
size_t actual_size; |
|
|
|
void reset() { |
|
data_device = nullptr; |
|
offset = 0; |
|
actual_size = 0; |
|
} |
|
}; |
|
|
|
|
|
|
|
|
|
|
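// Extra data for Q4_0 tensors kept in "flattened" (struct-of-arrays) layout: quantized
// values (q) and scales (d) live in separate sub-buffers, with optional image1d_buffer
// views (q_img, d_img) used by some Adreno kernels.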
|
struct ggml_tensor_extra_cl_q4_0 { |
|
|
|
cl_mem q = nullptr; |
|
|
|
cl_mem q_img = nullptr; |
|
|
|
cl_mem d = nullptr; |
|
|
|
cl_mem d_img = nullptr; |
|
|
|
size_t size_q = 0; |
|
|
|
size_t size_d = 0; |
|
|
|
~ggml_tensor_extra_cl_q4_0() { |
|
reset(); |
|
} |
|
|
|
void reset() { |
|
|
|
|
|
|
|
if (q != nullptr) { |
|
CL_CHECK(clReleaseMemObject(q)); |
|
q = nullptr; |
|
} |
|
if (d != nullptr) { |
|
CL_CHECK(clReleaseMemObject(d)); |
|
d = nullptr; |
|
} |
|
|
|
|
|
|
|
|
|
q_img = nullptr; |
|
d_img = nullptr; |
|
size_q = 0; |
|
size_d = 0; |
|
} |
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static const char * ggml_backend_opencl_name(ggml_backend_t backend) { |
|
return "OpenCL"; |
|
|
|
UNUSED(backend); |
|
} |
|
|
|
static void ggml_backend_opencl_free(ggml_backend_t backend) { |
|
ggml_cl2_free(); |
|
|
|
GGML_UNUSED(backend); |
|
} |
|
|
|
static void ggml_backend_opencl_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { |
|
GGML_UNUSED(backend); |
|
GGML_UNUSED(tensor); |
|
GGML_UNUSED(data); |
|
GGML_UNUSED(offset); |
|
GGML_UNUSED(size); |
|
} |
|
|
|
static void ggml_backend_opencl_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { |
|
GGML_UNUSED(backend); |
|
GGML_UNUSED(tensor); |
|
GGML_UNUSED(data); |
|
GGML_UNUSED(offset); |
|
GGML_UNUSED(size); |
|
} |
|
|
|
static bool ggml_backend_opencl_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { |
|
GGML_UNUSED(backend); |
|
GGML_UNUSED(src); |
|
GGML_UNUSED(dst); |
|
return false; |
|
} |
|
|
|
static void ggml_backend_opencl_synchronize(ggml_backend_t backend) { |
|
GGML_UNUSED(backend); |
|
} |
|
|
|
static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { |
|
for (int i = 0; i < cgraph->n_nodes; i++) { |
|
ggml_tensor * node = cgraph->nodes[i]; |
|
|
|
if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { |
|
continue; |
|
} |
|
|
|
bool ok = ggml_cl_compute_forward(backend, node); |
|
if (!ok) { |
|
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); |
|
} |
|
GGML_ASSERT(ok); |
|
} |
|
|
|
return GGML_STATUS_SUCCESS; |
|
} |
|
|
|
static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { |
|
GGML_UNUSED(dev); |
|
|
|
switch (op->op) { |
|
case GGML_OP_NONE: |
|
return true; |
|
case GGML_OP_GET_ROWS: |
|
switch (op->src[0]->type) { |
|
case GGML_TYPE_F32: |
|
case GGML_TYPE_F16: |
|
return true; |
|
case GGML_TYPE_Q4_0: |
|
#ifdef GGML_OPENCL_SOA_Q |
|
|
|
return false; |
|
#else |
|
return true; |
|
#endif |
|
default: |
|
return false; |
|
} |
|
case GGML_OP_CPY: |
|
case GGML_OP_DUP: |
|
case GGML_OP_CONT: |
|
switch (op->src[0]->type) { |
|
case GGML_TYPE_F32: |
|
switch (op->type) { |
|
case GGML_TYPE_F16: |
|
case GGML_TYPE_F32: |
|
return true; |
|
default: |
|
return false; |
|
} |
|
case GGML_TYPE_F16: |
|
switch (op->type) { |
|
case GGML_TYPE_F16: |
|
case GGML_TYPE_F32: |
|
return true; |
|
default: |
|
return false; |
|
} |
|
default: |
|
return false; |
|
} |
|
case GGML_OP_ADD: |
|
case GGML_OP_SCALE: |
|
case GGML_OP_MUL: |
|
return true; |
|
case GGML_OP_UNARY: |
|
switch (ggml_get_unary_op(op)) { |
|
case GGML_UNARY_OP_GELU: |
|
case GGML_UNARY_OP_SILU: |
|
case GGML_UNARY_OP_RELU: |
|
return ggml_is_contiguous(op->src[0]); |
|
default: |
|
return false; |
|
} |
|
case GGML_OP_CLAMP: |
|
case GGML_OP_SOFT_MAX: |
|
case GGML_OP_NORM: |
|
case GGML_OP_RMS_NORM: |
|
return true; |
|
case GGML_OP_MUL_MAT: |
|
if (op->src[0]->type == GGML_TYPE_F16) { |
|
return true; |
|
} else if (op->src[0]->type == GGML_TYPE_F32) { |
|
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); |
|
} else if (op->src[0]->type == GGML_TYPE_Q4_0 || |
|
op->src[0]->type == GGML_TYPE_Q6_K) { |
|
return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); |
|
} |
|
return false; |
|
case GGML_OP_RESHAPE: |
|
case GGML_OP_VIEW: |
|
case GGML_OP_PERMUTE: |
|
case GGML_OP_TRANSPOSE: |
|
return true; |
|
case GGML_OP_DIAG_MASK_INF: |
|
return op->ne[3] == 1; |
|
case GGML_OP_ROPE: |
|
return true; |
|
default: |
|
return false; |
|
} |
|
} |
|
|
|
|
|
static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type); |
|
|
|
static ggml_guid_t ggml_backend_opencl_guid() { |
|
static ggml_guid guid = { 0xde, 0xe0, 0x70, 0xa2, 0x73, 0x4e, 0x4d, 0xbc, 0xb0, 0xc7, 0x4f, 0xd4, 0x6d, 0x4e, 0x90, 0xfe }; |
|
return &guid; |
|
} |
|
|
|
static ggml_backend_i ggml_backend_opencl_i = { |
|
ggml_backend_opencl_name, |
|
ggml_backend_opencl_free, |
|
NULL, |
|
NULL, |
|
NULL, |
|
NULL, |
|
NULL, |
|
NULL, |
|
NULL, |
|
NULL, |
|
ggml_backend_opencl_graph_compute, |
|
NULL, |
|
NULL, |
|
}; |
|
|
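// Typical usage from application code (a minimal sketch, assuming the usual ggml-backend
// helpers; error handling omitted):
//
//   ggml_backend_t backend = ggml_backend_opencl_init();
//   ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
//   ggml_backend_graph_compute(backend, graph);
//   ggml_backend_buffer_free(buf);
//   ggml_backend_free(backend);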
|
ggml_backend_t ggml_backend_opencl_init(void) { |
|
ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_opencl_reg(), 0); |
|
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev); |
|
|
|
ggml_backend_t backend = new ggml_backend { |
|
ggml_backend_opencl_guid(), |
|
ggml_backend_opencl_i, |
|
dev, |
|
backend_ctx |
|
}; |
|
|
|
return backend; |
|
} |
|
|
|
bool ggml_backend_is_opencl(ggml_backend_t backend) { |
|
return backend && backend->iface.get_name == ggml_backend_opencl_name; |
|
} |
|
|
|
|
|
|
|
|
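// Context owned by an OpenCL backend buffer. It holds the cl_mem allocation(s) backing the
// buffer plus pools of tensor "extra" structs; extras handed out to tensors are recycled
// when the buffer is reset rather than freed individually.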
|
struct ggml_backend_opencl_buffer_context { |
|
|
|
|
|
|
|
|
|
|
|
ggml_backend_opencl_buffer_context(cl_mem buf) |
|
: name("OpenCL") { |
|
buffer.push_back(buf); |
|
} |
|
|
|
~ggml_backend_opencl_buffer_context() { |
|
for (cl_mem buf : buffer) { |
|
CL_CHECK(clReleaseMemObject(buf)); |
|
} |
|
for (cl_mem im : img) { |
|
CL_CHECK(clReleaseMemObject(im)); |
|
} |
|
|
|
|
|
for (ggml_tensor_extra_cl * e : temp_tensor_extras) { |
|
delete e; |
|
} |
|
for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) { |
|
delete e; |
|
} |
|
for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0) { |
|
delete e; |
|
} |
|
for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) { |
|
delete e; |
|
} |
|
} |
|
|
|
ggml_tensor_extra_cl * ggml_opencl_alloc_temp_tensor_extra() { |
|
ggml_tensor_extra_cl * extra; |
|
if (temp_tensor_extras.empty()) { |
|
extra = new ggml_tensor_extra_cl(); |
|
} else { |
|
extra = temp_tensor_extras.back(); |
|
temp_tensor_extras.pop_back(); |
|
} |
|
|
|
temp_tensor_extras_in_use.push_back(extra); |
|
|
|
extra->reset(); |
|
return extra; |
|
} |
|
|
|
ggml_tensor_extra_cl_q4_0 * ggml_opencl_alloc_temp_tensor_extra_q4_0() { |
|
ggml_tensor_extra_cl_q4_0 * extra; |
|
if (temp_tensor_extras_q4_0.empty()) { |
|
extra = new ggml_tensor_extra_cl_q4_0(); |
|
} else { |
|
extra = temp_tensor_extras_q4_0.back(); |
|
temp_tensor_extras_q4_0.pop_back(); |
|
} |
|
|
|
temp_tensor_extras_q4_0_in_use.push_back(extra); |
|
|
|
extra->reset(); |
|
return extra; |
|
} |
|
|
|
void reset() { |
|
for (ggml_tensor_extra_cl * e : temp_tensor_extras_in_use) { |
|
temp_tensor_extras.push_back(e); |
|
} |
|
temp_tensor_extras_in_use.clear(); |
|
|
|
for (ggml_tensor_extra_cl_q4_0 * e : temp_tensor_extras_q4_0_in_use) { |
|
temp_tensor_extras_q4_0.push_back(e); |
|
} |
|
temp_tensor_extras_q4_0_in_use.clear(); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<ggml_tensor_extra_cl *> temp_tensor_extras; |
|
std::vector<ggml_tensor_extra_cl *> temp_tensor_extras_in_use; |
|
std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0; |
|
std::vector<ggml_tensor_extra_cl_q4_0 *> temp_tensor_extras_q4_0_in_use; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::vector<cl_mem> buffer; |
|
|
|
|
|
|
|
|
|
std::vector<cl_mem> img; |
|
std::string name; |
|
}; |
|
|
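// Tensor data pointers in this backend are not host addresses: they are offsets from this
// fake base pointer, and the actual data lives in cl_mem objects referenced by the tensor
// extras (see ggml_backend_opencl_buffer_init_tensor).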
|
static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; |
|
|
|
static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { |
|
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
|
delete ctx; |
|
} |
|
|
|
static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { |
|
return cl_ptr_base; |
|
|
|
GGML_UNUSED(buffer); |
|
} |
|
|
|
static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { |
|
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
|
|
|
ggml_cl2_init(buffer->buft->device); |
|
|
|
if (tensor->view_src != nullptr) { |
|
GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); |
|
|
|
ggml_tensor_extra_cl * view_extra = (ggml_tensor_extra_cl *) tensor->view_src->extra; |
|
GGML_ASSERT(view_extra && "view_extra is nullptr?"); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tensor->extra = view_extra; |
|
} else { |
|
{ |
|
size_t offset = (char *)tensor->data - (char *)cl_ptr_base; |
|
|
|
ggml_tensor_extra_cl * extra = ctx->ggml_opencl_alloc_temp_tensor_extra(); |
|
extra->offset = offset; |
|
extra->data_device = ctx->buffer[0]; |
|
extra->actual_size = ggml_nbytes(tensor); |
|
|
|
tensor->extra = extra; |
|
} |
|
} |
|
} |
|
|
|
|
|
|
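// Heuristic for the Adreno-tuned matmul path: only used for 2D weight matrices with both
// dimensions of at least 512.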
|
inline bool use_adreno_kernels(const ggml_tensor *tensor) { |
|
return tensor->ne[0] >= 512 && tensor->ne[1] >= 512 && |
|
tensor->ne[2] == 1 && tensor->ne[3] == 1; |
|
} |
|
|
|
static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { |
|
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device); |
|
|
|
cl_context context = backend_ctx->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
#ifdef GGML_OPENCL_SOA_Q |
|
|
|
|
|
|
|
|
|
|
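    // With GGML_OPENCL_SOA_Q, Q4_0 data arrives in the usual interleaved block layout, is
    // staged into a temporary device buffer, and is then split by a conversion kernel into
    // separate scale (d) and quant (q) sub-buffers carved out of the original allocation.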
|
if (tensor->type == GGML_TYPE_Q4_0) { |
|
|
|
|
|
ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; |
|
GGML_ASSERT(extra_orig && "Tesnors in OpenCL backend should have been allocated and initialized"); |
|
|
|
|
|
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
|
ggml_tensor_extra_cl_q4_0 * extra = ctx->ggml_opencl_alloc_temp_tensor_extra_q4_0(); |
|
|
|
size_t size_d = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*sizeof(ggml_fp16_t); |
|
size_t size_q = ggml_nelements(tensor)/ggml_blck_size(tensor->type)*ggml_blck_size(tensor->type)/2; |
|
GGML_ASSERT(size_d + size_q == ggml_nbytes(tensor) && "Incorrect tensor size"); |
|
|
|
cl_int err; |
|
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, |
|
ggml_nbytes(tensor), NULL, &err); |
|
CL_CHECK(err); |
|
CL_CHECK(clEnqueueWriteBuffer( |
|
queue, data_device, CL_TRUE, 0, |
|
ggml_nbytes(tensor), data, 0, NULL, NULL)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cl_buffer_region region; |
|
|
|
|
|
|
|
|
|
region.origin = extra_orig->offset + tensor->view_offs + offset; |
|
region.size = size_d; |
|
extra->d = clCreateSubBuffer( |
|
extra_orig->data_device, CL_MEM_READ_WRITE, |
|
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
CL_CHECK(err); |
|
|
|
|
|
region.origin = extra_orig->offset + tensor->view_offs + offset + size_d; |
|
region.size = size_q; |
|
extra->q = clCreateSubBuffer( |
|
extra_orig->data_device, CL_MEM_READ_WRITE, |
|
            CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
|
CL_CHECK(err); |
|
|
|
|
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
|
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0; |
|
|
|
|
|
if (use_adreno_kernels(tensor)) { |
|
kernel = backend_ctx->kernel_convert_block_q4_0_noshuffle; |
|
} |
|
#else |
|
cl_kernel kernel = backend_ctx->kernel_convert_block_q4_0; |
|
#endif |
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->q)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->d)); |
|
|
|
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
CL_CHECK(clWaitForEvents(1, &evt)); |
|
CL_CHECK(clReleaseMemObject(data_device)); |
|
|
|
tensor->extra = extra; |
|
|
|
|
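        // On Adreno, additionally transpose the quant and scale planes in place: wrap the
        // sub-buffers in image1d_buffer views, run the 16-bit transpose kernel into scratch
        // buffers, then copy the transposed data back over extra->q and extra->d.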
|
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
|
|
|
|
|
if (use_adreno_kernels(tensor)) { |
|
|
|
|
|
|
|
int M = tensor->ne[1]; |
|
int K = tensor->ne[0]; |
|
|
|
|
|
|
|
|
|
|
|
size_t q_size_bytes = K * M / 8 * sizeof(float); |
|
cl_buffer_region region; |
|
region.origin = 0; |
|
region.size = q_size_bytes; |
|
cl_mem qT_d = clCreateSubBuffer( |
|
backend_ctx->A_q_d_max, |
|
0, |
|
CL_BUFFER_CREATE_TYPE_REGION, |
|
                &region,
|
&err); |
|
|
|
CL_CHECK(err); |
|
|
|
|
|
size_t d_size_bytes = M * (K / 32) * 2; |
|
region.origin = 0; |
|
region.size = d_size_bytes; |
|
cl_mem dT_d = clCreateSubBuffer( |
|
backend_ctx->A_s_d_max, |
|
0, |
|
CL_BUFFER_CREATE_TYPE_REGION, |
|
                &region,
|
&err); |
|
|
|
CL_CHECK(err); |
|
|
|
|
|
|
|
|
|
|
|
|
|
cl_mem q_d_image1D; |
|
cl_mem d_d_image1D; |
|
cl_mem qT_d_image1D; |
|
cl_mem dT_d_image1D; |
|
|
|
cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT }; |
|
cl_image_desc img_desc_1d; |
|
|
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
|
img_desc_1d.image_width = M * K / 8 / 4; |
|
img_desc_1d.buffer = extra->q; |
|
q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); |
|
CL_CHECK(err); |
|
|
|
img_fmt_1d = { CL_RGBA, CL_FLOAT }; |
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
|
img_desc_1d.image_width = M * K / 8 / 4; |
|
img_desc_1d.buffer = qT_d; |
|
qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); |
|
CL_CHECK(err); |
|
|
|
img_fmt_1d = { CL_RGBA, CL_FLOAT }; |
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
|
img_desc_1d.image_width = M * K / 32 / 4 / 2; |
|
img_desc_1d.buffer = extra->d; |
|
d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); |
|
CL_CHECK(err); |
|
|
|
img_fmt_1d = { CL_RGBA, CL_FLOAT }; |
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
|
img_desc_1d.image_width = M * K / 32 / 4 / 2; |
|
img_desc_1d.buffer = dT_d; |
|
dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err); |
|
CL_CHECK(err); |
|
|
|
|
|
|
|
|
|
|
|
int height_q = M / 8; |
|
int width_q = K / 8 / 4; |
|
kernel = backend_ctx->kernel_transpose_16; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &qT_d_image1D)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_q)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_q)); |
|
|
|
size_t local_size_q[3] = {4, 16, 1}; |
|
size_t global_size_q[3] = {static_cast<size_t>(width_q), static_cast<size_t>(height_q), 1}; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_q, local_size_q, 0, NULL, &evt)); |
|
CL_CHECK(clWaitForEvents(1, &evt)); |
|
|
|
|
|
int height_s = M / 8; |
|
int width_s = K / 32 / 8; |
|
|
|
kernel = backend_ctx->kernel_transpose_16; |
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &dT_d_image1D)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_s)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_s)); |
|
|
|
size_t local_size_s[3] = {4, 16, 1}; |
|
size_t global_size_s[3] = {static_cast<size_t>(width_s), static_cast<size_t>(height_s), 1}; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_size_s, local_size_s, 0, NULL, &evt)); |
|
CL_CHECK(clWaitForEvents(1, &evt)); |
|
|
|
|
|
|
|
|
|
|
|
CL_CHECK(clEnqueueCopyBuffer(queue, qT_d, extra->q, 0, 0, q_size_bytes, 0, NULL, &evt)); |
|
CL_CHECK(clWaitForEvents(1, &evt)); |
|
|
|
|
|
CL_CHECK(clEnqueueCopyBuffer(queue, dT_d, extra->d, 0, 0, d_size_bytes, 0, NULL, &evt)); |
|
CL_CHECK(clWaitForEvents(1, &evt)); |
|
|
|
|
|
|
|
|
|
CL_CHECK(clReleaseMemObject(qT_d)); |
|
CL_CHECK(clReleaseMemObject(dT_d)); |
|
|
|
|
|
CL_CHECK(clReleaseMemObject(q_d_image1D)); |
|
CL_CHECK(clReleaseMemObject(d_d_image1D)); |
|
CL_CHECK(clReleaseMemObject(qT_d_image1D)); |
|
CL_CHECK(clReleaseMemObject(dT_d_image1D)); |
|
|
|
|
|
|
|
} |
|
#endif |
|
|
|
return; |
|
} |
|
#endif |
|
|
|
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; |
|
GGML_ASSERT(extra); |
|
|
|
CL_CHECK(clEnqueueWriteBuffer( |
|
queue, extra->data_device, CL_TRUE, extra->offset + offset, |
|
size, data, 0, NULL, NULL)); |
|
|
|
GGML_UNUSED(buffer); |
|
} |
|
|
|
static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { |
|
GGML_ASSERT(tensor->extra); |
|
|
|
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer->buft->device); |
|
|
|
cl_context context = backend_ctx->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
|
|
CL_CHECK(clFinish(queue)); |
|
|
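    // For flattened Q4_0 tensors, reassemble the standard block layout on the device with
    // the restore kernel before reading the result back to the host.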
|
#ifdef GGML_OPENCL_SOA_Q |
|
|
|
|
|
|
|
|
|
|
|
|
|
if (tensor->type == GGML_TYPE_Q4_0) { |
|
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *)tensor->extra; |
|
|
|
cl_int err; |
|
cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, |
|
ggml_nbytes(tensor), NULL, &err); |
|
CL_CHECK(err); |
|
|
|
cl_kernel kernel = backend_ctx->kernel_restore_block_q4_0; |
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra->q)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->d)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &data_device)); |
|
|
|
size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; |
|
size_t local_work_size[] = {1, 1, 1}; |
|
|
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, |
|
global_work_size, local_work_size, 0, NULL, &evt)); |
|
CL_CHECK(clWaitForEvents(1, &evt)); |
|
CL_CHECK(clEnqueueReadBuffer( |
|
queue, data_device, CL_TRUE, offset, |
|
size, data, 0, NULL, NULL)); |
|
CL_CHECK(clReleaseMemObject(data_device)); |
|
return; |
|
} |
|
#endif |
|
|
|
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; |
|
|
|
CL_CHECK(clEnqueueReadBuffer( |
|
queue, extra->data_device, CL_TRUE, extra->offset + tensor->view_offs + offset, |
|
size, data, 0, NULL, NULL)); |
|
|
|
GGML_UNUSED(buffer); |
|
} |
|
|
|
static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { |
|
ggml_backend_dev_t dev = buffer->buft->device; |
|
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(dev); |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
|
for (cl_mem buf : ctx->buffer) { |
|
CL_CHECK(clEnqueueFillBuffer(queue, buf, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL)); |
|
} |
|
CL_CHECK(clFinish(queue)); |
|
} |
|
|
|
static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) { |
|
ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; |
|
ctx->reset(); |
|
} |
|
|
|
static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = { |
|
ggml_backend_opencl_buffer_free_buffer, |
|
ggml_backend_opencl_buffer_get_base, |
|
ggml_backend_opencl_buffer_init_tensor, |
|
NULL, |
|
ggml_backend_opencl_buffer_set_tensor, |
|
ggml_backend_opencl_buffer_get_tensor, |
|
NULL, |
|
ggml_backend_opencl_buffer_clear, |
|
ggml_backend_opencl_buffer_reset, |
|
}; |
|
|
|
|
|
|
|
|
|
|
|
static const char * ggml_backend_opencl_buffer_type_get_name(ggml_backend_buffer_type_t buffer_type) { |
|
return "OpenCL"; |
|
|
|
GGML_UNUSED(buffer_type); |
|
} |
|
|
|
static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { |
|
ggml_backend_opencl_context *backend_ctx = ggml_cl2_init(buffer_type->device); |
|
|
|
|
|
size = std::max(size, (size_t)1); |
|
|
|
cl_int err; |
|
cl_mem mem = clCreateBuffer(backend_ctx->context, CL_MEM_READ_WRITE, size, NULL, &err); |
|
if (err != CL_SUCCESS) { |
|
GGML_LOG_INFO("%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0); |
|
return nullptr; |
|
} |
|
|
|
ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context(mem); |
|
|
|
return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size); |
|
} |
|
|
|
static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { |
|
|
|
static cl_uint alignment = -1; |
|
if (alignment == (cl_uint)-1) { |
|
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); |
|
alignment = backend_ctx->alignment; |
|
} |
|
return alignment; |
|
} |
|
|
|
static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { |
|
static size_t max_size = -1; |
|
if (max_size == (size_t)-1) { |
|
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(buffer_type->device); |
|
max_size = backend_ctx->max_alloc_size; |
|
} |
|
return max_size; |
|
} |
|
|
|
static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { |
|
return ggml_backend_is_opencl(backend); |
|
|
|
UNUSED(buft); |
|
} |
|
|
|
static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { |
|
ggml_backend_opencl_buffer_type_get_name, |
|
ggml_backend_opencl_buffer_type_alloc_buffer, |
|
ggml_backend_opencl_buffer_type_get_alignment, |
|
ggml_backend_opencl_buffer_type_get_max_size, |
|
NULL, |
|
NULL, |
|
}; |
|
|
|
ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { |
|
static ggml_backend_buffer_type buffer_type = { |
|
ggml_backend_opencl_buffer_type_interface, |
|
&g_ggml_backend_opencl_device, |
|
nullptr, |
|
}; |
|
|
|
return &buffer_type; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
static const char * ggml_backend_opencl_device_get_name(ggml_backend_dev_t dev) { |
|
return "GPUOpenCL"; |
|
|
|
GGML_UNUSED(dev); |
|
} |
|
|
|
static const char * ggml_backend_opencl_device_get_description(ggml_backend_dev_t dev) { |
|
ggml_backend_opencl_device_context *dev_ctx = (ggml_backend_opencl_device_context *) dev->context; |
|
return dev_ctx->device_name.c_str(); |
|
} |
|
|
|
static void ggml_backend_opencl_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { |
|
*free = 1; |
|
*total = 1; |
|
|
|
GGML_UNUSED(dev); |
|
} |
|
|
|
static enum ggml_backend_dev_type ggml_backend_opencl_device_get_type(ggml_backend_dev_t dev) { |
|
return GGML_BACKEND_DEVICE_TYPE_GPU; |
|
|
|
GGML_UNUSED(dev); |
|
} |
|
|
|
static void ggml_backend_opencl_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { |
|
props->name = ggml_backend_opencl_device_get_name(dev); |
|
props->description = ggml_backend_opencl_device_get_description(dev); |
|
props->type = ggml_backend_opencl_device_get_type(dev); |
|
ggml_backend_opencl_device_get_memory(dev, &props->memory_free, &props->memory_total); |
|
props->caps = ggml_backend_dev_caps { |
|
false, |
|
false, |
|
false, |
|
false, |
|
}; |
|
} |
|
|
|
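// Create a backend instance for this device; the OpenCL context and queue come from ggml_cl2_init.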
static ggml_backend_t ggml_backend_opencl_device_init(ggml_backend_dev_t dev, const char * params) { |
|
ggml_backend_opencl_context * backend_ctx = ggml_cl2_init(dev); |
|
|
|
ggml_backend_t backend = new ggml_backend { |
|
ggml_backend_opencl_guid(), |
|
ggml_backend_opencl_i, |
|
dev, |
|
backend_ctx, |
|
}; |
|
|
|
return backend; |
|
|
|
GGML_UNUSED(params); |
|
} |
|
|
|
static ggml_backend_buffer_type_t ggml_backend_opencl_device_get_buffer_type(ggml_backend_dev_t dev) { |
|
return ggml_backend_opencl_buffer_type(); |
|
|
|
GGML_UNUSED(dev); |
|
} |
|
|
|
static ggml_backend_buffer_t ggml_backend_opencl_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { |
|
GGML_UNUSED(dev); |
|
GGML_UNUSED(ptr); |
|
GGML_UNUSED(size); |
|
GGML_UNUSED(max_tensor_size); |
|
return nullptr; |
|
} |
|
|
|
static bool ggml_backend_opencl_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { |
|
return ggml_opencl_supports_op(dev, op); |
|
} |
|
|
|
static bool ggml_backend_opencl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { |
|
return buft->iface.get_name == ggml_backend_opencl_buffer_type_get_name; |
|
|
|
GGML_UNUSED(dev); |
|
} |
|
|
|
static struct ggml_backend_device_i ggml_backend_opencl_device_i = { |
|
ggml_backend_opencl_device_get_name, |
|
ggml_backend_opencl_device_get_description, |
|
ggml_backend_opencl_device_get_memory, |
|
ggml_backend_opencl_device_get_type, |
|
ggml_backend_opencl_device_get_props, |
|
ggml_backend_opencl_device_init, |
|
ggml_backend_opencl_device_get_buffer_type, |
|
NULL, |
|
ggml_backend_opencl_device_buffer_from_ptr, |
|
ggml_backend_opencl_device_supports_op, |
|
ggml_backend_opencl_device_supports_buft, |
|
NULL, |
|
NULL, |
|
NULL, |
|
NULL, |
|
}; |
|
|
|
|
|
|
|
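// Backend registration.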
static const char * ggml_backend_opencl_reg_get_name(ggml_backend_reg_t reg) { |
|
return "OpenCL"; |
|
|
|
GGML_UNUSED(reg); |
|
} |
|
|
|
static size_t ggml_backend_opencl_reg_device_count(ggml_backend_reg_t reg) { |
|
return ggml_backend_opencl_n_devices; |
|
|
|
GGML_UNUSED(reg); |
|
} |
|
|
|
static ggml_backend_dev_t ggml_backend_opencl_reg_device_get(ggml_backend_reg_t reg, size_t index) { |
|
GGML_ASSERT(index == 0); |
|
|
|
return &g_ggml_backend_opencl_device; |
|
|
|
GGML_UNUSED(reg); |
|
GGML_UNUSED(index); |
|
} |
|
|
|
static struct ggml_backend_reg_i ggml_backend_opencl_reg_i = { |
|
ggml_backend_opencl_reg_get_name, |
|
ggml_backend_opencl_reg_device_count, |
|
ggml_backend_opencl_reg_device_get, |
|
NULL, |
|
}; |
|
|
|
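// Lazily initializes the registry and the single OpenCL device on first call.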
ggml_backend_reg_t ggml_backend_opencl_reg(void) { |
|
|
|
static ggml_backend_reg reg; |
|
static bool initialized = false; |
|
|
|
if (!initialized) { |
|
reg = ggml_backend_reg { |
|
GGML_BACKEND_API_VERSION, |
|
ggml_backend_opencl_reg_i, |
|
NULL, |
|
}; |
|
|
|
g_ggml_backend_opencl_device = ggml_backend_device { |
|
ggml_backend_opencl_device_i, |
|
&reg,
|
&g_ggml_ctx_dev_main, |
|
}; |
|
|
|
ggml_cl2_init(&g_ggml_backend_opencl_device); |
|
|
|
initialized = true; |
|
} |
|
|
|
return &reg;
|
} |
|
|
|
GGML_BACKEND_DL_IMPL(ggml_backend_opencl_reg) |
|
|
|
|
|
|
|
|
|
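// Disabled debugging helper: dump_tensor() reads a tensor back from the device and writes its contents to ./tensor-dumps/<name>.txt.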
#if 0 |
|
#define QK4_0 32 |
|
typedef struct { |
|
ggml_fp16_t d; |
|
uint8_t qs[QK4_0 / 2]; |
|
} block_q4_0; |
|
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, |
|
"wrong q4_0 block size/padding"); |
|
|
|
#include <math.h> |
|
#ifdef __cplusplus |
|
#include "half.hpp" |
|
#endif |
|
|
|
static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tensor) { |
|
void * buf = malloc(ggml_nbytes(tensor)); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
#ifdef GGML_OPENCL_SOA_Q |
|
void * buf_q; |
|
void * buf_d; |
|
#endif |
|
|
|
#ifdef GGML_USE_OPENCL |
|
|
|
CL_CHECK(clFinish(queue)); |
|
|
|
#ifdef GGML_OPENCL_SOA_Q |
|
if (tensor->type == GGML_TYPE_Q4_0) { |
|
ggml_tensor_extra_cl_q4_0 * extra = (ggml_tensor_extra_cl_q4_0 *) tensor->extra; |
|
GGML_ASSERT(extra); |
|
|
|
size_t size_q = ggml_nelements(tensor)/QK4_0 * QK4_0/2; |
|
size_t size_d = ggml_nelements(tensor)/QK4_0 * sizeof(ggml_fp16_t); |
|
GGML_ASSERT(size_q + size_d == ggml_nbytes(tensor)); |
|
buf_q = malloc(size_q); |
|
buf_d = malloc(size_d); |
|
|
|
CL_CHECK(clEnqueueReadBuffer(queue, extra->q, CL_TRUE, 0, size_q, buf_q, 0, NULL, NULL)); |
|
CL_CHECK(clEnqueueReadBuffer(queue, extra->d, CL_TRUE, 0, size_d, buf_d, 0, NULL, NULL)); |
|
CL_CHECK(clFinish(queue)); |
|
} else { |
|
|
|
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; |
|
GGML_ASSERT(extra); |
|
|
|
CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE, |
|
extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL)); |
|
CL_CHECK(clFinish(queue)); |
|
} |
|
#else |
|
|
|
ggml_tensor_extra_cl * extra = (ggml_tensor_extra_cl *) tensor->extra; |
|
GGML_ASSERT(extra); |
|
|
|
CL_CHECK(clEnqueueReadBuffer(queue, extra->data_device, CL_TRUE, |
|
extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL)); |
|
CL_CHECK(clFinish(queue)); |
|
#endif |
|
#endif |
|
|
|
|
|
char fname[512]; |
|
snprintf(fname, sizeof(fname), "./tensor-dumps/%s.txt", tensor->name);
|
FILE * f = fopen(fname, "w"); |
|
if (!f) { |
|
printf("Failed to open %s\n", fname); |
|
return; |
|
} |
|
|
|
if (tensor->type == GGML_TYPE_F32) { |
|
float * data = (float *) buf; |
|
for (int i = 0; i < ggml_nelements(tensor); ++i) { |
|
if (isnan(data[i])) { |
|
printf("NaN found: %s\n", tensor->name); |
|
break; |
|
} |
|
fprintf(f, "%f\n", data[i]); |
|
} |
|
} else if (tensor->type == GGML_TYPE_I32) {
int * data = (int *) buf;
// isnan() is meaningless for integer data, so no NaN check here.
for (int i = 0; i < ggml_nelements(tensor); ++i) {
fprintf(f, "%d\n", data[i]);
}
|
} else if (tensor->type == GGML_TYPE_F16) { |
|
#ifdef __cplusplus |
|
half_float::half * data = (half_float::half *) buf; |
|
for (int i = 0; i < ggml_nelements(tensor); ++i) { |
|
if (std::isnan(data[i])) { |
|
printf("NaN found: %s\n", tensor->name); |
|
break; |
|
} |
|
fprintf(f, "%f\n", float(data[i])); |
|
} |
|
#endif |
|
} else if (tensor->type == GGML_TYPE_Q4_0) { |
|
#ifdef GGML_OPENCL_SOA_Q |
|
ggml_fp16_t * data_d = (ggml_fp16_t *)buf_d; |
|
unsigned char * data_q = (unsigned char *)buf_q; |
|
|
|
for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) { |
|
fprintf(f, "%04x, ", data_d[i]); |
|
for (int k = 0; k < QK4_0/2; ++k) { |
|
fprintf(f, "%02x, ", data_q[k]); |
|
} |
|
fprintf(f, "\n"); |
|
data_q += QK4_0/2; |
|
} |
|
free(buf_d); |
|
free(buf_q); |
|
#else |
|
block_q4_0 * data = (block_q4_0 *) buf; |
|
for (int i = 0; i < ggml_nelements(tensor)/QK4_0; ++i) { |
|
fprintf(f, "%04x, ", data[i].d); |
|
for (int k = 0; k < QK4_0/2; ++k) { |
|
fprintf(f, "%02x, ", data[i].qs[k]); |
|
} |
|
fprintf(f, "\n"); |
|
} |
|
#endif |
|
} |
|
free(buf); |
|
fflush(f); |
|
fclose(f); |
|
} |
|
#else |
|
#define dump_tensor(tensor) |
|
#endif |
|
|
|
|
|
|
|
|
|
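// Profiling helper: records kernel duration and launch dimensions once the event has completed.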
#ifdef GGML_OPENCL_PROFILING |
|
void populateProfilingInfo( |
|
ProfilingInfo& info, cl_event evt, cl_kernel kernel, |
|
size_t global_size[3], size_t local_size[3], |
|
const ggml_tensor * tensor) { |
|
cl_ulong start; |
|
cl_ulong end; |
|
CL_CHECK(clWaitForEvents(1, &evt)); |
|
CL_CHECK(clGetEventProfilingInfo( |
|
evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL)); |
|
CL_CHECK(clGetEventProfilingInfo( |
|
evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL)); |
|
|
|
char kernel_name[512]; |
|
CL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, |
|
sizeof(kernel_name), kernel_name, NULL)); |
|
|
|
info.duration_ns = end - start; |
|
info.op_name = tensor->name; |
|
info.kernel_name = kernel_name; |
|
info.local_size[0] = local_size[0]; |
|
info.local_size[1] = local_size[1]; |
|
info.local_size[2] = local_size[2]; |
|
info.global_size[0] = global_size[0]; |
|
info.global_size[1] = global_size[1]; |
|
info.global_size[2] = global_size[2]; |
|
info.output_size[0] = tensor->ne[0]; |
|
info.output_size[1] = tensor->ne[1]; |
|
info.output_size[2] = tensor->ne[2]; |
|
info.output_size[3] = tensor->ne[3]; |
|
} |
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
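// Heuristic for whether a matrix multiplication is worth offloading to OpenCL.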
static bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { |
|
const int64_t ne10 = src1->ne[0]; |
|
|
|
const int64_t ne0 = dst->ne[0]; |
|
const int64_t ne1 = dst->ne[1]; |
|
|
|
|
|
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && |
|
src1->type == GGML_TYPE_F32 && |
|
dst->type == GGML_TYPE_F32 && |
|
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32); |
|
} |
|
|
|
static void ggml_cl_nop(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
UNUSED(backend); |
|
UNUSED(src0); |
|
UNUSED(src1); |
|
UNUSED(dst); |
|
} |
|
|
|
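// GET_ROWS: gather rows of src0 indexed by src1.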
static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(src1); |
|
GGML_ASSERT(src1->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
|
const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
|
const int ne10 = src1 ? src1->ne[0] : 0; |
|
const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
|
const int ne11 = src1 ? src1->ne[1] : 0; |
|
const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
|
const cl_ulong nb1 = dst ? dst->nb[1] : 0; |
|
const cl_ulong nb2 = dst ? dst->nb[2] : 0; |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offset1 = extra1->offset + src1->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
cl_kernel kernel; |
|
|
|
switch (src0->type) { |
|
case GGML_TYPE_F32: |
|
kernel = backend_ctx->kernel_get_rows_f32; |
|
break; |
|
case GGML_TYPE_F16: |
|
kernel = backend_ctx->kernel_get_rows_f16; |
|
break; |
|
case GGML_TYPE_Q4_0: |
|
kernel = backend_ctx->kernel_get_rows_q4_0; |
|
break; |
|
default: |
|
GGML_ASSERT(false && "not implemented"); |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb10)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb11)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb1)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb2)); |
|
|
|
size_t global_work_size[] = {(size_t)ne10, (size_t)ne11, 1}; |
|
size_t local_work_size[] = {1, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
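// ADD, with a fast path for broadcasting a single contiguous row of src1.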
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(src1); |
|
GGML_ASSERT(src1->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const int ne01 = src0 ? src0->ne[1] : 0; |
|
const int ne02 = src0 ? src0->ne[2] : 0; |
|
const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
|
const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
|
const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
|
const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
|
const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
|
const int ne10 = src1 ? src1->ne[0] : 0; |
|
const int ne11 = src1 ? src1->ne[1] : 0; |
|
const int ne12 = src1 ? src1->ne[2] : 0; |
|
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); |
|
|
|
const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
|
const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
|
const cl_ulong nb12 = src1 ? src1->nb[2] : 0; |
|
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); |
|
|
|
const int ne0 = dst ? dst->ne[0] : 0; |
|
const int ne1 = dst ? dst->ne[1] : 0; |
|
const int ne2 = dst ? dst->ne[2] : 0; |
|
const int ne3 = dst ? dst->ne[3] : 0; |
|
|
|
const cl_ulong nb0 = dst ? dst->nb[0] : 0; |
|
const cl_ulong nb1 = dst ? dst->nb[1] : 0; |
|
const cl_ulong nb2 = dst ? dst->nb[2] : 0; |
|
const cl_ulong nb3 = dst ? dst->nb[3] : 0; |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offset1 = extra1->offset + src1->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
bool bcast_row = false; |
|
cl_kernel kernel; |
|
|
|
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { |
|
GGML_ASSERT(ggml_is_contiguous(src0)); |
|
|
|
|
|
GGML_ASSERT(ne11 == 1); |
|
|
|
bcast_row = true; |
|
int ne = ne00 / 4; |
|
kernel = backend_ctx->kernel_add_row; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); |
|
} else { |
|
kernel = backend_ctx->kernel_add; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); |
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); |
|
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); |
|
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); |
|
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); |
|
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); |
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); |
|
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); |
|
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); |
|
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); |
|
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); |
|
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); |
|
} |
|
|
|
if (bcast_row) { |
|
int n = ggml_nelements(dst)/4; |
|
size_t global_work_size[] = {(size_t)n, 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} else { |
|
unsigned int nth = MIN(64, ne0); |
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; |
|
size_t local_work_size[] = {nth, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
} |
|
|
|
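// Element-wise MUL; same structure as ADD, including the row-broadcast fast path.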
static void ggml_cl_mul(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(src1); |
|
GGML_ASSERT(src1->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const int ne01 = src0 ? src0->ne[1] : 0; |
|
const int ne02 = src0 ? src0->ne[2] : 0; |
|
const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
|
const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
|
const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
|
const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
|
const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
|
const int ne10 = src1 ? src1->ne[0] : 0; |
|
const int ne11 = src1 ? src1->ne[1] : 0; |
|
const int ne12 = src1 ? src1->ne[2] : 0; |
|
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); |
|
|
|
const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
|
const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
|
const cl_ulong nb12 = src1 ? src1->nb[2] : 0; |
|
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13); |
|
|
|
const int ne0 = dst ? dst->ne[0] : 0; |
|
const int ne1 = dst ? dst->ne[1] : 0; |
|
const int ne2 = dst ? dst->ne[2] : 0; |
|
const int ne3 = dst ? dst->ne[3] : 0; |
|
|
|
const cl_ulong nb0 = dst ? dst->nb[0] : 0; |
|
const cl_ulong nb1 = dst ? dst->nb[1] : 0; |
|
const cl_ulong nb2 = dst ? dst->nb[2] : 0; |
|
const cl_ulong nb3 = dst ? dst->nb[3] : 0; |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offset1 = extra1->offset + src1->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
bool bcast_row = false; |
|
cl_kernel kernel; |
|
|
|
if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { |
|
GGML_ASSERT(ggml_is_contiguous(src0)); |
|
|
|
|
|
GGML_ASSERT(ne11 == 1); |
|
|
|
bcast_row = true; |
|
int ne = ne00 / 4; |
|
kernel = backend_ctx->kernel_mul_row; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne)); |
|
} else { |
|
kernel = backend_ctx->kernel_mul; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne03)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb00)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb03)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne11)); |
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne13)); |
|
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb10)); |
|
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb11)); |
|
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb12)); |
|
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb13)); |
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &ne2)); |
|
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &ne3)); |
|
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(cl_ulong), &nb0)); |
|
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(cl_ulong), &nb1)); |
|
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(cl_ulong), &nb2)); |
|
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(cl_ulong), &nb3)); |
|
} |
|
|
|
if (bcast_row) { |
|
int n = ggml_nelements(dst)/4; |
|
size_t global_work_size[] = {(size_t)n, 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} else { |
|
unsigned int nth = MIN(64, ne0); |
|
size_t global_work_size[] = {ne01*nth, (size_t)ne02, (size_t)ne03}; |
|
size_t local_work_size[] = {nth, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
} |
|
|
|
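// GELU; a vectorized kernel is used when the element count is a multiple of 4.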
static void ggml_cl_gelu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
UNUSED(src1); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
cl_kernel kernel; |
|
|
|
int n = ggml_nelements(dst); |
|
|
|
if (n % 4 == 0) { |
|
kernel = backend_ctx->kernel_gelu_4; |
|
n /= 4; |
|
} else { |
|
kernel = backend_ctx->kernel_gelu; |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
|
|
size_t global_work_size[] = {(size_t)n, 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
#endif |
|
} |
|
|
|
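// SiLU; a vectorized kernel is used when the element count is a multiple of 4.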
static void ggml_cl_silu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
UNUSED(src1); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
cl_kernel kernel; |
|
|
|
int n = ggml_nelements(dst); |
|
|
|
if (n % 4 == 0) { |
|
kernel = backend_ctx->kernel_silu_4; |
|
n /= 4; |
|
} else { |
|
kernel = backend_ctx->kernel_silu; |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
|
|
size_t global_work_size[] = {(size_t)n, 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
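// ReLU.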
static void ggml_cl_relu(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
UNUSED(src1); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
cl_kernel kernel = backend_ctx->kernel_relu; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
|
|
const int64_t n = ggml_nelements(dst); |
|
|
|
size_t global_work_size[] = {(size_t)n, 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
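// CLAMP; min/max are read from dst->op_params.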
static void ggml_cl_clamp(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
UNUSED(src1); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
float min; |
|
float max; |
|
memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); |
|
memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); |
|
|
|
cl_kernel kernel = backend_ctx->kernel_clamp; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &min)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(float), &max)); |
|
|
|
const int64_t n = ggml_nelements(dst); |
|
|
|
size_t global_work_size[] = {(size_t)n, 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
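// NORM: one workgroup per row, with local memory for the reduction.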
static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
UNUSED(src1); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
float eps; |
|
memcpy(&eps, dst->op_params, sizeof(float)); |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
|
|
|
GGML_ASSERT(ggml_is_contiguous_1(src0)); |
|
|
|
const int nth = MIN(64, ne00); |
|
|
|
cl_kernel kernel = backend_ctx->kernel_norm; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth, NULL)); |
|
|
|
const int64_t nrows = ggml_nrows(src0); |
|
|
|
size_t global_work_size[] = {(size_t)nrows*nth, 1, 1}; |
|
size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
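// RMS_NORM: one workgroup per row; the local memory size depends on the kernel's maximum subgroup size.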
static void ggml_cl_rms_norm(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
UNUSED(src1); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_backend_opencl_device_context * dev_ctx = |
|
(ggml_backend_opencl_device_context *)backend->device->context; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
float eps; |
|
memcpy(&eps, dst->op_params, sizeof(float)); |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
|
|
|
GGML_ASSERT(ne00 % 4 == 0); |
|
GGML_ASSERT(ggml_is_contiguous_1(src0)); |
|
|
|
const int nth = MIN(64, ne00); |
|
|
|
const int64_t nrows = ggml_nrows(src0); |
|
|
|
size_t global_work_size[] = {(size_t)nrows*nth, 1, 1}; |
|
size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
|
cl_kernel kernel = backend_ctx->kernel_rms_norm; |
|
|
|
|
|
|
|
|
|
|
|
size_t sgs; |
|
CL_CHECK(clGetKernelSubGroupInfo(kernel, dev_ctx->device, |
|
CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, |
|
sizeof(local_work_size), local_work_size, |
|
sizeof(size_t), &sgs, NULL)); |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(float), &eps)); |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(float)*nth/sgs, NULL)); |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
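// MUL_MAT dispatch: Adreno image-based kernels, flat Q4_0 kernels, or generic mat-vec kernels depending on types and shapes.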
static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(src1); |
|
GGML_ASSERT(src1->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; |
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offset1 = extra1->offset + src1->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
#ifdef GGML_OPENCL_SOA_Q |
|
ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra; |
|
#endif |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const int ne01 = src0 ? src0->ne[1] : 0; |
|
const int ne02 = src0 ? src0->ne[2] : 0; |
|
const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
|
const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
|
const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
|
const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
|
const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
|
const int ne10 = src1 ? src1->ne[0] : 0; |
|
const int ne11 = src1 ? src1->ne[1] : 0; |
|
const int ne12 = src1 ? src1->ne[2] : 0; |
|
const int ne13 = src1 ? src1->ne[3] : 0; |
|
|
|
const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
|
const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
|
const cl_ulong nb12 = src1 ? src1->nb[2] : 0; |
|
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; |
|
|
|
const int ne0 = dst ? dst->ne[0] : 0; |
|
const int ne1 = dst ? dst->ne[1] : 0; |
|
|
|
int r2 = ne12/ne02; |
|
int r3 = ne13/ne03; |
|
|
|
GGML_ASSERT(ne00 == ne10); |
|
|
|
int nth0 = 32; |
|
int nth1 = 1; |
|
int nrows = 1; |
|
|
|
int ndst = 4; |
|
|
|
cl_kernel kernel; |
|
|
|
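// Adreno-optimized path: weights and activations are rebound as 1D image buffers before launching the tuned kernels.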
#ifdef GGML_OPENCL_USE_ADRENO_KERNELS |
|
cl_context context = backend_ctx->context; |
|
|
|
if (ne01 && ne1 && use_adreno_kernels(src0)) { |
|
|
|
|
|
|
|
cl_int status; |
|
cl_image_format img_fmt_1d; |
|
cl_image_desc img_desc_1d; |
|
cl_buffer_region region; |
|
cl_mem A_image1d = nullptr; |
|
cl_mem B_image1d = nullptr; |
|
cl_mem B_sub_buffer = nullptr; |
|
cl_mem C_d = nullptr; |
|
|
|
cl_mem B_d = nullptr; |
|
cl_mem B_d_input_image = nullptr; |
|
|
|
|
|
|
|
|
|
int M = ne01; |
|
int N = ne1; |
|
int K = ne00; |
|
int padding; |
|
|
|
|
|
|
|
if (src0t == GGML_TYPE_Q4_0 && src1t == GGML_TYPE_F32) {
|
|
|
|
|
|
|
|
|
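// Create a 1D image view over the Q4_0 weight data; the channel type differs between the mat-vec (N == 1) and mat-mat cases.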
if (N == 1) { |
|
img_fmt_1d = { CL_R, CL_UNSIGNED_INT32}; |
|
} else { |
|
img_fmt_1d = { CL_R, CL_FLOAT}; |
|
} |
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
|
img_desc_1d.image_width = M * K / 2 / 4; |
|
img_desc_1d.buffer = extra0_q4_0->q; |
|
A_image1d = clCreateImage( |
|
context, |
|
CL_MEM_READ_ONLY, |
|
&img_fmt_1d, |
|
&img_desc_1d, |
|
NULL, |
|
&status); |
|
CL_CHECK(status); |
|
|
|
|
|
|
|
|
|
|
|
region.origin = (extra1->offset); |
|
region.size = K * N * sizeof(float); |
|
B_sub_buffer = clCreateSubBuffer( |
|
extra1->data_device, |
|
0, |
|
CL_BUFFER_CREATE_TYPE_REGION, |
|
&region,
|
&status); |
|
CL_CHECK(status); |
|
|
|
|
|
|
|
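// Mat-mat case: B is padded to a multiple of 8 columns and converted from FP32 to FP16 by the transpose kernel.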
if (N != 1) { |
|
|
|
int extra_elements = N % 8; |
|
|
|
|
|
padding = 0; |
|
if (extra_elements > 0) {
|
padding = 8 - extra_elements; |
|
} |
|
|
|
|
|
region.origin = 0; |
|
|
|
region.size = K * (N + padding) * sizeof(float)/2; |
|
B_d = clCreateSubBuffer( |
|
backend_ctx->B_d_max, |
|
0, |
|
CL_BUFFER_CREATE_TYPE_REGION, |
|
&region,
|
&status); |
|
CL_CHECK(status); |
|
|
|
cl_image_format image_format_B_d_input = { CL_RGBA, CL_FLOAT }; |
|
cl_image_desc image_desc_B_d_input = { |
|
CL_MEM_OBJECT_IMAGE1D_BUFFER, |
|
static_cast<size_t>(K * N / 4), |
|
0, 0, 0, 0, 0, 0, 0, { B_sub_buffer } |
|
}; |
|
B_d_input_image = clCreateImage( |
|
context, |
|
0, |
|
&image_format_B_d_input, |
|
&image_desc_B_d_input, |
|
NULL, |
|
&status); |
|
CL_CHECK(status); |
|
|
|
cl_image_format image_format_B_d_output = { CL_RGBA, CL_HALF_FLOAT }; |
|
cl_image_desc image_desc_B_d_output = { |
|
CL_MEM_OBJECT_IMAGE1D_BUFFER, |
|
static_cast<size_t>(K * (N + padding)/4), |
|
0, 0, 0, 0, 0, 0, 0, { B_d } |
|
}; |
|
B_image1d = clCreateImage( |
|
context, |
|
0, |
|
&image_format_B_d_output, |
|
&image_desc_B_d_output, |
|
NULL, |
|
&status); |
|
CL_CHECK(status); |
|
|
|
int height_B = N/4; |
|
int width_B = K/4; |
|
int padded_height_B = (N + padding)/4; |
|
|
|
kernel = backend_ctx->kernel_transpose_32_16; |
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &B_d_input_image)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_image1d)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); |
|
|
|
size_t local_size_t[2] = { 1, 16 }; |
|
|
|
if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { |
|
local_size_t[0]=4; |
|
local_size_t[1]=8; |
|
} else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { |
|
local_size_t[0]=2; |
|
local_size_t[1]=8; |
|
} else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) {
|
local_size_t[0]=1; |
|
local_size_t[1]=8; |
|
} else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) {
|
local_size_t[0]=2; |
|
local_size_t[1]=8; |
|
} |
|
|
|
size_t global_size_t[2] = { |
|
static_cast<size_t>(width_B), |
|
static_cast<size_t>(padded_height_B) |
|
}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_size_t, local_size_t, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_size_t, local_size_t, 0, NULL, NULL)); |
|
#endif |
|
} else { |
|
|
|
|
|
|
|
img_fmt_1d = {CL_RGBA, CL_FLOAT}; |
|
|
|
memset(&img_desc_1d, 0, sizeof(img_desc_1d)); |
|
img_desc_1d.image_width = K * N / 4; |
|
img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; |
|
img_desc_1d.buffer = B_sub_buffer; |
|
B_image1d = clCreateImage( |
|
context, |
|
CL_MEM_READ_ONLY, |
|
&img_fmt_1d, |
|
&img_desc_1d, |
|
NULL, |
|
&status); |
|
CL_CHECK(status); |
|
|
|
} |
|
|
|
|
|
|
|
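// Kernel selection: specialized variants for common (M, K) shapes in the mat-vec case, a general 8x4 kernel otherwise.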
if (N == 1) { |
|
kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_general; |
|
if (M == 4096 && K == 4096) { |
|
kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_4096; |
|
} else if (M == 4096 && K == 11008) { |
|
kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_4096_1_11008; |
|
} else if (M == 11008 && K == 4096) { |
|
kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_11008_1_4096; |
|
} else if (M == 32000 && K == 4096) { |
|
kernel = backend_ctx->CL_mul_mat_vec_q4_0_f32_1d_4x_flat_32000_1_4096; |
|
} |
|
} else { |
|
kernel = backend_ctx->CL_mul_mat_Ab_Bi_8x4; |
|
} |
|
|
|
|
|
|
|
|
|
cl_uint k_arg = 0; |
|
|
|
if (N == 1) { |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &A_image1d)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extra0_q4_0->d)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &B_image1d)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extra1->offset)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(cl_ulong), &extrad->offset)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r2)); |
|
CL_CHECK(clSetKernelArg(kernel, k_arg++, sizeof(int), &r3)); |
|
} else { |
|
region.origin = extrad->offset; |
|
region.size = M * N * sizeof(float); |
|
C_d = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &region, &status);
|
CL_CHECK(status); |
|
|
|
int padded_N = ne1 + padding; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &B_image1d)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &C_d)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &padded_N)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne1)); |
|
} |
|
|
|
|
|
|
|
|
|
size_t global_work_size[3] = { |
|
64, static_cast<size_t>((M+63)/64), static_cast<size_t>((N+31)/32)}; |
|
size_t local_work_size[3] = {64, 2, 4}; |
|
|
|
global_work_size[0] = (size_t)(ceil((float)ne1/8)); |
|
global_work_size[1] = (size_t)(ne01/4); |
|
global_work_size[2] = (size_t)(1); |
|
|
|
local_work_size[0] = (size_t)(1); |
|
local_work_size[1] = (size_t)(128); |
|
local_work_size[2] = (size_t)(1); |
|
|
|
|
|
if (ne0 == 4096 && ne1 == 128 && ne10 == 4096) { |
|
local_work_size[0] = 1; |
|
local_work_size[1] = 128; |
|
} else if (ne0 == 11008 && ne1 == 128 && ne10 == 4096) { |
|
local_work_size[0] = 2; |
|
local_work_size[1] = 64; |
|
} else if (ne0 == 4096 && ne1 == 128 && ne10 == 11008) { |
|
local_work_size[0] = 2; |
|
local_work_size[1] = 64; |
|
} else if (ne0 == 32000 && ne1 == 128 && ne10 == 4096) { |
|
local_work_size[0] = 2; |
|
local_work_size[1] = 64; |
|
} |
|
|
|
if (N == 1) { |
|
local_work_size[0] = backend_ctx->adreno_wave_size; |
|
local_work_size[1] = 4; |
|
local_work_size[2] = 1; |
|
|
|
global_work_size[0] = M / 2; |
|
global_work_size[1] = 4; |
|
global_work_size[2] = 1; |
|
} |
|
|
|
|
|
|
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt;
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
|
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
|
|
|
|
|
|
|
|
CL_CHECK(clReleaseMemObject(A_image1d)); |
|
CL_CHECK(clReleaseMemObject(B_sub_buffer)); |
|
CL_CHECK(clReleaseMemObject(B_image1d)); |
|
|
|
if (N != 1) { |
|
CL_CHECK(clReleaseMemObject(B_d)); |
|
CL_CHECK(clReleaseMemObject(B_d_input_image)); |
|
CL_CHECK(clReleaseMemObject(C_d)); |
|
} |
|
|
|
|
|
return; |
|
} |
|
} |
|
#endif |
|
|
|
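// Flat (SoA) Q4_0 matrix-matrix path for non-transposed F32 activations with more than two columns (ne11 > 2).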
if (!ggml_is_transposed(src0) && |
|
!ggml_is_transposed(src1) && |
|
src1t == GGML_TYPE_F32 && |
|
ne00%32 == 0 && |
|
ne11 > 2) { |
|
#ifdef GGML_OPENCL_SOA_Q |
|
|
|
switch(src0t) { |
|
case GGML_TYPE_Q4_0: |
|
|
|
GGML_ASSERT(ne11 == ne1); |
|
GGML_ASSERT(ne01 == ne0); |
|
|
|
if (backend_ctx->gpu_family == INTEL) { |
|
nth0 = 16; |
|
nth1 = 1; |
|
|
|
kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_16x_flat; |
|
} else if (backend_ctx->gpu_family == ADRENO) { |
|
nth0 = 64; |
|
nth1 = 1; |
|
|
|
kernel = backend_ctx->kernel_mul_mat_q4_0_f32_1d_8x_flat; |
|
} else { |
|
GGML_ASSERT(false && "TODO: Unknown GPU"); |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); |
|
break; |
|
default: |
|
break; |
|
} |
|
|
|
|
|
if (src0t == GGML_TYPE_Q4_0) { |
|
size_t global_work_size[] = {(size_t)(ne01 + 7)/8*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; |
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; |
|
|
|
if (backend_ctx->gpu_family == INTEL) { |
|
|
|
global_work_size[0] = (size_t)(ne01 + 15)/16*nth0; |
|
global_work_size[1] = (size_t)ne11*nth1; |
|
global_work_size[2] = (size_t)ne12*ne13; |
|
} |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
return; |
|
} |
|
#else |
|
|
|
#endif |
|
} |
|
|
|
|
|
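// Generic matrix-vector style kernels for the remaining type combinations.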
switch (src0t) { |
|
case GGML_TYPE_F32: |
|
|
|
GGML_ASSERT(src1t == GGML_TYPE_F32); |
|
kernel = backend_ctx->kernel_mul_mat_f32_f32; |
|
nrows = 4; |
|
|
|
if (backend_ctx->gpu_family == INTEL) { |
|
nth0 = 32; |
|
nth1 = 1; |
|
} else if (backend_ctx->gpu_family == ADRENO) { |
|
nth0 = 64; |
|
nth1 = 1; |
|
} else { |
|
GGML_ASSERT(false && "TODO: Unknown GPU"); |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); |
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10)); |
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11)); |
|
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); |
|
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); |
|
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2)); |
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3)); |
|
break; |
|
case GGML_TYPE_F16: |
|
|
|
if (backend_ctx->gpu_family == INTEL) { |
|
nth0 = 32; |
|
nth1 = 1; |
|
} else if (backend_ctx->gpu_family == ADRENO) { |
|
nth0 = 64; |
|
nth1 = 1; |
|
} else { |
|
GGML_ASSERT(false && "TODO: Unknown GPU"); |
|
} |
|
|
|
if (src1t == GGML_TYPE_F32) { |
|
if (ne11 * ne12 < 4) { |
|
kernel = backend_ctx->kernel_mul_mat_f16_f32_1row; |
|
} else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { |
|
kernel = backend_ctx->kernel_mul_mat_f16_f32_l4; |
|
nrows = ne11; |
|
} else { |
|
kernel = backend_ctx->kernel_mul_mat_f16_f32; |
|
nrows = 4; |
|
} |
|
} else { |
|
kernel = backend_ctx->kernel_mul_mat_f16_f16; |
|
nrows = 4; |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb00)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb02)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb03)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11)); |
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10)); |
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11)); |
|
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); |
|
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); |
|
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &r2)); |
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r3)); |
|
break; |
|
case GGML_TYPE_Q4_0: |
|
|
|
GGML_ASSERT(ne11 == ne1); |
|
GGML_ASSERT(ne01 == ne0); |
|
|
|
#ifdef GGML_OPENCL_SOA_Q |
|
if (backend_ctx->gpu_family == INTEL) { |
|
nth0 = 16; |
|
nth1 = 1; |
|
|
|
kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat; |
|
ndst = 8; |
|
} else if (backend_ctx->gpu_family == ADRENO) { |
|
nth0 = 64; |
|
nth1 = 1; |
|
|
|
kernel = backend_ctx->kernel_mul_mat_q4_0_f32_8x_flat; |
|
ndst = 8;
|
} else { |
|
GGML_ASSERT(false && "TODO: Unknown GPU"); |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q4_0->q)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q4_0->d)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); |
|
#else |
|
if (backend_ctx->gpu_family == INTEL) { |
|
|
|
|
|
|
|
|
|
nth0 = 16; |
|
nth1 = 1; |
|
|
|
kernel = backend_ctx->kernel_mul_mat_q4_0_f32; |
|
ndst = 4; |
|
} else if (backend_ctx->gpu_family == ADRENO) { |
|
nth0 = 64; |
|
nth1 = 1; |
|
|
|
kernel = backend_ctx->kernel_mul_mat_q4_0_f32_v; |
|
ndst = 4; |
|
} else { |
|
GGML_ASSERT(false && "TODO: Unknown GPU"); |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); |
|
#endif |
|
break; |
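        // The remaining quant labels below all fall through and select the Q6_K mul-mv
        // kernel; in practice only GGML_TYPE_Q6_K is expected to reach this point, the
        // other types are asserted on or filtered out elsewhere.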
|
case GGML_TYPE_Q4_1: |
|
case GGML_TYPE_Q8_0: |
|
case GGML_TYPE_Q2_K: |
|
case GGML_TYPE_Q3_K: |
|
case GGML_TYPE_Q4_K: |
|
case GGML_TYPE_Q5_K: |
|
case GGML_TYPE_Q6_K: |
|
kernel = backend_ctx->kernel_mul_mv_q6_K_f32; |
|
|
|
if (backend_ctx->gpu_family == INTEL) { |
|
nth0 = 2; |
|
nth1 = 16; |
|
} else if (backend_ctx->gpu_family == ADRENO) { |
|
nth0 = 2; |
|
nth1 = 64; |
|
} else { |
|
GGML_ASSERT(false && "TODO: Unknown GPU"); |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &r2)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &r3)); |
|
break; |
|
default: |
|
GGML_ASSERT(false && "not implemented"); |
|
} |
|
|
|
if (src0t == GGML_TYPE_Q4_0 || |
|
src0t == GGML_TYPE_Q4_1 || |
|
src0t == GGML_TYPE_Q8_0 || |
|
src0t == GGML_TYPE_Q2_K) { |
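        // Each work-group of nth0 x nth1 threads produces ndst output rows, so the row
        // count ne01 is rounded up to a multiple of ndst when sizing the launch.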
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t global_work_size[] = {(size_t)(ne01 + ndst-1)/ndst*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; |
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} else if (src0t == GGML_TYPE_Q4_K) { |
|
GGML_ASSERT(false && "not implemented"); |
|
} else if (src0t == GGML_TYPE_Q3_K) { |
|
GGML_ASSERT(false && "not implemented"); |
|
} else if (src0t == GGML_TYPE_Q5_K) { |
|
GGML_ASSERT(false && "not implemented"); |
|
} else if (src0t == GGML_TYPE_Q6_K) { |
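        // The Q6_K kernel computes two output rows per work-group, hence the (ne01 + 1)/2 rounding.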
|
size_t global_work_size[] = {(size_t)(ne01+1)/2*nth0, (size_t)ne11*nth1, (size_t)ne12*ne13}; |
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} else { |
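        // Remaining (F32/F16) mul-mat-vec path: one work-group per src0 row, with nrows
        // rows of src1 handled per work-group along the second dimension.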
|
int64_t ny = (ne11 + nrows - 1)/nrows; |
|
|
|
size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; |
|
size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
} |
|
|
|
static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
GGML_UNUSED(src1); |
|
|
|
GGML_ASSERT(ggml_is_contiguous(src0)); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
float scale; |
|
memcpy(&scale, dst->op_params, sizeof(scale)); |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
cl_kernel kernel = backend_ctx->kernel_scale; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(float), &scale)); |
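    // The scale kernel operates on float4 vectors, so one work-item is launched per
    // 4 elements; the contiguous tensor's element count is assumed to be a multiple of 4.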
|
|
|
int n = ggml_nelements(dst)/4; |
|
|
|
size_t global_work_size[] = {(size_t)n, 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
static void ggml_cl_cpy(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(src1); |
|
GGML_ASSERT(src1->extra); |
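    // For GGML_OP_CPY, src1 is the destination view and dst merely aliases its data,
    // so only src0 (source) and src1 (destination) are used below.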
|
|
|
|
|
|
|
UNUSED(dst); |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const int ne01 = src0 ? src0->ne[1] : 0; |
|
const int ne02 = src0 ? src0->ne[2] : 0; |
|
const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
|
const cl_ulong nb00 = src0 ? src0->nb[0] : 0; |
|
const cl_ulong nb01 = src0 ? src0->nb[1] : 0; |
|
const cl_ulong nb02 = src0 ? src0->nb[2] : 0; |
|
const cl_ulong nb03 = src0 ? src0->nb[3] : 0; |
|
|
|
const int ne10 = src1 ? src1->ne[0] : 0; |
|
const int ne11 = src1 ? src1->ne[1] : 0; |
|
const int ne12 = src1 ? src1->ne[2] : 0; |
|
const int ne13 = src1 ? src1->ne[3] : 0; |
|
|
|
const cl_ulong nb10 = src1 ? src1->nb[0] : 0; |
|
const cl_ulong nb11 = src1 ? src1->nb[1] : 0; |
|
const cl_ulong nb12 = src1 ? src1->nb[2] : 0; |
|
const cl_ulong nb13 = src1 ? src1->nb[3] : 0; |
|
|
|
const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; |
|
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offset1 = extra1->offset + src1->view_offs; |
|
|
|
cl_kernel kernel; |
|
|
|
switch (src0t) { |
|
case GGML_TYPE_F32: |
|
switch (src1t) { |
|
case GGML_TYPE_F16: |
|
kernel = backend_ctx->kernel_cpy_f32_f16; |
|
break; |
|
case GGML_TYPE_F32: |
|
kernel = backend_ctx->kernel_cpy_f32_f32; |
|
break; |
|
default: |
|
GGML_ASSERT(false && "not implemented"); |
|
} |
|
break; |
|
case GGML_TYPE_F16: |
|
switch (src1t) { |
|
case GGML_TYPE_F16: |
|
kernel = backend_ctx->kernel_cpy_f16_f16; |
|
break; |
|
case GGML_TYPE_F32: |
|
kernel = backend_ctx->kernel_cpy_f16_f32; |
|
break; |
|
default: |
|
GGML_ASSERT(false && "not implemented"); |
|
} |
|
break; |
|
default: |
|
GGML_ASSERT(false && "not implemented"); |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(int), &ne10)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne11)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne12)); |
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne13)); |
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb10)); |
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb11)); |
|
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb12)); |
|
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(cl_ulong), &nb13)); |
|
|
|
const int nth = MIN(64, ne00); |
|
|
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; |
|
size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, src1); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
static void ggml_cl_dup(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
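    // DUP and CONT are implemented as a plain copy into dst, so reuse ggml_cl_cpy
    // with dst passed in the destination (src1) slot.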
|
ggml_cl_cpy(backend, src0, dst, nullptr); |
|
UNUSED(src1); |
|
} |
|
|
|
static void ggml_cl_diag_mask_inf(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
UNUSED(src1); |
|
|
|
int n_past = ((int32_t *)(dst->op_params))[0]; |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const int ne01 = src0 ? src0->ne[1] : 0; |
|
const int ne02 = src0 ? src0->ne[2] : 0; |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
cl_kernel kernel; |
|
|
|
if (ne00%8 == 0) { |
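        // Vectorized variant: each work-item masks 8 consecutive elements.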
|
kernel = backend_ctx->kernel_diag_mask_inf_8; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); |
|
|
|
size_t global_work_size[] = {(size_t)ne00*ne01*ne02/8, 1, 1}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} else { |
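        // Generic variant: one work-item per element.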
|
kernel = backend_ctx->kernel_diag_mask_inf; |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &n_past)); |
|
|
|
size_t global_work_size[] = {(size_t)ne00, (size_t)ne01, (size_t)ne02}; |
|
size_t local_work_size[] = {64, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
} |
|
|
|
static void ggml_cl_soft_max(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
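    // src1 is the optional mask tensor; when it is absent, src0's buffer and offset are
    // passed in its place as a placeholder (see the kernel argument setup below).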
|
|
|
|
|
|
|
|
|
|
|
if (src1) { |
|
GGML_ASSERT(src1); |
|
GGML_ASSERT(src1->extra); |
|
} |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
ggml_tensor_extra_cl * extra1 = src1 ? (ggml_tensor_extra_cl *)src1->extra : nullptr; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
cl_ulong offset1 = extra1 ? extra1->offset + src1->view_offs : offset0; |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const int ne01 = src0 ? src0->ne[1] : 0; |
|
const int ne02 = src0 ? src0->ne[2] : 0; |
|
const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
|
float scale, max_bias; |
|
memcpy(&scale, dst->op_params + 0, sizeof(float)); |
|
memcpy(&max_bias, dst->op_params + 1, sizeof(float)); |
|
|
|
const int nrows_x = ggml_nrows(src0); |
|
const int nrows_y = src0->ne[1]; |
|
|
|
const int n_head = nrows_x/nrows_y; |
|
const int n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); |
|
|
|
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); |
|
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); |
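    // ALiBi slope bases: when max_bias > 0 the kernel uses m0, m1 and n_head_log2 to
    // derive a per-head slope for the mask bias.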
|
|
|
|
|
|
|
int nth = MIN(32, ne00); |
|
|
|
if (backend_ctx->gpu_family == INTEL) { |
|
|
|
nth = MIN(32, ne00); |
|
} |
|
else if (backend_ctx->gpu_family == ADRENO) { |
|
nth = 64; |
|
} else { |
|
GGML_ASSERT(false && "TODO: Unknown GPU"); |
|
} |
|
|
|
cl_kernel kernel; |
|
|
|
if (ne00%4 == 0) { |
|
kernel = backend_ctx->kernel_soft_max_4; |
|
} else { |
|
kernel = backend_ctx->kernel_soft_max; |
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), extra1 ? &extra1->data_device : &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(float), &scale)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(float), &max_bias)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &m0)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &m1)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &n_head_log2)); |
|
|
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; |
|
size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
static void ggml_cl_rope(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { |
|
GGML_ASSERT(src0); |
|
GGML_ASSERT(src0->extra); |
|
GGML_ASSERT(src1); |
|
GGML_ASSERT(src1->extra); |
|
GGML_ASSERT(dst); |
|
GGML_ASSERT(dst->extra); |
|
|
|
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; |
|
cl_command_queue queue = backend_ctx->queue; |
|
|
|
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra; |
|
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; |
|
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; |
|
|
|
cl_ulong offset0 = extra0->offset + src0->view_offs; |
|
cl_ulong offset1 = extra1->offset + src1->view_offs; |
|
cl_ulong offsetd = extrad->offset + dst->view_offs; |
|
|
|
ggml_tensor * src2 = dst->src[2]; |
|
ggml_tensor_extra_cl * extra2 = src2 ? (ggml_tensor_extra_cl *)src2->extra : nullptr; |
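    // src2, when present, supplies optional frequency factors; otherwise src0's buffer
    // is passed as a placeholder (see offset2 and kernel argument 4 below).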
|
|
|
cl_ulong offset2 = extra2 ? extra2->offset + src2->view_offs : offset0; |
|
|
|
const int ne00 = src0 ? src0->ne[0] : 0; |
|
const int ne01 = src0 ? src0->ne[1] : 0; |
|
const int ne02 = src0 ? src0->ne[2] : 0; |
|
const int ne03 = src0 ? src0->ne[3] : 0; |
|
|
|
    // strides are passed to the kernel as cl_ulong
    const cl_ulong nb00 = src0 ? src0->nb[0] : 0;

    const cl_ulong nb01 = src0 ? src0->nb[1] : 0;

    const cl_ulong nb02 = src0 ? src0->nb[2] : 0;

    const cl_ulong nb03 = src0 ? src0->nb[3] : 0;
|
|
|
const int ne10 = src1 ? src1->ne[0] : 0; |
|
const int ne11 = src1 ? src1->ne[1] : 0; UNUSED(ne11); |
|
const int ne12 = src1 ? src1->ne[2] : 0; UNUSED(ne12); |
|
const int ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13); |
|
|
|
const int ne0 = dst ? dst->ne[0] : 0; |
|
const int ne1 = dst ? dst->ne[1] : 0; |
|
const int ne2 = dst ? dst->ne[2] : 0; |
|
const int ne3 = dst ? dst->ne[3] : 0; |
|
|
|
    const cl_ulong nb0 = dst ? dst->nb[0] : 0;

    const cl_ulong nb1 = dst ? dst->nb[1] : 0;

    const cl_ulong nb2 = dst ? dst->nb[2] : 0;

    const cl_ulong nb3 = dst ? dst->nb[3] : 0;
|
|
|
GGML_ASSERT(ne10 == ne02); |
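    // src1 holds one rotary position per entry along src0's third dimension.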
|
|
|
int nth = MIN(64, ne00); |
|
|
|
const int n_past = ((int *) dst->op_params)[0]; |
|
const int n_dims = ((int *) dst->op_params)[1]; |
|
const int mode = ((int *) dst->op_params)[2]; |
|
const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; |
|
|
|
float freq_base; |
|
float freq_scale; |
|
float ext_factor; |
|
float attn_factor; |
|
float beta_fast; |
|
float beta_slow; |
|
|
|
memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); |
|
memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); |
|
memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); |
|
memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); |
|
memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); |
|
memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); |
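    // The float RoPE parameters are packed into dst->op_params as 32-bit slots and are
    // bit-copied out with memcpy to avoid type-punning issues.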
|
|
|
const bool is_neox = mode & 2; |
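    // Bit 1 of mode selects the NeoX-style (rotate-half) RoPE; otherwise the normal
    // interleaved variant is used.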
|
|
|
cl_kernel kernel; |
|
|
|
if (!is_neox) { |
|
switch (src0->type) { |
|
case GGML_TYPE_F32: |
|
kernel = backend_ctx->kernel_rope_norm_f32; |
|
break; |
|
case GGML_TYPE_F16: |
|
kernel = backend_ctx->kernel_rope_norm_f16; |
|
break; |
|
default: |
|
GGML_ASSERT(false); |
|
        }
|
} else { |
|
switch (src0->type) { |
|
case GGML_TYPE_F32: |
|
kernel = backend_ctx->kernel_rope_neox_f32; |
|
break; |
|
case GGML_TYPE_F16: |
|
kernel = backend_ctx->kernel_rope_neox_f16; |
|
break; |
|
default: |
|
GGML_ASSERT(false); |
|
        }
|
} |
|
|
|
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0)); |
|
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1)); |
|
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), extra2 ? &extra2->data_device : &extra0->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offset2)); |
|
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_mem), &extrad->data_device)); |
|
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &offsetd)); |
|
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(int), &ne00)); |
|
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(int), &ne01)); |
|
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02)); |
|
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne03)); |
|
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb00)); |
|
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb01)); |
|
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb02)); |
|
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(cl_ulong), &nb03)); |
|
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(int), &ne0)); |
|
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(int), &ne1)); |
|
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne2)); |
|
CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne3)); |
|
CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb0)); |
|
CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong), &nb1)); |
|
CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong), &nb2)); |
|
CL_CHECK(clSetKernelArg(kernel, 23, sizeof(cl_ulong), &nb3)); |
|
CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &n_past)); |
|
CL_CHECK(clSetKernelArg(kernel, 25, sizeof(int), &n_dims)); |
|
CL_CHECK(clSetKernelArg(kernel, 26, sizeof(int), &n_ctx_orig)); |
|
CL_CHECK(clSetKernelArg(kernel, 27, sizeof(float), &freq_base)); |
|
CL_CHECK(clSetKernelArg(kernel, 28, sizeof(float), &freq_scale)); |
|
CL_CHECK(clSetKernelArg(kernel, 29, sizeof(float), &ext_factor)); |
|
CL_CHECK(clSetKernelArg(kernel, 30, sizeof(float), &attn_factor)); |
|
CL_CHECK(clSetKernelArg(kernel, 31, sizeof(float), &beta_fast)); |
|
CL_CHECK(clSetKernelArg(kernel, 32, sizeof(float), &beta_slow)); |
|
|
|
size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; |
|
size_t local_work_size[] = {(size_t)nth, 1, 1}; |
|
|
|
#ifdef GGML_OPENCL_PROFILING |
|
cl_event evt; |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); |
|
|
|
g_profiling_info.emplace_back(); |
|
populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst); |
|
#else |
|
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); |
|
#endif |
|
} |
|
|
|
|
|
|
|
|
|
|
|
typedef void (*ggml_cl_func_t)(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); |
|
|
|
bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) { |
|
ggml_cl_func_t func = nullptr; |
|
|
|
ggml_tensor * src0 = tensor->src[0]; |
|
ggml_tensor * src1 = tensor->src[1]; |
|
|
|
const bool any_on_device = tensor->extra |
|
|| (src0 != nullptr && src0->extra) |
|
|| (src1 != nullptr && src1->extra); |
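    // Map the op to its handler; return false when the op is not supported here or when
    // none of the involved tensors live in OpenCL device buffers.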
|
|
|
switch (tensor->op) { |
|
case GGML_OP_GET_ROWS: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_get_rows; |
|
break; |
|
case GGML_OP_CPY: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_cpy; |
|
break; |
|
case GGML_OP_DUP: |
|
case GGML_OP_CONT: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_dup; |
|
break; |
|
case GGML_OP_ADD: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
GGML_ASSERT(ggml_is_contiguous(src0)); |
|
GGML_ASSERT(ggml_is_contiguous(src1)); |
|
func = ggml_cl_add; |
|
break; |
|
case GGML_OP_MUL: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_mul; |
|
break; |
|
case GGML_OP_UNARY: |
|
switch (ggml_get_unary_op(tensor)) { |
|
case GGML_UNARY_OP_GELU: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_gelu; |
|
break; |
|
case GGML_UNARY_OP_SILU: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_silu; |
|
break; |
|
case GGML_UNARY_OP_RELU: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_relu; |
|
break; |
|
default: |
|
return false; |
|
} break; |
|
case GGML_OP_CLAMP: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_clamp; |
|
break; |
|
case GGML_OP_NORM: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_norm; |
|
break; |
|
case GGML_OP_RMS_NORM: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_rms_norm; |
|
break; |
|
case GGML_OP_MUL_MAT: |
|
if (!any_on_device && !ggml_cl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { |
|
return false; |
|
} |
|
func = ggml_cl_mul_mat; |
|
break; |
|
case GGML_OP_SCALE: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_scale; |
|
break; |
|
case GGML_OP_RESHAPE: |
|
case GGML_OP_VIEW: |
|
case GGML_OP_PERMUTE: |
|
case GGML_OP_TRANSPOSE: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_nop; |
|
break; |
|
case GGML_OP_DIAG_MASK_INF: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_diag_mask_inf; |
|
break; |
|
case GGML_OP_SOFT_MAX: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_soft_max; |
|
break; |
|
case GGML_OP_ROPE: |
|
if (!any_on_device) { |
|
return false; |
|
} |
|
func = ggml_cl_rope; |
|
break; |
|
default: |
|
return false; |
|
} |
|
|
|
func(backend, tensor->src[0], tensor->src[1], tensor); |
|
return true; |
|
} |
|
|