Spaces:
Sleeping
Sleeping
// GGML CPU internal header
//#include <stddef.h>
extern "C" { | |
// Compute parameters: the calling thread's position among its peers, the work
// buffer shared by all threads, and the thread pool.
struct ggml_compute_params {
    int ith; // thread index
    int nth; // number of threads

    // work buffer for all threads
    size_t wsize; // size of wdata (presumably in bytes — confirm against the allocator)
    void * wdata;

    struct ggml_threadpool * threadpool;
};
// MSVC does not define __FMA__ or __F16C__; both are implied when AVX2/AVX512 is enabled
// MSVC does not define __SSE3__ or __SSSE3__ either; SSE3/SSSE3 are present whenever AVX/AVX2/AVX512 is available
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
// Storage type for a 16-bit float.
// Exactly one definition may be active: the same name cannot be typedef'd to
// two different types in one translation unit.
//   - Arm NEON builds use the native __fp16 type
//   - everything else (x86, and MSVC, which is assumed not to provide __fp16)
//     carries the raw 16 bits in a uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_internal_t;
#else
typedef uint16_t ggml_fp16_internal_t;
#endif
// 32-bit ARM compatibility
// the following intrinsics are emulated below:
//   vaddlvq_s16
//   vpaddq_s16
//   vpaddq_s32
//   vaddvq_s32
//   vaddvq_f32
//   vmaxvq_f32
//   vcvtnq_s32_f32
//   vzip1_u8
//   vzip2_u8
// Widening horizontal add: returns the sum of all eight int16 lanes of v as an int32.
inline static int32_t vaddlvq_s16(int16x8_t v) {
    const int32x4_t pair_sums = vpaddlq_s16(v);         // 8 x s16 -> 4 x s32
    const int64x2_t quad_sums = vpaddlq_s32(pair_sums); // 4 x s32 -> 2 x s64

    // view each s64 as two s32 lanes and add lanes 0 and 2
    // NOTE(review): lanes 0 and 2 are the low words of the two s64 sums only on a
    // little-endian lane layout — confirm for big-endian targets
    const int32x4_t words = vreinterpretq_s32_s64(quad_sums);
    return vgetq_lane_s32(words, 0) + vgetq_lane_s32(words, 2);
}
// Pairwise add of two int16x8 vectors: the low four result lanes hold the
// adjacent-pair sums of a, the high four those of b.
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
    return vcombine_s16(
        vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
        vpadd_s16(vget_low_s16(b), vget_high_s16(b)));
}
// Pairwise add of two int32x4 vectors: the low two result lanes hold the
// adjacent-pair sums of a, the high two those of b.
inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
    return vcombine_s32(
        vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
        vpadd_s32(vget_low_s32(b), vget_high_s32(b)));
}
// Horizontal add: returns the sum of the four int32 lanes of v.
inline static int32_t vaddvq_s32(int32x4_t v) {
    const int32_t l0 = vgetq_lane_s32(v, 0);
    const int32_t l1 = vgetq_lane_s32(v, 1);
    const int32_t l2 = vgetq_lane_s32(v, 2);
    const int32_t l3 = vgetq_lane_s32(v, 3);
    return l0 + l1 + l2 + l3;
}
// Horizontal add: returns the sum of the four float lanes of v.
// Lanes are added left to right (0, 1, 2, 3); float addition is not
// associative, so the order is part of the result.
inline static float vaddvq_f32(float32x4_t v) {
    const float l0 = vgetq_lane_f32(v, 0);
    const float l1 = vgetq_lane_f32(v, 1);
    const float l2 = vgetq_lane_f32(v, 2);
    const float l3 = vgetq_lane_f32(v, 3);
    return l0 + l1 + l2 + l3;
}
// Horizontal max: returns the largest of the four float lanes of v.
inline static float vmaxvq_f32(float32x4_t v) {
    // pull the lanes out first so the MAX macro only ever sees plain locals
    const float l0 = vgetq_lane_f32(v, 0);
    const float l1 = vgetq_lane_f32(v, 1);
    const float l2 = vgetq_lane_f32(v, 2);
    const float l3 = vgetq_lane_f32(v, 3);

    const float max01 = MAX(l0, l1);
    const float max23 = MAX(l2, l3);
    return MAX(max01, max23);
}
// Converts each of the four float lanes of v to int32 after rounding with roundf().
// NOTE(review): roundf() rounds halfway cases away from zero; if the intrinsic
// this stands in for rounds ties to even, results differ on exact .5 inputs — confirm
inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
    const int32_t lanes[4] = {
        (int32_t) roundf(vgetq_lane_f32(v, 0)),
        (int32_t) roundf(vgetq_lane_f32(v, 1)),
        (int32_t) roundf(vgetq_lane_f32(v, 2)),
        (int32_t) roundf(vgetq_lane_f32(v, 3)),
    };
    return vld1q_s32(lanes);
}
// Interleaves the low halves of a and b: { a0, b0, a1, b1, a2, b2, a3, b3 }.
inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t out;
    for (int i = 0; i < 4; ++i) {
        out[2*i + 0] = a[i];
        out[2*i + 1] = b[i];
    }
    return out;
}
// Interleaves the high halves of a and b: { a4, b4, a5, b5, a6, b6, a7, b7 }.
inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t out;
    for (int i = 0; i < 4; ++i) {
        out[2*i + 0] = a[4 + i];
        out[2*i + 1] = b[4 + i];
    }
    return out;
}
// multi-vector loads emulated below (as ggml_-prefixed helpers):
//   vld1q_s16_x2
//   vld1q_u8_x2
//   vld1q_u8_x4
//   vld1q_s8_x2
//   vld1q_s8_x4
// TODO: double-check that these work correctly
typedef struct ggml_int16x8x2_t { | |
int16x8_t val[2]; | |
} ggml_int16x8x2_t; | |
inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { | |
ggml_int16x8x2_t res; | |
res.val[0] = vld1q_s16(ptr + 0); | |
res.val[1] = vld1q_s16(ptr + 8); | |
return res; | |
} | |
typedef struct ggml_uint8x16x2_t { | |
uint8x16_t val[2]; | |
} ggml_uint8x16x2_t; | |
inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { | |
ggml_uint8x16x2_t res; | |
res.val[0] = vld1q_u8(ptr + 0); | |
res.val[1] = vld1q_u8(ptr + 16); | |
return res; | |
} | |
typedef struct ggml_uint8x16x4_t { | |
uint8x16_t val[4]; | |
} ggml_uint8x16x4_t; | |
inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { | |
ggml_uint8x16x4_t res; | |
res.val[0] = vld1q_u8(ptr + 0); | |
res.val[1] = vld1q_u8(ptr + 16); | |
res.val[2] = vld1q_u8(ptr + 32); | |
res.val[3] = vld1q_u8(ptr + 48); | |
return res; | |
} | |
typedef struct ggml_int8x16x2_t { | |
int8x16_t val[2]; | |
} ggml_int8x16x2_t; | |
inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { | |
ggml_int8x16x2_t res; | |
res.val[0] = vld1q_s8(ptr + 0); | |
res.val[1] = vld1q_s8(ptr + 16); | |
return res; | |
} | |
typedef struct ggml_int8x16x4_t { | |
int8x16_t val[4]; | |
} ggml_int8x16x4_t; | |
inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { | |
ggml_int8x16x4_t res; | |
res.val[0] = vld1q_s8(ptr + 0); | |
res.val[1] = vld1q_s8(ptr + 16); | |
res.val[2] = vld1q_s8(ptr + 32); | |
res.val[3] = vld1q_s8(ptr + 48); | |
return res; | |
} | |
// Table lookup: lane i of the result is a[b[i]], with a as the 16-entry table
// and b as the per-lane indices.
// An index of 16 or more does not name a table entry. Such lanes are set to 0
// (the result the AArch64 table-lookup instruction gives for out-of-range
// indices) rather than subscripting past the end of the vector.
// NOTE: not tested
inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
    int8x16_t res;

    for (int i = 0; i < 16; ++i) {
        const uint8_t idx = b[i];
        res[i] = idx < 16 ? a[idx] : 0;
    }

    return res;
}
// Table lookup: lane i of the result is a[b[i]], with a as the 16-entry table
// and b as the per-lane indices.
// An index of 16 or more does not name a table entry. Such lanes are set to 0
// (the result the AArch64 table-lookup instruction gives for out-of-range
// indices) rather than subscripting past the end of the vector.
// NOTE: not tested
inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16_t res;

    for (int i = 0; i < 16; ++i) {
        const uint8_t idx = b[i];
        res[i] = idx < 16 ? a[idx] : 0;
    }

    return res;
}
// Accumulates the products a[i]*b[i] of all 16 int8 lane pairs into acc.
// Result lane k receives a[2k]*b[2k] + a[2k+1]*b[2k+1] from the low half plus
// a[8+2k]*b[8+2k] + a[9+2k]*b[9+2k] from the high half, so the sum over the
// four result lanes equals the sum over acc plus the full 16-element dot product.
// NOTE(review): this per-lane grouping looks different from a 4-adjacent-element
// dot-product instruction — confirm callers only rely on the horizontal total
inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    // widening s8 x s8 -> s16 products of the low and high halves
    const int16x8_t prod_lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t prod_hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));

    // adjacent pairs widened to s32, then both halves folded together
    const int32x4_t sum_lo = vpaddlq_s16(prod_lo);
    const int32x4_t sum_hi = vpaddlq_s16(prod_hi);

    return vaddq_s32(acc, vaddq_s32(sum_lo, sum_hi));
}
/* float type data load instructions */ | |
static __m128 __lsx_vreplfr2vr_s(const float val) { | |
v4f32 res = {val, val, val, val}; | |
return (__m128)res; | |
} | |
static __m256 __lasx_xvreplfr2vr_s(const float val) { | |
v8f32 res = {val, val, val, val, val, val, val, val}; | |
return (__m256)res; | |
} | |
// TODO: move to ggml-threading
// Prototype only; no definition is visible in this header.
void ggml_barrier(struct ggml_threadpool *tp);
} | |