#pragma once

// GGML internal header

#include "ggml.h"

#include <assert.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h> // memcpy
#include <math.h>   // fabsf

#ifdef __cplusplus
extern "C" {
#endif
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is a no-op
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#endif

// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#define __SSE3__
#define __SSSE3__
#endif
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)

// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>

typedef __fp16 ggml_fp16_internal_t;

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x)         ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    ggml_fp16_internal_t tmp;
    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
    return (float)tmp;
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    ggml_fp16_t res;
    ggml_fp16_internal_t tmp = f;
    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
    return res;
}
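
// Note (added commentary): memcpy is used for the bit reinterpretation instead
// of a pointer cast so the code does not violate C strict-aliasing rules;
// compilers turn the fixed-size copy into a plain register move.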
#else

typedef uint16_t ggml_fp16_internal_t;

#ifdef __POWER9_VECTOR__

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    register float f;
    register double d;
    __asm__(
        "mtfprd %0,%2\n"   /* move the fp16 bits from a GPR into an FPR */
        "xscvhpdp %0,%0\n" /* convert half precision to double precision */
        "frsp %1,%0\n" :   /* round to single precision */
        /* temp */ "=d"(d),
        /* out */  "=f"(f):
        /* in */   "r"(h));
    return f;
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    register double d;
    register ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n" /* convert to half precision */
        "mffprd %1,%0\n" : /* move the result bits back into a GPR */
        /* temp */ "=d"(d),
        /* out */  "=r"(r):
        /* in */   "f"(f));
    return r;
}
#else

// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
    union {
        uint32_t as_bits;
        float as_value;
    } fp32;
    fp32.as_bits = w;
    return fp32.as_value;
}

static inline uint32_t fp32_to_bits(float f) {
    union {
        float as_value;
        uint32_t as_bits;
    } fp32;
    fp32.as_value = f;
    return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t two_w = w + w;

    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
    const float exp_scale = 0x1.0p-112f;
#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
}
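
// Worked example (added commentary): for h = 0x3C00 (fp16 1.0), w = 0x3C000000
// and two_w = 0x78000000. Then (two_w >> 4) + exp_offset = 0x77800000, which is
// 2^112 as an fp32, and multiplying by exp_scale = 2^-112 cancels the difference
// between the fp32 and fp16 exponent biases (127 - 15 = 112), giving exactly 1.0f.
// Since two_w >= denormalized_cutoff, the normalized path is selected.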
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#endif // __POWER9_VECTOR__

#endif // defined(__ARM_NEON) && !defined(_MSC_VER)
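
// A minimal round-trip check of the scalar conversions (added commentary; the
// test harness below is illustrative and not part of the library), e.g.:
//
//   #include <stdio.h>
//
//   int main(void) {
//       const float samples[] = { 0.0f, 1.0f, -2.5f, 65504.0f }; // 65504 = largest finite fp16
//       for (int i = 0; i < 4; ++i) {
//           const ggml_fp16_t h = ggml_compute_fp32_to_fp16(samples[i]);
//           printf("%g -> 0x%04x -> %g\n", samples[i], (unsigned) h,
//                  ggml_compute_fp16_to_fp32(h));
//       }
//       return 0;
//   }
//
// All four samples are exactly representable in fp16, so each should survive
// the round trip unchanged.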
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}

#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
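
// Sketch of the table initialization (added commentary; the actual loop lives
// in ggml_init() in ggml.c, so this is an illustration rather than the
// authoritative code):
//
//   for (uint32_t i = 0; i < (1 << 16); ++i) {
//       ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32((ggml_fp16_t) i);
//   }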
#define GGML_HASHTABLE_FULL           ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)

struct ggml_hash_set ggml_hash_set_new(size_t size);

bool ggml_hash_contains(const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_FULL if the table is full, otherwise the current index of the key or the index where it should be inserted
size_t ggml_hash_find(const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_ALREADY_EXISTS if the key already exists, otherwise the insertion index; asserts if the table is full
size_t ggml_hash_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns the index of the key, inserting it if not already present; asserts if the table is full
size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tensor * key);
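
// Typical usage sketch (added commentary; `n_nodes` and `nodes` stand in for
// whatever collection of tensors is being de-duplicated):
//
//   struct ggml_hash_set visited = ggml_hash_set_new(2 * n_nodes);
//   for (int i = 0; i < n_nodes; ++i) {
//       if (ggml_hash_insert(visited, nodes[i]) == GGML_HASHTABLE_ALREADY_EXISTS) {
//           continue; // tensor seen before
//       }
//       // ... first visit of nodes[i] ...
//   }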
#ifdef __cplusplus
}
#endif