#include #include #include #include #include #include #include "fp16/Float16.h" namespace rknpu2 { /** * @brief generate random buffer * */ template void generate_random_buffer(T *buffer, size_t size, std::vector range) { if (buffer == nullptr || size == 0) { return; } // 设置随机种子 srand((unsigned)time(NULL)); float min = range[0], max = range[1]; for (size_t i = 0; i < size; ++i) { buffer[i] = static_cast(min + (max - min) * (static_cast(rand()) / RAND_MAX)); } } template void generate_random_buffer(int8_t *buffer, size_t size, std::vector range); template void generate_random_buffer(float16 *buffer, size_t size, std::vector range); /** * @brief convert norm layout to perf layout * norm layout: [M,K] * perf layout: [K/subK, M, subK] */ template void norm_layout_to_perf_layout(Ti *src, To *dst, int32_t M, int32_t K, int32_t subK, bool isInt4Type) { int outter_size = (int)std::ceil(K * 1.0f / subK); for (int i = 0; i < outter_size; i++) { for (int m = 0; m < M; m++) { for (int j = 0; j < subK; j++) { int ki = i * subK + j; if (isInt4Type) { int input_index = m * K + ki; int output_index = i * M * subK + m * subK + j; int8_t int4 = src[input_index]; if (ki >= K) { int4 = 0; } else { int4 = int4 & 0xf; } if (output_index % 2 == 0) { dst[output_index / 2] = int4; } else { int8_t temp = dst[output_index / 2]; int8_t result = temp | (int4 << 4); dst[output_index / 2] = result; } } else { if (ki >= K) { dst[i * M * subK + m * subK + j] = 0; } else { dst[i * M * subK + m * subK + j] = src[m * K + ki]; } } } } } } template void norm_layout_to_perf_layout(int8_t *src, int8_t *dst, int32_t M, int32_t K, int32_t subK, bool isInt4Type); template void norm_layout_to_perf_layout(float16 *src, float16 *dst, int32_t M, int32_t K, int32_t subK, bool isInt4Type); /** * @brief convert norm layout to native layout * norm layout: [K,N] * native layout: [N1, K1, subN, subK] * */ template void norm_layout_to_native_layout(Ti *src, To *dst, int32_t K, int32_t N, int32_t subN, int32_t subK, bool isInt4Type) { int N_remain = (int)std::ceil(N * 1.0f / subN); int K_remain = (int)std::ceil(K * 1.0f / subK); for (int i = 0; i < N_remain; i++) { for (int j = 0; j < K_remain; j++) { for (int n = 0; n < subN; n++) { int ni = i * subN + n; for (int k = 0; k < subK; k++) { int ki = j * subK + k; if (isInt4Type) { int input_index = ki * N + ni; int output_index = i * (K_remain * subN * subK) + j * (subN * subK) + n * subK + k; int8_t int4 = src[input_index]; if (ki < K && ni < N) { int4 = int4 & 0xf; } else { int4 = 0; } if (output_index % 2 == 0) { dst[output_index / 2] = int4 << 4; } else { int8_t temp = dst[output_index / 2]; int8_t result = temp | int4; dst[output_index / 2] = result; } } else { if (ki < K && ni < N) { dst[((i * K_remain + j) * subN + n) * subK + k] = src[ki * N + ni]; } else { dst[((i * K_remain + j) * subN + n) * subK + k] = 0; } } } } } } } template void norm_layout_to_native_layout(int8_t *src, int8_t *dst, int32_t K, int32_t N, int32_t subN, int32_t subK, bool isInt4Type); template void norm_layout_to_native_layout(float16 *src, float16 *dst, int32_t K, int32_t N, int32_t subN, int32_t subK, bool isInt4Type); /** * @brief convert perf to norm layout * perf layout: [K1, M, subK] * norm layout: [M,K] * */ template void perf_layout_to_norm_layout(Ti *src, To *dst, int32_t M, int32_t K, int32_t K_remain, int32_t subK) { for (int i = 0; i < K_remain; i++) { for (int j = 0; j < subK; j++) { for (int m = 0; m < M; m++) { int ki = i * subK + j; if (ki < K) { dst[m * K + ki] = src[i * M * subK + m * subK + j]; } } } } } template void perf_layout_to_norm_layout(int8_t *src, int8_t *dst, int32_t M, int32_t K, int32_t K_remain, int32_t subK); template void perf_layout_to_norm_layout(int16_t *src, int16_t *dst, int32_t M, int32_t K, int32_t K_remain, int32_t subK); template void perf_layout_to_norm_layout(int32_t *src, int32_t *dst, int32_t M, int32_t K, int32_t K_remain, int32_t subK); template void perf_layout_to_norm_layout(float *src, float *dst, int32_t M, int32_t K, int32_t K_remain, int32_t subK); template void perf_layout_to_norm_layout(float16 *src, float16 *dst, int32_t M, int32_t K, int32_t K_remain, int32_t subK); template bool arraysEqual(const std::vector &arr1, const std::vector &arr2, float eps) { if (arr1.size() != arr2.size()) { return false; } for (size_t i = 0; i < arr1.size(); ++i) { if (std::abs(arr1[i] - arr2[i]) > eps) { return false; } } return true; } template bool arraysEqual(const std::vector &arr1, const std::vector &arr2, float eps); template bool arraysEqual(const std::vector &arr1, const std::vector &arr2, float eps); template bool arraysEqual(const std::vector &arr1, const std::vector &arr2, float eps); template bool arraysEqual(const std::vector &arr1, const std::vector &arr2, float eps); template bool arraysCosineSimilarity(const std::vector &arr1, const std::vector &arr2, float eps) { if (arr1.size() != arr2.size()) { return false; } // 计算点积 #pragma omp parallel for reduction(+ : dotProduct) double dotProduct = 0.0; for (size_t i = 0; i < arr1.size(); ++i) { dotProduct += arr1[i] * arr2[i]; } // 计算向量范数 #pragma omp parallel for reduction(+ : normA, normB) double normA = 0.0, normB = 0.0; for (size_t i = 0; i < arr1.size(); ++i) { normA += std::pow(arr1[i], 2); normB += std::pow(arr2[i], 2); } // 避免除以零 if (normA == 0.0 || normB == 0.0) { return false; } if ((dotProduct / (std::sqrt(normA) * std::sqrt(normB))) < eps) { return false; } return true; } template bool arraysCosineSimilarity(const std::vector &arr1, const std::vector &arr2, float eps); template bool arraysCosineSimilarity(const std::vector &arr1, const std::vector &arr2, float eps); template bool arraysCosineSimilarity(const std::vector &arr1, const std::vector &arr2, float eps); // 转置模板函数 template void transposeB(const T *input, T *output, int32_t K, int32_t N) { for (int32_t k = 0; k < K; ++k) { for (int32_t n = 0; n < N; ++n) { output[n * K + k] = input[k * N + n]; } } } template void transposeB(const int8_t *input, int8_t *output, int32_t K, int32_t N); template void transposeB(const float16 *input, float16 *output, int32_t K, int32_t N); // 4bit数据类型的特殊处理函数 void transpose4bit(const int8_t *input, int8_t *output, int32_t K, int32_t N) { for (int32_t k = 0; k < K; ++k) { for (int32_t n = 0; n < N; ++n) { int32_t input_idx = (k * N + n) / 2; int32_t input_offset = (k * N + n) % 2; int32_t output_idx = (n * K + k) / 2; int32_t output_offset = (n * K + k) % 2; uint8_t value = (input[input_idx] >> (4 * input_offset)) & 0xF; if (output_offset == 0) { output[output_idx] = (output[output_idx] & 0xF0) | value; } else { output[output_idx] = (output[output_idx] & 0x0F) | (value << 4); } } } } } // namespace rknn