#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <vector>
#include <cmath>
#include "fp16/Float16.h"

namespace rknpu2
{
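
// Fill `buffer` with `size` pseudo-random values drawn uniformly from
// [range[0], range[1]]; `range` is expected to hold exactly {min, max}, and
// the generator is re-seeded from the current time on every call.
// Illustrative (hypothetical) usage:
//   std::vector<float16> A(M * K);
//   generate_random_buffer(A.data(), A.size(), {-1.0f, 1.0f});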
template <typename T>
void generate_random_buffer(T *buffer, size_t size, std::vector<float> range)
{
  if (buffer == nullptr || size == 0)
  {
    return;
  }

  srand((unsigned)time(NULL));

  float min = range[0], max = range[1];
  for (size_t i = 0; i < size; ++i)
  {
    buffer[i] = static_cast<T>(min + (max - min) * (static_cast<double>(rand()) / RAND_MAX));
  }
}

template void generate_random_buffer(int8_t *buffer, size_t size, std::vector<float> range);
template void generate_random_buffer(float16 *buffer, size_t size, std::vector<float> range);
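
// Re-tile matrix A from the normal row-major M x K layout into the blocked
// "perf" layout: K is split into ceil(K / subK) panels of width subK, and
// panel i stores the subK elements of every row contiguously, i.e.
// dst[i * M * subK + m * subK + j] = src[m * K + i * subK + j], with
// positions past K zero-padded. For example, M = 2, K = 4, subK = 2 maps
// [a00 a01 a02 a03; a10 a11 a12 a13] to a00 a01 a10 a11 | a02 a03 a12 a13.
// In the int4 case two consecutive destination elements share one byte
// (even index in the low nibble, odd index in the high nibble), so dst is
// a buffer of packed nibbles.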
template <typename Ti, typename To>
void norm_layout_to_perf_layout(Ti *src, To *dst, int32_t M, int32_t K, int32_t subK, bool isInt4Type)
{
  int outter_size = (int)std::ceil(K * 1.0f / subK);
  for (int i = 0; i < outter_size; i++)
  {
    for (int m = 0; m < M; m++)
    {
      for (int j = 0; j < subK; j++)
      {
        int ki = i * subK + j;
        if (isInt4Type)
        {
          int input_index  = m * K + ki;
          int output_index = i * M * subK + m * subK + j;
          // Only read the source for in-range columns; the tail of the last
          // panel is zero-padded (this also avoids reading past the end of src).
          int8_t int4 = 0;
          if (ki < K)
          {
            int4 = src[input_index] & 0xf;
          }
          if (output_index % 2 == 0)
          {
            dst[output_index / 2] = int4;
          }
          else
          {
            int8_t temp   = dst[output_index / 2];
            int8_t result = temp | (int4 << 4);
            dst[output_index / 2] = result;
          }
        }
        else
        {
          if (ki >= K)
          {
            dst[i * M * subK + m * subK + j] = 0;
          }
          else
          {
            dst[i * M * subK + m * subK + j] = src[m * K + ki];
          }
        }
      }
    }
  }
}

template void norm_layout_to_perf_layout<int8_t, int8_t>(int8_t *src, int8_t *dst, int32_t M, int32_t K, int32_t subK,
                                                         bool isInt4Type);
template void norm_layout_to_perf_layout<float16, float16>(float16 *src, float16 *dst, int32_t M, int32_t K,
                                                           int32_t subK, bool isInt4Type);
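
// Re-tile matrix B from the normal row-major K x N layout into the blocked
// "native" layout: B is cut into ceil(N / subN) x ceil(K / subK) tiles of
// subN x subK elements, written tile by tile (N-blocks outer, K-blocks
// inner), each tile stored n-major:
// dst[((i * K_remain + j) * subN + n) * subK + k] = src[(j * subK + k) * N + i * subN + n],
// with out-of-range positions zero-padded. In the int4 case two consecutive
// elements share one byte, here with the even index in the high nibble and
// the odd index in the low nibble (the opposite order to the perf layout above).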
template <typename Ti, typename To>
void norm_layout_to_native_layout(Ti *src, To *dst, int32_t K, int32_t N, int32_t subN, int32_t subK, bool isInt4Type)
{
  int N_remain = (int)std::ceil(N * 1.0f / subN);
  int K_remain = (int)std::ceil(K * 1.0f / subK);
  for (int i = 0; i < N_remain; i++)
  {
    for (int j = 0; j < K_remain; j++)
    {
      for (int n = 0; n < subN; n++)
      {
        int ni = i * subN + n;
        for (int k = 0; k < subK; k++)
        {
          int ki = j * subK + k;
          if (isInt4Type)
          {
            int input_index  = ki * N + ni;
            int output_index = i * (K_remain * subN * subK) + j * (subN * subK) + n * subK + k;
            // Only read the source for in-range (k, n) positions; padding
            // elements are zero (this also avoids reading past the end of src).
            int8_t int4 = 0;
            if (ki < K && ni < N)
            {
              int4 = src[input_index] & 0xf;
            }
            if (output_index % 2 == 0)
            {
              dst[output_index / 2] = int4 << 4;
            }
            else
            {
              int8_t temp   = dst[output_index / 2];
              int8_t result = temp | int4;
              dst[output_index / 2] = result;
            }
          }
          else
          {
            if (ki < K && ni < N)
            {
              dst[((i * K_remain + j) * subN + n) * subK + k] = src[ki * N + ni];
            }
            else
            {
              dst[((i * K_remain + j) * subN + n) * subK + k] = 0;
            }
          }
        }
      }
    }
  }
}

template void norm_layout_to_native_layout<int8_t, int8_t>(int8_t *src, int8_t *dst, int32_t K, int32_t N, int32_t subN,
                                                           int32_t subK, bool isInt4Type);
template void norm_layout_to_native_layout<float16, float16>(float16 *src, float16 *dst, int32_t K, int32_t N,
                                                             int32_t subN, int32_t subK, bool isInt4Type);
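
// Inverse re-tiling for the result matrix: gather the subK-wide panels of the
// blocked layout back into a row-major M x K matrix, i.e.
// dst[m * K + i * subK + j] = src[i * M * subK + m * subK + j] for every
// column index i * subK + j < K; the zero padding of the last panel is
// simply skipped.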
template <typename Ti, typename To>
void perf_layout_to_norm_layout(Ti *src, To *dst, int32_t M, int32_t K, int32_t K_remain, int32_t subK)
{
  for (int i = 0; i < K_remain; i++)
  {
    for (int j = 0; j < subK; j++)
    {
      for (int m = 0; m < M; m++)
      {
        int ki = i * subK + j;
        if (ki < K)
        {
          dst[m * K + ki] = src[i * M * subK + m * subK + j];
        }
      }
    }
  }
}

template void perf_layout_to_norm_layout<int8_t, int8_t>(int8_t *src, int8_t *dst, int32_t M, int32_t K,
                                                         int32_t K_remain, int32_t subK);
template void perf_layout_to_norm_layout<int16_t, int16_t>(int16_t *src, int16_t *dst, int32_t M, int32_t K,
                                                           int32_t K_remain, int32_t subK);
template void perf_layout_to_norm_layout<int32_t, int32_t>(int32_t *src, int32_t *dst, int32_t M, int32_t K,
                                                           int32_t K_remain, int32_t subK);
template void perf_layout_to_norm_layout<float, float>(float *src, float *dst, int32_t M, int32_t K, int32_t K_remain,
                                                       int32_t subK);
template void perf_layout_to_norm_layout<float16, float16>(float16 *src, float16 *dst, int32_t M, int32_t K,
                                                           int32_t K_remain, int32_t subK);
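
// Element-wise comparison: returns true only if both vectors have the same
// length and every pair of elements differs by at most eps. For example,
// arraysEqual<float>({1.0f, 2.0f}, {1.0f, 2.0001f}, 1e-3f) returns true.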
template <typename T>
bool arraysEqual(const std::vector<T> &arr1, const std::vector<T> &arr2, float eps)
{
  if (arr1.size() != arr2.size())
  {
    return false;
  }

  for (size_t i = 0; i < arr1.size(); ++i)
  {
    if (std::abs(arr1[i] - arr2[i]) > eps)
    {
      return false;
    }
  }

  return true;
}

template bool arraysEqual<float>(const std::vector<float> &arr1, const std::vector<float> &arr2, float eps);
template bool arraysEqual<int32_t>(const std::vector<int32_t> &arr1, const std::vector<int32_t> &arr2, float eps);
template bool arraysEqual<int16_t>(const std::vector<int16_t> &arr1, const std::vector<int16_t> &arr2, float eps);
template bool arraysEqual<int8_t>(const std::vector<int8_t> &arr1, const std::vector<int8_t> &arr2, float eps);
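
// Cosine-similarity check: returns true only if the vectors have the same
// length, neither has zero norm, and
// dot(arr1, arr2) / (|arr1| * |arr2|) >= eps. Here eps is a similarity
// threshold close to 1 (e.g. 0.999), not an error bound. The loops carry
// OpenMP reduction pragmas, so they parallelize when built with OpenMP and
// run serially otherwise.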
template <typename T>
bool arraysCosineSimilarity(const std::vector<T> &arr1, const std::vector<T> &arr2, float eps)
{
  if (arr1.size() != arr2.size())
  {
    return false;
  }

  // The reduction variables must be declared before the OpenMP pragma, and
  // each pragma must immediately precede the loop it parallelizes. Accumulate
  // in double to avoid integer overflow for the integer instantiations.
  double dotProduct = 0.0;
#pragma omp parallel for reduction(+ : dotProduct)
  for (size_t i = 0; i < arr1.size(); ++i)
  {
    dotProduct += static_cast<double>(arr1[i]) * static_cast<double>(arr2[i]);
  }

  double normA = 0.0, normB = 0.0;
#pragma omp parallel for reduction(+ : normA, normB)
  for (size_t i = 0; i < arr1.size(); ++i)
  {
    normA += std::pow(arr1[i], 2);
    normB += std::pow(arr2[i], 2);
  }

  if (normA == 0.0 || normB == 0.0)
  {
    return false;
  }

  if ((dotProduct / (std::sqrt(normA) * std::sqrt(normB))) < eps)
  {
    return false;
  }

  return true;
}

template bool arraysCosineSimilarity<float>(const std::vector<float> &arr1, const std::vector<float> &arr2, float eps);
template bool arraysCosineSimilarity<int32_t>(const std::vector<int32_t> &arr1, const std::vector<int32_t> &arr2,
                                              float eps);
template bool arraysCosineSimilarity<int16_t>(const std::vector<int16_t> &arr1, const std::vector<int16_t> &arr2,
                                              float eps);
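
// Plain transpose of a row-major K x N matrix into a row-major N x K matrix
// (output[n * K + k] = input[k * N + n]), e.g. to prepare B^T for a
// transposed-B matmul path. Illustrative (hypothetical) usage:
//   std::vector<float16> B(K * N), BT(N * K);
//   transposeB(B.data(), BT.data(), K, N);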
template <typename T>
void transposeB(const T *input, T *output, int32_t K, int32_t N)
{
  for (int32_t k = 0; k < K; ++k)
  {
    for (int32_t n = 0; n < N; ++n)
    {
      output[n * K + k] = input[k * N + n];
    }
  }
}

template void transposeB<int8_t>(const int8_t *input, int8_t *output, int32_t K, int32_t N);
template void transposeB<float16>(const float16 *input, float16 *output, int32_t K, int32_t N);
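
// Transpose for int4 data packed two values per byte (even linear index in
// the low nibble): element (k, n) of the K x N input becomes element (n, k)
// of the N x K output, with each nibble re-packed at its new position. The
// output bytes are built with read-modify-write updates, so a
// zero-initialized output buffer is the safest starting point.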
void transpose4bit(const int8_t *input, int8_t *output, int32_t K, int32_t N)
{
  for (int32_t k = 0; k < K; ++k)
  {
    for (int32_t n = 0; n < N; ++n)
    {
      int32_t input_idx     = (k * N + n) / 2;
      int32_t input_offset  = (k * N + n) % 2;
      int32_t output_idx    = (n * K + k) / 2;
      int32_t output_offset = (n * K + k) % 2;

      uint8_t value = (input[input_idx] >> (4 * input_offset)) & 0xF;

      if (output_offset == 0)
      {
        output[output_idx] = (output[output_idx] & 0xF0) | value;
      }
      else
      {
        output[output_idx] = (output[output_idx] & 0x0F) | (value << 4);
      }
    }
  }
}

} // namespace rknpu2