/**************************************************************************** | |
* | |
* Copyright (c) 2017 - 2018 by Rockchip Corp. All rights reserved. | |
* | |
* The material in this file is confidential and contains trade secrets | |
* of Rockchip Corporation. This is proprietary information owned by | |
* Rockchip Corporation. No part of this work may be disclosed, | |
* reproduced, copied, transmitted, or used in any way for any purpose, | |
* without the express written permission of Rockchip Corporation. | |
* | |
*****************************************************************************/ | |
extern "C" { | |
typedef rknn_context rknn_matmul_ctx; | |
typedef enum _rknn_matmul_quant_type | |
{ | |
RKNN_QUANT_TYPE_PER_LAYER_SYM = 0, | |
RKNN_QUANT_TYPE_PER_LAYER_ASYM = 1, | |
RKNN_QUANT_TYPE_PER_CHANNEL_SYM = 2, | |
RKNN_QUANT_TYPE_PER_CHANNEL_ASYM = 3, | |
RKNN_QUANT_TYPE_PER_GROUP_SYM = 4, | |
RKNN_QUANT_TYPE_PER_GROUP_ASYM = 5, | |
} rknn_matmul_quant_type; | |
typedef struct _rknn_quant_params | |
{ | |
char name[RKNN_MAX_NAME_LEN]; | |
// matmul tensor scale | |
float* scale; | |
int32_t scale_len; | |
// matmul tensor zero point | |
int32_t* zp; | |
int32_t zp_len; | |
} rknn_quant_params; | |
typedef enum _rknn_matmul_type | |
{ | |
RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT32 = 1, | |
RKNN_INT8_MM_INT8_TO_INT32 = 2, | |
RKNN_INT8_MM_INT8_TO_INT8 = 3, | |
RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT16 = 4, | |
RKNN_FLOAT16_MM_INT8_TO_FLOAT32 = 5, | |
RKNN_FLOAT16_MM_INT8_TO_FLOAT16 = 6, | |
RKNN_FLOAT16_MM_INT4_TO_FLOAT32 = 7, | |
RKNN_FLOAT16_MM_INT4_TO_FLOAT16 = 8, | |
RKNN_INT8_MM_INT8_TO_FLOAT32 = 9, | |
RKNN_INT4_MM_INT4_TO_INT16 = 10, | |
RKNN_INT8_MM_INT4_TO_INT32 = 11, | |
} rknn_matmul_type; | |
inline static const char* get_matmul_type_string(rknn_matmul_type type) | |
{ | |
switch (type) {
case RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT32:
return "RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT32";
case RKNN_INT8_MM_INT8_TO_INT32:
return "RKNN_INT8_MM_INT8_TO_INT32";
case RKNN_INT8_MM_INT8_TO_INT8:
return "RKNN_INT8_MM_INT8_TO_INT8";
case RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT16:
return "RKNN_FLOAT16_MM_FLOAT16_TO_FLOAT16";
case RKNN_FLOAT16_MM_INT8_TO_FLOAT32:
return "RKNN_FLOAT16_MM_INT8_TO_FLOAT32";
case RKNN_FLOAT16_MM_INT8_TO_FLOAT16:
return "RKNN_FLOAT16_MM_INT8_TO_FLOAT16";
case RKNN_FLOAT16_MM_INT4_TO_FLOAT32:
return "RKNN_FLOAT16_MM_INT4_TO_FLOAT32";
case RKNN_FLOAT16_MM_INT4_TO_FLOAT16:
return "RKNN_FLOAT16_MM_INT4_TO_FLOAT16";
case RKNN_INT8_MM_INT8_TO_FLOAT32:
return "RKNN_INT8_MM_INT8_TO_FLOAT32";
case RKNN_INT4_MM_INT4_TO_INT16:
return "RKNN_INT4_MM_INT4_TO_INT16";
case RKNN_INT8_MM_INT4_TO_INT32:
return "RKNN_INT8_MM_INT4_TO_INT32";
default:
return "UNKNOWN";
}
} | |
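/*
Illustrative usage sketch (not part of the API): selecting a matmul type and logging
it with the helper above. <stdio.h> is assumed for printf.

    rknn_matmul_type type = RKNN_INT8_MM_INT8_TO_INT32;
    printf("matmul type: %s\n", get_matmul_type_string(type));
*/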
typedef struct _rknn_matmul_tensor_attr | |
{ | |
char name[RKNN_MAX_NAME_LEN]; | |
// tensor shape, describing A(M, K), B(K, N) or C(M, N)
uint32_t n_dims; | |
uint32_t dims[RKNN_MAX_DIMS]; | |
// matmul tensor size | |
uint32_t size; | |
// matmul tensor data type, e.g. for RKNN_INT8_MM_INT8_TO_INT32:
// int8 : A, B
// int32: C
rknn_tensor_type type; | |
} rknn_matmul_tensor_attr; | |
typedef struct _rknn_matmul_io_attr | |
{ | |
// the attributes of A(M, K), B(K, N) and C(M, N)
rknn_matmul_tensor_attr A; | |
rknn_matmul_tensor_attr B; | |
rknn_matmul_tensor_attr C; | |
} rknn_matmul_io_attr; | |
/* | |
matmul dynamic shape struct | |
*/ | |
typedef struct _rknn_matmul_shape | |
{ | |
int32_t M; | |
int32_t K; | |
int32_t N; | |
} rknn_matmul_shape; | |
/* | |
the layout of matmul input/output tensor. | |
*/ | |
typedef enum | |
{ | |
RKNN_MM_LAYOUT_NORM = 0, | |
RKNN_MM_LAYOUT_NATIVE = 1, | |
RKNN_MM_LAYOUT_TP_NORM = 2, | |
} rknn_matmul_layout; | |
/* | |
matmul information struct | |
*/ | |
typedef struct rknn_matmul_info_t | |
{ | |
int32_t M; | |
int32_t K; // limit: RK3566/3568: int8 type must be aligned to 32 bytes, float16 type to 16 bytes;
// RK3562: int8 type must be aligned to 32 bytes, float16 type to 32 bytes;
// RK3588/3576: int8 type must be aligned to 32 bytes, float16 type to 32 bytes,
// int4 type to 32 bytes;
int32_t N; // limit: RK3566/3568: int8 type must be aligned to 16 bytes, float16 type to 8 bytes;
// RK3562: int8 type must be aligned to 16 bytes, float16 type to 8 bytes;
// RK3588/3576: int8 type must be aligned to 32 bytes, float16 type to 16 bytes,
// int4 type to 64 bytes;
// matmul data type | |
// int4: int4(A) x int4(B) -> int16(C) | |
// int8: int8(A) x int8(B) -> int32(C) | |
// float16: float16(A) x float16(B) -> float32(C) | |
rknn_matmul_type type; | |
// matmul native layout for B | |
// 0: normal layout | |
// 1: native layout | |
int16_t B_layout; | |
// matmul quant type for B | |
// A and C only support per layer | |
// 0: per layer | |
// 1: per channel | |
// 2: per group | |
int16_t B_quant_type; | |
// matmul native layout for A and C | |
// 0: normal layout | |
// 1: native layout | |
int16_t AC_layout; | |
// matmul quant type for A and C; only 0 (per layer) is supported
int16_t AC_quant_type; | |
// iommu domain id, each domain has 4GB of space | |
int32_t iommu_domain_id; | |
// group_size takes effect when B_quant_type is set to 2 (per group)
int16_t group_size; | |
// reserved field | |
int8_t reserved[34]; | |
} rknn_matmul_info; | |
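/*
Illustrative usage sketch (not part of the API): filling in rknn_matmul_info for an
int8 x int8 -> int32 matmul. The concrete M/K/N values are assumptions chosen to
satisfy the RK3588 alignment limits above; <string.h> is assumed for memset.

    rknn_matmul_info info;
    memset(&info, 0, sizeof(info));
    info.M             = 4;
    info.K             = 512;   // multiple of 32, per the int8 limit on RK3588
    info.N             = 1024;  // multiple of 32, per the int8 limit on RK3588
    info.type          = RKNN_INT8_MM_INT8_TO_INT32;
    info.B_layout      = 0;     // normal layout for B
    info.AC_layout     = 0;     // normal layout for A and C
    info.B_quant_type  = 0;     // per layer
    info.AC_quant_type = 0;     // per layer (only 0 is supported)
*/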
/* rknn_matmul_create | |
params: | |
rknn_matmul_ctx *ctx the handle of context. | |
rknn_matmul_info *info the matmul information.
rknn_matmul_io_attr *io_attr the attribute of the input/output tensors.
return: | |
int error code | |
*/ | |
int rknn_matmul_create(rknn_matmul_ctx* ctx, rknn_matmul_info* info, rknn_matmul_io_attr* io_attr); | |
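/*
Illustrative usage sketch (not part of the API), continuing the rknn_matmul_info
example above: creating a context and inspecting the tensor attributes reported
back in io_attr. <stdio.h> is assumed for printf.

    rknn_matmul_ctx ctx = 0;
    rknn_matmul_io_attr io_attr;
    memset(&io_attr, 0, sizeof(io_attr));
    int ret = rknn_matmul_create(&ctx, &info, &io_attr);
    if (ret < 0) {
      // handle error
    }
    printf("A size=%u, B size=%u, C size=%u\n",
           io_attr.A.size, io_attr.B.size, io_attr.C.size);
*/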
/* rknn_matmul_create_dynamic_shape | |
params: | |
rknn_matmul_ctx *ctx the handle of context. | |
rknn_matmul_info *info the matmul information.
int shape_num the number of supported matmul shapes.
rknn_matmul_shape dynamic_shapes[] the supported M,K,N shape struct array.
rknn_matmul_io_attr io_attrs[] the array of input/output tensor attributes, one per shape.
return: | |
int error code | |
*/ | |
/*
The original info.M, K, N values are ignored; the effective shapes come from dynamic_shapes[].
*/
int rknn_matmul_create_dynamic_shape(rknn_matmul_ctx* ctx, rknn_matmul_info* info, int shape_num, | |
rknn_matmul_shape dynamic_shapes[], rknn_matmul_io_attr io_attrs[]); | |
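/*
Illustrative usage sketch (not part of the API): registering three candidate shapes
that share K and N but differ in M. The shape values are assumptions for the example;
with info filled in as above, its M/K/N fields are ignored here.

    rknn_matmul_shape shapes[3] = {
      {1, 4096, 4096},
      {8, 4096, 4096},
      {32, 4096, 4096},
    };
    rknn_matmul_io_attr io_attrs[3];
    memset(io_attrs, 0, sizeof(io_attrs));
    int ret = rknn_matmul_create_dynamic_shape(&ctx, &info, 3, shapes, io_attrs);
*/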
/* rknn_matmul_set_io_mem | |
params: | |
rknn_matmul_ctx ctx the handle of context. | |
rknn_tensor_mem *mem the pointer of tensor memory information. | |
rknn_matmul_tensor_attr *attr the attribute of input or output tensor buffer. | |
return: | |
int error code. | |
formula: | |
C = A * B, | |
limit: | |
K max: K <= 10240
K limit: RK3566/3568: int8 type must be aligned to 32 bytes, float16 type to 16 bytes;
RK3562: int8 type must be aligned to 32 bytes, float16 type to 32 bytes;
RK3588/3576: int8 type must be aligned to 32 bytes, float16 type to 32 bytes,
int4 type to 32 bytes;
N limit: RK3566/3568: int8 type must be aligned to 16 bytes, float16 type to 8 bytes;
RK3562: int8 type must be aligned to 16 bytes, float16 type to 8 bytes;
RK3588/3576: int8 type must be aligned to 32 bytes, float16 type to 16 bytes,
int4 type to 64 bytes;
A shape: M x K | |
normal layout: (M, K) | |
[M1K1, M1K2, ..., M1Kk, | |
M2K1, M2K2, ..., M2Kk, | |
... | |
MmK1, MmK2, ..., MmKk] | |
for RK3566/3568: | |
int8: | |
native layout: (K / 8, M, 8) | |
[K1M1, K2M1, ..., K8M1, | |
K9M2, K10M2, ..., K16M2, | |
... | |
K(k-7)Mm, K(k-6)Mm, ..., KkMm] | |
float16: | |
native layout: (K / 4, M, 4) | |
[K1M1, K2M1, ..., K4M1, | |
K5M2, K6M2, ..., K8M2,
... | |
K(k-3)Mm, K(k-2)Mm, ..., KkMm] | |
for RK3562: | |
int8: | |
native layout: (K / 16, M, 16) | |
[K1M1, K2M1, ..., K16M1, | |
K17M2, K18M2, ..., K32M2, | |
... | |
K(k-15)Mm, K(k-14)Mm, ..., KkMm] | |
float16: | |
native layout: (K / 8, M, 8) | |
[K1M1, K2M1, ..., K8M1, | |
K9M2, K10M2, ..., K16M2, | |
... | |
K(k-7)Mm, K(k-6)Mm, ..., KkMm] | |
for RK3588/3576: | |
int4: | |
native layout: (K / 32, M, 32) | |
[K1M1, K2M1, ..., K32M1, | |
K33M2, K34M2, ..., K64M2,
... | |
K(k-31)Mm, K(k-30)Mm, ..., KkMm] | |
int8: | |
native layout: (K / 16, M, 16) | |
[K1M1, K2M1, ..., K16M1, | |
K17M2, K18M2, ..., K32M2, | |
... | |
K(k-15)Mm, K(k-14)Mm, ..., KkMm] | |
float16: | |
native layout: (K / 8, M, 8) | |
[K1M1, K2M1, ..., K8M1, | |
K9M2, K10M2, ..., K16M2, | |
... | |
K(k-7)Mm, K(k-6)Mm, ..., KkMm] | |
B shape: K x N | |
normal layout: (K, N) | |
[K1N1, K1N2, ..., K1Nn, | |
K2N1, K2N2, ..., K2Nn, | |
... | |
KkN1, KkN2, ..., KkNn] | |
for RK3566/3568: | |
int8: | |
native layout: (N / 16, K / 32, 16, 32) | |
[K1N1, K2N1, ..., K32N1, | |
K1N2, K2N2, ..., K32N2, | |
... | |
K1N16, K2N16, ..., K32N16, | |
K33N1, K34N1, ..., K64N1, | |
K33N2, K34N2, ..., K64N2, | |
... | |
K(k-31)N16, K(k-30)N16, ..., KkN16, | |
K1N17, K2N17, ..., K32N17, | |
K1N18, K2N18, ..., K32N18, | |
... | |
K(k-31)Nn, K(k-30)Nn, ..., KkNn] | |
float16: | |
native layout: (N / 8, K / 16, 8, 16) | |
[K1N1, K2N1, ..., K16N1, | |
K1N2, K2N2, ..., K16N2, | |
... | |
K1N8, K2N8, ..., K16N8, | |
K17N1, K18N1, ..., K32N1, | |
K17N2, K18N2, ..., K32N2, | |
... | |
K(k-15)N8, K(k-14)N8, ..., KkN8,
K1N9, K2N9, ..., K16N9, | |
K1N10, K2N10, ..., K16N10, | |
... | |
K(k-15)Nn, K(k-14)Nn, ..., KkNn] | |
for RK3562: | |
int8: | |
native layout: (N / 16, K / 32, 16, 32) | |
[K1N1, K2N1, ..., K32N1, | |
K1N2, K2N2, ..., K32N2, | |
... | |
K1N16, K2N16, ..., K32N16, | |
K33N1, K34N1, ..., K64N1, | |
K33N2, K34N2, ..., K64N2, | |
... | |
K(k-31)N16, K(k-30)N16, ..., KkN16, | |
K1N17, K2N17, ..., K32N17, | |
K1N18, K2N18, ..., K32N18, | |
... | |
K(k-31)Nn, K(k-30)Nn, ..., KkNn] | |
float16: | |
native layout: (N / 8, K / 32, 8, 32) | |
[K1N1, K2N1, ..., K32N1, | |
K1N2, K2N2, ..., K32N2, | |
... | |
K1N8, K2N8, ..., K32N8, | |
K33N1, K34N1, ..., K64N1, | |
K33N2, K34N2, ..., K64N2, | |
... | |
K(k-31)N8, K(k-30)N8, ..., KkN8, | |
K1N9, K2N9, ..., K32N9,
K1N10, K2N10, ..., K32N10,
... | |
K(k-31)Nn, K(k-30)Nn, ..., KkNn] | |
for RK3588: | |
when K > 8192, the B data will be split into T segments.
int T = (int)std::ceil(K / 8192.f);
For example: normal layout -> native layout
K = 20480, N = 4096, T = 3, the data will be split into 3 segments.
subN = rknn_matmul_io_attr.B.dims[2];
subK = rknn_matmul_io_attr.B.dims[3];
(K, N) = (20480, 4096):
  segments 0-1: (8192, 4096) -> (4096 / subN, 8192 / subK, subN, subK)
  segment  2:   (4096, 4096) -> (4096 / subN, 4096 / subK, subN, subK)
  T normal layouts           -> T native layouts
It is recommended to use the rknn_B_normal_layout_to_native_layout interface for direct data conversion. | |
for RK3576: | |
when K > 4096, the B data will be split into T segments.
int T = (int)std::ceil(K / 4096.f);
For example: normal layout -> native layout
K = 10240, N = 2048, T = 3, the data will be split into 3 segments.
subN = rknn_matmul_io_attr.B.dims[2];
subK = rknn_matmul_io_attr.B.dims[3];
(K, N) = (10240, 2048):
  segments 0-1: (4096, 2048) -> (2048 / subN, 4096 / subK, subN, subK)
  segment  2:   (2048, 2048) -> (2048 / subN, 2048 / subK, subN, subK)
  T normal layouts           -> T native layouts
It is recommended to use the rknn_B_normal_layout_to_native_layout interface for direct data conversion. | |
for RK3588/3576: | |
int4: | |
native layout: (N / 64, K / 32, 64, 32) | |
[K1N1, K2N1, ..., K32N1, | |
K1N2, K2N2, ..., K32N2, | |
... | |
K1N64, K2N64, ..., K32N64, | |
K33N1, K34N1, ..., K64N1, | |
K33N2, K34N2, ..., K64N2, | |
... | |
K(k-31)N64, K(k-30)N64, ..., KkN64, | |
K1N65, K2N65, ..., K32N65, | |
K1N66, K2N66, ..., K32N66, | |
... | |
K(k-31)Nn, K(k-30)Nn, ..., KkNn] | |
int8: | |
native layout: (N / 32, K / 32, 32, 32) | |
[K1N1, K2N1, ..., K32N1, | |
K1N2, K2N2, ..., K32N2, | |
... | |
K1N32, K2N32, ..., K32N32, | |
K33N1, K34N1, ..., K64N1, | |
K33N2, K34N2, ..., K64N2, | |
... | |
K(k-31)N32, K(k-30)N32, ..., KkN32, | |
K1N33, K2N33, ..., K32N33, | |
K1N34, K2N34, ..., K32N34, | |
... | |
K(k-31)Nn, K(k-30)Nn, ..., KkNn] | |
float16: | |
native layout: (N / 16, K / 32, 16, 32) | |
[K1N1, K2N1, ..., K32N1, | |
K1N2, K2N2, ..., K32N2, | |
... | |
K1N16, K2N16, ..., K32N16, | |
K33N1, K34N1, ..., K64N1, | |
K33N2, K34N2, ..., K64N2, | |
... | |
K(k-31)N16, K(k-30)N16, ..., KkN16, | |
K1N17, K2N17, ..., K32N17, | |
K1N18, K2N18, ..., K32N18, | |
... | |
K(k-31)Nn, K(k-30)Nn, ..., KkNn] | |
C shape: M x N | |
normal layout: (M, N) | |
[M1N1, M1N2, ..., M1Nn, | |
M2N1, M2N2, ..., M2Nn, | |
... | |
MmN1, MmN2, ..., MmNn] | |
native layout: (N / 4, M, 4) | |
[N1M1, N2M1, ..., N4M1, | |
N5M2, N6M2, ..., N8M2, | |
... | |
N(n-3)Mm, N(n-2)Mm, ..., NnMm] | |
for RK3588: | |
int4: | |
native layout: (N / 8, M, 8) | |
[N1M1, N2M1, ..., N8M1, | |
N9M2, N10M2, ..., N16M2, | |
... | |
N(n-7)Mm, N(n-6)Mm, ..., NnMm] | |
*/ | |
int rknn_matmul_set_io_mem(rknn_matmul_ctx ctx, rknn_tensor_mem* mem, rknn_matmul_tensor_attr* attr); | |
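/*
Illustrative usage sketch (not part of the API): allocating zero-copy tensor memory
with rknn_create_mem / rknn_destroy_mem (declared in rknn_api.h), binding it to
A/B/C and running one matmul. Filling the buffers with real data is elided.

    rknn_tensor_mem* A = rknn_create_mem(ctx, io_attr.A.size);
    rknn_tensor_mem* B = rknn_create_mem(ctx, io_attr.B.size);
    rknn_tensor_mem* C = rknn_create_mem(ctx, io_attr.C.size);
    // ... fill A->virt_addr and B->virt_addr in the layouts described above ...
    rknn_matmul_set_io_mem(ctx, A, &io_attr.A);
    rknn_matmul_set_io_mem(ctx, B, &io_attr.B);
    rknn_matmul_set_io_mem(ctx, C, &io_attr.C);
    rknn_matmul_run(ctx);   // blocking; C->virt_addr now holds A x B
    rknn_destroy_mem(ctx, A);
    rknn_destroy_mem(ctx, B);
    rknn_destroy_mem(ctx, C);
*/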
/* rknn_matmul_set_core_mask | |
set the rknn core mask. (Only RK3588 is currently supported.)
RKNN_NPU_CORE_AUTO: auto mode, default value | |
RKNN_NPU_CORE_0: core 0 mode | |
RKNN_NPU_CORE_1: core 1 mode | |
RKNN_NPU_CORE_2: core 2 mode | |
RKNN_NPU_CORE_0_1: combine core 0/1 mode | |
RKNN_NPU_CORE_0_1_2: combine core 0/1/2 mode | |
input: | |
rknn_matmul_ctx context the handle of context. | |
rknn_core_mask core_mask the core mask. | |
return: | |
int error code. | |
*/ | |
int rknn_matmul_set_core_mask(rknn_matmul_ctx context, rknn_core_mask core_mask); | |
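/*
Illustrative usage sketch (not part of the API): pinning the matmul to NPU core 0
on RK3588. RKNN_NPU_CORE_0 comes from rknn_api.h.

    rknn_matmul_set_core_mask(ctx, RKNN_NPU_CORE_0);
*/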
/* rknn_matmul_set_quant_params | |
set quant params. (Only matmul types RKNN_INT8_MM_INT8_TO_INT8 and RKNN_INT8_MM_INT8_TO_INT32 are supported.)
input: | |
rknn_matmul_ctx context the handle of context. | |
rknn_quant_params *params quant params.
return: | |
int error code. | |
*/ | |
int rknn_matmul_set_quant_params(rknn_matmul_ctx context, rknn_quant_params* params); | |
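/*
Illustrative usage sketch (not part of the API): setting a single per-layer scale and
zero point for tensor B. The tensor name "B" and the values are assumptions for the
example; <string.h> is assumed for memset/strncpy.

    float   scale = 0.05f;
    int32_t zp    = 0;
    rknn_quant_params params;
    memset(&params, 0, sizeof(params));
    strncpy(params.name, "B", RKNN_MAX_NAME_LEN - 1);
    params.scale     = &scale;
    params.scale_len = 1;
    params.zp        = &zp;
    params.zp_len    = 1;
    rknn_matmul_set_quant_params(ctx, &params);
*/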
/* rknn_matmul_get_quant_params | |
get per-channel quant params. (Only matmul type RKNN_INT8_MM_INT8_TO_INT32 is supported.)
input: | |
rknn_matmul_ctx context the handle of context. | |
rknn_quant_params *params quant params.
float *scale the scale returned to the user.
return: | |
int error code. | |
*/ | |
int rknn_matmul_get_quant_params(rknn_matmul_ctx ctx, rknn_quant_params* params, float* scale); | |
/* rknn_matmul_set_dynamic_shape | |
set the matmul input/output shape. After calling rknn_matmul_set_dynamic_shape, matmul runs under the given
shape; only M may change dynamically at present.
input: | |
rknn_matmul_ctx ctx the handle of context. | |
rknn_matmul_shape* shape the current M,K,N shape of the matmul
return: | |
int error code. | |
*/ | |
int rknn_matmul_set_dynamic_shape(rknn_matmul_ctx ctx, rknn_matmul_shape* shape); | |
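/*
Illustrative usage sketch (not part of the API): switching to one of the shapes
registered at create time before running. The values match the dynamic-shape
example above.

    rknn_matmul_shape shape = {8, 4096, 4096};
    rknn_matmul_set_dynamic_shape(ctx, &shape);
    rknn_matmul_run(ctx);
*/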
/* rknn_matmul_run | |
run the matmul in blocking mode | |
params: | |
rknn_matmul_ctx ctx the handle of context. | |
return: | |
int error code. | |
*/ | |
int rknn_matmul_run(rknn_matmul_ctx ctx); | |
/* rknn_matmul_destroy | |
destroy the matmul context | |
params: | |
rknn_matmul_ctx ctx the handle of context. | |
return: | |
int error code. | |
*/ | |
int rknn_matmul_destroy(rknn_matmul_ctx ctx); | |
/* rknn_B_normal_layout_to_native_layout | |
convert a B buffer from normal layout to native layout
params: | |
void* B_input B normal layout buffer. | |
void* B_output B native layout buffer. | |
int K the K dimension of B.
int N the N dimension of B.
rknn_matmul_info info matmul info | |
return: | |
int error code. | |
*/ | |
int rknn_B_normal_layout_to_native_layout(void* B_input, void* B_output, int K, int N, rknn_matmul_info* info); | |
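/*
Illustrative usage sketch (not part of the API): converting a row-major (K, N) int8
B buffer into the device-native layout before binding it with info.B_layout set to
native. Buffer sizes are assumptions for the example; <stdlib.h> is assumed.

    int K = 512, N = 1024;
    int8_t* B_norm   = (int8_t*)malloc((size_t)K * N);
    int8_t* B_native = (int8_t*)malloc(io_attr.B.size);
    // ... fill B_norm with row-major (K, N) data ...
    rknn_B_normal_layout_to_native_layout(B_norm, B_native, K, N, &info);
    free(B_norm);
    free(B_native);
*/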
} // extern "C" | |