Spaces:

GAP-LAB
/

LASA

Configuration error

App Files Files Community

LASA / evaluation /pyTorchChamferDistance /chamfer_distance /chamfer_distance.cu

HaolinLiu

first commit of codes and update readme.md

cc9780d 10 months ago

raw

history blame

5.07 kB

	#include <ATen/ATen.h>

	#include <cuda.h>
	#include <cuda_runtime.h>

	/**
	 * Nearest-neighbour search from every point of `xyz` into `xyz2`.
	 *
	 * For each batch i and each query point j of `xyz`, finds the point of
	 * `xyz2` (same batch) with the smallest squared Euclidean distance and
	 * writes that distance to result[i*n+j] and its index (in [0, m)) to
	 * result_i[i*n+j]. Ties keep the lowest index.
	 *
	 * Layout, as indexed below:
	 *   xyz      : b * n * 3 floats, point j of batch i at (i*n+j)*3
	 *   xyz2     : b * m * 3 floats, point k of batch i at (i*m+k)*3
	 *   result   : b * n floats
	 *   result_i : b * n ints
	 *
	 * Launch: blockIdx.x strides over batches; (blockIdx.y, threadIdx.x)
	 * strides over query points. Uses 512*3 floats of static shared memory to
	 * stage `xyz2` in tiles of up to 512 points. Any block size works because
	 * every loop strides by the launch dimensions.
	 *
	 * If m == 0 the tile loop never runs and `result` / `result_i` are left
	 * untouched.
	 */
	__global__
	void ChamferDistanceKernel(
		int b,
		int n,
		const float* xyz,
		int m,
		const float* xyz2,
		float* result,
		int* result_i)
	{
		const int batch=512;
		__shared__ float buf[batch*3];
		for (int i=blockIdx.x;i<b;i+=gridDim.x){
			for (int k2=0;k2<m;k2+=batch){
				// Number of xyz2 points in this tile (the last tile may be partial).
				const int end_k=min(m,k2+batch)-k2;
				// size_t offsets: (i*m+k2)*3 can exceed INT_MAX for large inputs.
				const size_t tile_base=((size_t)i*m+k2)*3;
				// Cooperative copy of the tile into shared memory.
				for (int j=threadIdx.x;j<end_k*3;j+=blockDim.x){
					buf[j]=xyz2[tile_base+j];
				}
				// Tile must be fully written before any thread reads it.
				__syncthreads();
				for (int j=threadIdx.x+blockIdx.y*blockDim.x;j<n;j+=blockDim.x*gridDim.y){
					const size_t out=(size_t)i*n+j;
					const float x1=xyz[out*3+0];
					const float y1=xyz[out*3+1];
					const float z1=xyz[out*3+2];
					int best_i=0;
					float best=0.0f;
					// The first candidate (k==0) is always accepted, so `best`
					// never needs a sentinel; afterwards only strictly smaller
					// distances replace it.
					#pragma unroll 4
					for (int k=0;k<end_k;k++){
						const float dx=buf[k*3+0]-x1;
						const float dy=buf[k*3+1]-y1;
						const float dz=buf[k*3+2]-z1;
						const float d=dx*dx+dy*dy+dz*dz;
						if (k==0 || d<best){
							best=d;
							best_i=k+k2;
						}
					}
					// The first tile initialises the output; later tiles only improve it.
					if (k2==0 || result[out]>best){
						result[out]=best;
						result_i[out]=best_i;
					}
				}
				// Do not overwrite the tile while other threads may still read it.
				__syncthreads();
			}
		}
	}

	/**
	 * Host entry point for the forward pass.
	 *
	 * Launches ChamferDistanceKernel twice with the roles of the two point
	 * sets swapped: first xyz -> xyz2 (writing result / result_i), then
	 * xyz2 -> xyz (writing result2 / result2_i). Both launches use a
	 * 32 x 16 grid of 512-thread blocks on the default stream.
	 *
	 * Launch errors are printed to stdout; nothing is returned or thrown.
	 * All pointers are passed straight to the kernel, so they are presumably
	 * device pointers -- confirm against the caller.
	 */
	void ChamferDistanceKernelLauncher(
		const int b, const int n,
		const float* xyz,
		const int m,
		const float* xyz2,
		float* result,
		int* result_i,
		float* result2,
		int* result2_i)
	{
		const dim3 grid(32,16,1);
		const int threads=512;
		// Direction 1: nearest xyz2 point for every xyz point.
		ChamferDistanceKernel<<<grid,threads>>>(b, n, xyz, m, xyz2, result, result_i);
		// Direction 2: nearest xyz point for every xyz2 point.
		ChamferDistanceKernel<<<grid,threads>>>(b, m, xyz2, n, xyz, result2, result2_i);
		const cudaError_t status = cudaGetLastError();
		if (status != cudaSuccess) {
			printf("error in chamfer distance updateOutput: %s\n", cudaGetErrorString(status));
		}
	}


	/**
	 * Backward pass for one direction of the nearest-neighbour distance.
	 *
	 * For each batch i and point j of `xyz1`, reads its matched point
	 * j2 = idx1[i*n+j] in `xyz2` and accumulates
	 *     g * (p1 - p2)   into grad_xyz1 at point (i, j)
	 *    -g * (p1 - p2)   into grad_xyz2 at point (i, j2)
	 * where g = 2 * grad_dist1[i*n+j].
	 *
	 * Layout, as indexed below:
	 *   xyz1, grad_xyz1 : b * n * 3 floats
	 *   xyz2, grad_xyz2 : b * m * 3 floats
	 *   grad_dist1, idx1: b * n elements
	 *
	 * The outputs are accumulated with atomicAdd and are not cleared here, so
	 * the caller must zero them first. idx1 values are used unchecked as
	 * indices into xyz2 -- assumed to lie in [0, m); confirm against producer.
	 *
	 * Launch: blockIdx.x strides over batches; (blockIdx.y, threadIdx.x)
	 * strides over points.
	 */
	__global__
	void ChamferDistanceGradKernel(
		int b, int n,
		const float* xyz1,
		int m,
		const float* xyz2,
		const float* grad_dist1,
		const int* idx1,
		float* grad_xyz1,
		float* grad_xyz2)
	{
		for (int i = blockIdx.x; i<b; i += gridDim.x) {
			for (int j = threadIdx.x + blockIdx.y * blockDim.x; j < n; j += blockDim.x*gridDim.y) {
				// size_t offsets: (i*n+j)*3 can exceed INT_MAX for large inputs.
				const size_t p1=(size_t)i*n+j;
				const float x1=xyz1[p1*3+0];
				const float y1=xyz1[p1*3+1];
				const float z1=xyz1[p1*3+2];
				const int j2=idx1[p1];
				const size_t p2=(size_t)i*m+j2;
				const float x2=xyz2[p2*3+0];
				const float y2=xyz2[p2*3+1];
				const float z2=xyz2[p2*3+2];
				// Factor 2 comes from differentiating the squared distance.
				const float g=grad_dist1[p1]*2.0f;
				const float gx=g*(x1-x2);
				const float gy=g*(y1-y2);
				const float gz=g*(z1-z2);
				atomicAdd(&(grad_xyz1[p1*3+0]),gx);
				atomicAdd(&(grad_xyz1[p1*3+1]),gy);
				atomicAdd(&(grad_xyz1[p1*3+2]),gz);
				// Several points may share the same match j2, hence atomics.
				atomicAdd(&(grad_xyz2[p2*3+0]),-gx);
				atomicAdd(&(grad_xyz2[p2*3+1]),-gy);
				atomicAdd(&(grad_xyz2[p2*3+2]),-gz);
			}
		}
	}

	/**
	 * Host entry point for the backward pass.
	 *
	 * Zeroes grad_xyz1 (b*n*3 floats) and grad_xyz2 (b*m*3 floats), then
	 * launches ChamferDistanceGradKernel once per direction so that both
	 * launches accumulate into the same two gradient buffers:
	 *   1. xyz1 -> xyz2 using grad_dist1 / idx1
	 *   2. xyz2 -> xyz1 using grad_dist2 / idx2 (output buffers swapped)
	 * Both launches use a 1 x 16 grid of 256-thread blocks on the default
	 * stream.
	 *
	 * Errors are printed to stdout; nothing is returned or thrown. If clearing
	 * either buffer fails the kernels are not launched, because they would
	 * accumulate into uninitialised memory.
	 */
	void ChamferDistanceGradKernelLauncher(
		const int b, const int n,
		const float* xyz1,
		const int m,
		const float* xyz2,
		const float* grad_dist1,
		const int* idx1,
		const float* grad_dist2,
		const int* idx2,
		float* grad_xyz1,
		float* grad_xyz2)
	{
		// Byte counts in size_t: b*n*3*4 can overflow int for large inputs.
		const size_t bytes1 = (size_t)b * n * 3 * sizeof(float);
		const size_t bytes2 = (size_t)b * m * 3 * sizeof(float);
		cudaError_t err = cudaMemset(grad_xyz1, 0, bytes1);
		if (err == cudaSuccess) {
			err = cudaMemset(grad_xyz2, 0, bytes2);
		}
		if (err != cudaSuccess) {
			printf("error in chamfer distance get grad: %s\n", cudaGetErrorString(err));
			return;
		}
		const dim3 grid(1,16,1);
		const int threads = 256;
		ChamferDistanceGradKernel<<<grid, threads>>>(b, n, xyz1, m, xyz2, grad_dist1, idx1, grad_xyz1, grad_xyz2);
		ChamferDistanceGradKernel<<<grid, threads>>>(b, m, xyz2, n, xyz1, grad_dist2, idx2, grad_xyz2, grad_xyz1);
		err = cudaGetLastError();
		if (err != cudaSuccess) {
			printf("error in chamfer distance get grad: %s\n", cudaGetErrorString(err));
		}
	}