|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#include "cuda/vector_helpers.cuh" |
|
|
|
extern "C" |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Squared Euclidean distance between two pixels, taken over the .x, .y and .z
 * components only. The .w component of both arguments is ignored.
 *
 * @param first_yuv   first pixel
 * @param second_yuv  second pixel
 * @return sum of the squared per-component differences
 */
__device__ static inline float norm_squared(float4 first_yuv, float4 second_yuv)
{
    const float dx = first_yuv.x - second_yuv.x;
    const float dy = first_yuv.y - second_yuv.y;
    const float dz = first_yuv.z - second_yuv.z;

    return (dx * dx) + (dy * dy) + (dz * dz);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Bilateral weight of the neighbor at (r, c) relative to the pixel at (x, y):
 *
 *   exp(-(spatial_distance^2 / (2 * sigma_space^2))
 *       -(color_distance^2   / (2 * sigma_color^2)))
 *
 * @param x, y            coordinates of the pixel being filtered
 * @param r, c            coordinates of the neighbor (r pairs with x, c with y)
 * @param pixel_value     value of the pixel being filtered
 * @param neighbor_value  value of the neighbor
 * @param sigma_space     spatial sigma (used as a divisor, not checked for zero)
 * @param sigma_color     color sigma (used as a divisor, not checked for zero)
 * @return the weight, evaluated with the fast __expf intrinsic
 */
__device__ static inline float calculate_w(int x, int y, int r, int c,
                                           float4 pixel_value, float4 neighbor_value,
                                           float sigma_space, float sigma_color)
{
    const int dx = x - r;
    const int dy = y - c;

    /* The integer squared distance is promoted to float by the float divisor. */
    const float spatial_term = ((dx * dx) + (dy * dy)) / (2 * sigma_space * sigma_space);
    const float color_term   = norm_squared(pixel_value, neighbor_value) / (2 * sigma_color * sigma_color);

    return __expf(-spatial_term - color_term);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Fetch the Y, U and V samples that cover luma position (px, py), each
 * multiplied by 255.
 * NOTE(review): the *255 scaling presumably undoes a normalized-float texture
 * read mode; confirm against the host-side texture setup.
 *
 * A zero tex_V selects the interleaved layout, where tex_U_or_UV yields both
 * chroma samples as a float2; otherwise U and V are read from separate textures.
 * Chroma coordinates are the luma coordinates divided by channel_ratio.
 */
__device__ static inline float4 bilateral_fetch_yuv(
    cudaTextureObject_t tex_Y, cudaTextureObject_t tex_U_or_UV, cudaTextureObject_t tex_V,
    int px, int py, int channel_ratio)
{
    const float luma = tex2D<float>(tex_Y, px, py) * 255.f;
    const int cx = px / channel_ratio;
    const int cy = py / channel_ratio;

    if (!tex_V) {
        const float2 uv = tex2D<float2>(tex_U_or_UV, cx, cy);
        return make_float4(luma, uv.x * 255.f, uv.y * 255.f, 0.f);
    }

    return make_float4(luma,
                       tex2D<float>(tex_U_or_UV, cx, cy) * 255.f,
                       tex2D<float>(tex_V, cx, cy) * 255.f,
                       0.f);
}

/**
 * Convert a filtered sample to 8 bits: clamp to [0, 255], then round to nearest.
 * A plain float->uchar conversion truncates, so a result that float error left
 * just below an integer (e.g. 99.99999) would be stored one step too low.
 */
__device__ static inline uchar bilateral_to_uchar(float v)
{
    return (uchar)__float2int_rn(fminf(fmaxf(v, 0.f), 255.f));
}

/**
 * Bilateral-filter the pixel at (x, y) and store the filtered Y, U and V.
 *
 * The window is window_size x window_size samples starting at
 * (x - window_size / 2, y - window_size / 2), so it is centered on (x, y) for
 * odd sizes. Neighbors outside [0, width) x [0, height) are skipped. Each
 * neighbor is weighted by calculate_w() and the weighted sum is divided by the
 * total weight.
 *
 * @param src_tex_Y    luma texture
 * @param src_tex      U texture, or the interleaved UV texture when src_tex_V is zero
 * @param src_tex_V    V texture; zero selects the interleaved (NV12-style) path
 * @param dst_Y        luma output, indexed as y * pitch + x
 * @param dst_U, dst_V chroma outputs for the planar path
 * @param dst_UV       interleaved chroma output for the NV12-style path
 * @param width, height  luma dimensions (also the window clipping bounds)
 * @param width_uv     chroma width; only used to derive width / width_uv, which
 *                     is applied to both axes (assumes equal horizontal and
 *                     vertical subsampling and width_uv > 0 -- TODO confirm with
 *                     the host code)
 * @param height_uv    not used by this function
 * @param pitch        luma output row stride, in dst_Y elements
 * @param pitch_uv     chroma output row stride, in elements of the chroma
 *                     destination actually written
 * @param x, y         luma coordinates of the pixel to filter
 * @param sigma_space, sigma_color  filter sigmas, forwarded to calculate_w()
 * @param window_size  side length of the window
 */
__device__ static inline void apply_biltaeral(
    cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex, cudaTextureObject_t src_tex_V,
    uchar *dst_Y, uchar *dst_U, uchar *dst_V, uchar2 *dst_UV,
    int width, int height, int width_uv, int height_uv, int pitch, int pitch_uv,
    int x, int y,
    float sigma_space, float sigma_color, int window_size)
{
    const int start_r = x - window_size / 2;
    const int start_c = y - window_size / 2;
    const int channel_ratio = width / width_uv; // ratio between Y channel and UV channels

    const float4 current_pixel =
        bilateral_fetch_yuv(src_tex_Y, src_tex, src_tex_V, x, y, channel_ratio);

    float Wp = 0.f;                               // running sum of weights
    float sum_y = 0.f, sum_u = 0.f, sum_v = 0.f;  // running weighted sums

    for (int i = 0; i < window_size; i++) {
        const int r = start_r + i;
        if (r < 0 || r >= width)
            continue;

        for (int j = 0; j < window_size; j++) {
            const int c = start_c + j;
            if (c < 0 || c >= height)
                continue;

            const float4 neighbor_pixel =
                bilateral_fetch_yuv(src_tex_Y, src_tex, src_tex_V, r, c, channel_ratio);
            const float w = calculate_w(x, y, r, c, current_pixel, neighbor_pixel,
                                        sigma_space, sigma_color);

            Wp    += w;
            sum_y += neighbor_pixel.x * w;
            sum_u += neighbor_pixel.y * w;
            sum_v += neighbor_pixel.z * w;
        }
    }

    /*
     * Wp is zero when the window is empty (window_size <= 0) and NaN when a
     * sigma is zero; in both cases pass the source pixel through instead of
     * storing the result of a 0/0 or NaN division.
     */
    float out_y = current_pixel.x;
    float out_u = current_pixel.y;
    float out_v = current_pixel.z;
    if (Wp > 0.f) {
        out_y = sum_y / Wp;
        out_u = sum_u / Wp;
        out_v = sum_v / Wp;
    }

    dst_Y[y * pitch + x] = bilateral_to_uchar(out_y);

    /*
     * When channel_ratio > 1 several (x, y) positions map to the same chroma
     * index, and every one of them stores its own result there.
     */
    const int uv_index = (y / channel_ratio) * pitch_uv + (x / channel_ratio);
    if (!src_tex_V) {
        dst_UV[uv_index] = make_uchar2(bilateral_to_uchar(out_u), bilateral_to_uchar(out_v));
    } else {
        dst_U[uv_index] = bilateral_to_uchar(out_u);
        dst_V[uv_index] = bilateral_to_uchar(out_v);
    }
}
|
|
|
|
|
/**
 * Kernel entry point for input with separate Y, U and V textures.
 *
 * Each thread handles the luma position
 * (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y);
 * threads outside width x height do nothing. All remaining arguments are
 * forwarded unchanged to apply_biltaeral(), with a null interleaved-UV
 * destination. src_tex_V must be non-zero, because apply_biltaeral() treats a
 * zero V texture as the interleaved layout.
 */
__global__ void Process_uchar(cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_U, cudaTextureObject_t src_tex_V,
                              uchar *dst_Y, uchar *dst_U, uchar *dst_V,
                              int width, int height, int pitch,
                              int width_uv, int height_uv, int pitch_uv,
                              int window_size, float sigmaS, float sigmaR)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < width && row < height) {
        uchar2 *const no_interleaved_dst = nullptr;

        apply_biltaeral(src_tex_Y, src_tex_U, src_tex_V,
                        dst_Y, dst_U, dst_V, no_interleaved_dst,
                        width, height, width_uv, height_uv, pitch, pitch_uv,
                        col, row,
                        sigmaS, sigmaR, window_size);
    }
}
|
|
|
|
|
/**
 * Kernel entry point for input with a Y texture and an interleaved UV texture.
 *
 * Each thread handles the luma position
 * (blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y);
 * threads outside width x height do nothing. unused1 and unused2 are ignored.
 * apply_biltaeral() is called with a zero V texture, which selects its
 * interleaved path, and with null planar chroma destinations.
 */
__global__ void Process_uchar2(cudaTextureObject_t src_tex_Y, cudaTextureObject_t src_tex_UV, cudaTextureObject_t unused1,
                               uchar *dst_Y, uchar2 *dst_UV, uchar *unused2,
                               int width, int height, int pitch,
                               int width_uv, int height_uv, int pitch_uv,
                               int window_size, float sigmaS, float sigmaR)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    if (col < width && row < height) {
        const cudaTextureObject_t no_v_texture = 0;
        uchar *const no_planar_dst = nullptr;

        apply_biltaeral(src_tex_Y, src_tex_UV, no_v_texture,
                        dst_Y, no_planar_dst, no_planar_dst, dst_UV,
                        width, height, width_uv, height_uv, pitch, pitch_uv,
                        col, row,
                        sigmaS, sigmaR, window_size);
    }
}
|
|
|
} |
|
|