/**
 *  Copyright 2023-2024 NXP
 *
 * NXP Confidential and Proprietary.
 * This software is owned or controlled by NXP and may only be used strictly
 * in accordance with the applicable license terms.
 * By expressly accepting such terms or by downloading, installing,
 * activating and/or otherwise using the software, you are agreeing that you have read,
 * and that you agree to comply with and are bound by, such license terms.
 * If you do not agree to be bound by the applicable license terms,
 * then you may not retain, install, activate or otherwise use the software.
 *
**/
/*
 * Map a (x, y) position to a byte offset inside a tiled source plane.
 *
 * The offset is  x*128 + (y & 127)*8 + (y / 128)*stride*128.
 * Every caller in this file passes x as a multiple of 8, in which case
 * x*128 == (x/8)*1024; that is consistent with tiles 8 bytes wide and
 * 128 rows tall (1024 bytes each) -- presumably the hardware tile layout,
 * confirm against the format documentation.
 *
 * x      : horizontal byte position
 * y      : row
 * stride : multiplied by 128 for every full group of 128 rows
 */
int get_tile_src_idx(int x, int y, int stride){
	int tile_band = y / 128;     // index of the 128-row band containing y
	int row_in_band = y & 127;   // row inside that band
	int column_part = x * 128;

	return column_part + row_in_band * 8 + tile_band * stride * 128;
}
/*
 * NV12TILE_TO_NV12: copy a tiled NV12 source into a linear NV12 destination.
 *
 * One work-item produces 8 bytes of two consecutive luma rows (row, row+1)
 * and 8 bytes of the chroma row row/2.  Source offsets come from
 * get_tile_src_idx(); destination offsets are row * dst_stride + column.
 *
 * in_y / in_uv + in_uv_offset    : source luma / chroma
 * out_y / out_uv + out_uv_offset : destination luma / chroma
 * deinterlace : when non-zero both output luma rows are read from source
 *               row (row/2) and chroma from source row (row/4)
 * fullrange   : when non-zero luma is rescaled as v*219/255 + 16 and chroma
 *               as (v-128)*112/127.5 + 128, rounded and saturated to uchar
 */
__kernel void NV12TILE_TO_NV12(__global const uchar *in_y,
	__global const uchar *in_uv, int in_uv_offset,
	__global uchar *out_y, __global uchar *out_uv, int out_uv_offset,
	int src_stride, int dst_stride, int deinterlace, int fullrange)
{
	const float luma_scale = 0.858824f;   // 219/255
	const float chroma_scale = 0.878431f; // 112/127.5

	const int col = get_global_id(0) * 8;
	const int row = get_global_id(1) * 2;

	// Source rows that feed the two luma rows and the chroma row.
	const int luma_row0 = deinterlace ? row / 2 : row;
	const int luma_row1 = deinterlace ? row / 2 : row + 1;
	const int chroma_row = deinterlace ? row / 4 : row / 2;

	const int luma0_src = get_tile_src_idx(col, luma_row0, src_stride);
	const int luma1_src = get_tile_src_idx(col, luma_row1, src_stride);
	const int chroma_src = get_tile_src_idx(col, chroma_row, src_stride);

	// Luma: load both rows, optionally rescale, store.
	uchar8 luma0 = vload8(0, in_y + luma0_src);
	uchar8 luma1 = vload8(0, in_y + luma1_src);
	if(fullrange){
		luma0 = convert_uchar8_sat_rte(convert_float8(luma0) * luma_scale + 16);
		luma1 = convert_uchar8_sat_rte(convert_float8(luma1) * luma_scale + 16);
	}
	vstore8(luma0, 0, out_y + mad24(row, dst_stride, col));
	vstore8(luma1, 0, out_y + mad24(row + 1, dst_stride, col));

	// Chroma: same treatment, scaled around the 128 midpoint.
	uchar8 chroma = vload8(0, in_uv + in_uv_offset + chroma_src);
	if(fullrange){
		float8 centered = convert_float8(chroma) - 128;
		chroma = convert_uchar8_sat_rte(centered * chroma_scale + 128);
	}
	vstore8(chroma, 0, out_uv + out_uv_offset + mad24(row / 2, dst_stride, col));
}
/*
 * NV12_TO_NV12: copy a linear NV12 source into a linear NV12 destination,
 * optionally line-doubling and/or rescaling the sample range.
 *
 * One work-item produces 16 bytes of two consecutive luma rows (row, row+1)
 * and 16 bytes of the chroma row row/2.  Offsets are row * stride + column.
 *
 * deinterlace : when non-zero both output luma rows are read from source
 *               row (row/2) and chroma from source row (row/4)
 * range       : when non-zero luma is rescaled as v*219/255 + 16 and chroma
 *               as (v-128)*112/127.5 + 128, rounded and saturated to uchar
 */
__kernel void NV12_TO_NV12(__global const uchar * in_y, __global uchar *in_uv, int in_uv_offset,
	 __global uchar *out_y, __global uchar *out_uv, int out_uv_offset,
	 int src_stride, int dst_stride, int deinterlace, int range){
	const float luma_scale = 0.858824f;   // 219/255
	const float chroma_scale = 0.878431f; // 112/127.5

	const int col = get_global_id(0) * 16;
	const int row = get_global_id(1) * 2;

	// Source rows that feed the two luma rows and the chroma row.
	const int luma_row0 = deinterlace ? row / 2 : row;
	const int luma_row1 = deinterlace ? row / 2 : row + 1;
	const int chroma_row = deinterlace ? row / 4 : row / 2;

	// All loads happen before any store.
	uchar16 luma0 = vload16(0, in_y + mad24(luma_row0, src_stride, col));
	uchar16 luma1 = vload16(0, in_y + mad24(luma_row1, src_stride, col));
	uchar16 chroma = vload16(0, in_uv + in_uv_offset + mad24(chroma_row, src_stride, col));

	if(range){
		luma0 = convert_uchar16_sat_rte(convert_float16(luma0) * luma_scale + 16);
		luma1 = convert_uchar16_sat_rte(convert_float16(luma1) * luma_scale + 16);

		float16 centered = convert_float16(chroma) - 128;
		chroma = convert_uchar16_sat_rte(centered * chroma_scale + 128);
	}

	vstore16(luma0, 0, out_y + mad24(row, dst_stride, col));
	vstore16(luma1, 0, out_y + mad24(row + 1, dst_stride, col));
	vstore16(chroma, 0, out_uv + out_uv_offset + mad24(row / 2, dst_stride, col));
}
/*
 * ten_to_eight: read 5 packed source bytes and write 4 output bytes, where
 * output byte i is (b[i] << s[i]) | (b[i+1] >> (8 - s[i])) and
 * s = ({0,10,20,30} + x*10) & 7.
 *
 * input    : start of the 8-byte source row segment; the 5 bytes begin at
 *            input + byte_loc
 * output   : destination of the 4 result bytes (output[0..3])
 * byte_loc : byte offset of the first needed byte inside the 8-byte segment
 *            (the caller in this file passes a value in 0..7)
 * x        : output sample index, only used to derive the bit phase
 *
 * When fewer than 5 bytes remain in the segment (byte_loc > 3) the missing
 * bytes are taken from input + 1024 -- presumably the same row of the next
 * tile; confirm against the tile layout.
 *
 * The splice is done in a private array.  Only output[0..3] is ever written:
 * using `output` as scratch would touch up to output[6], which belongs to the
 * neighbouring work-item (a data race) and may lie past the end of the buffer.
 * `input` is const-qualified because it is only read, which also lets callers
 * pass const source pointers without discarding the qualifier.
 */
void ten_to_eight(__global const uchar *input, __global uchar *output, uchar byte_loc, int x){
	uchar byte_cnt = 8 - byte_loc;   // bytes left in this 8-byte segment
	uchar8 input_bytes = vload8(0, input + byte_loc);
	if(byte_cnt < 5){
		// Keep the first byte_cnt bytes, then append 4 bytes from input + 1024.
		// byte_cnt <= 4 here, so window[byte_cnt .. byte_cnt+3] stays in bounds.
		uchar window[8];
		vstore8(input_bytes, 0, window);
		vstore4(vload4(0, input + 1024), 0, window + byte_cnt);
		input_bytes = vload8(0, window);
	}
	uint4 shift = {0,10,20,30};
	unsigned int bit_pos = x * 10;
	shift = (shift + bit_pos) & 7;   // bit phase of each sample inside its first byte

	// High part from byte i, low part from byte i+1 (0 when the shift is 0).
	uchar4 output_bytes = input_bytes.s0123 << shift;
	uint4  temp_output = convert_uint4(input_bytes.s1234);
	output_bytes |= convert_uchar4(temp_output >> (8 - shift));
	vstore4(output_bytes, 0, output);
}
/*
 * NV15TILE_TO_NV12: convert a tiled, 10-bit packed source into linear
 * 8-bit NV12.
 *
 * One work-item produces 4 output bytes on each of two luma rows (y, y+1)
 * and 4 output bytes on chroma row y/2.  Four output samples consume five
 * source bytes, so the source byte position is (x/4)*5; it is split into an
 * 8-byte-aligned part (passed to get_tile_src_idx) and the remainder
 * byte_loc (passed to ten_to_eight).
 *
 * in_y / in_uv + in_uv_offset    : source luma / chroma
 * out_y / out_uv + out_uv_offset : destination luma / chroma
 * src_stride, dst_stride         : forwarded to the index computations
 */
__kernel void NV15TILE_TO_NV12(__global const uchar *in_y,
	__global const uchar *in_uv, int in_uv_offset,
	__global uchar *out_y,
	__global uchar *out_uv, int out_uv_offset,
		int src_stride, int dst_stride)
{
	int x = get_global_id(0) * 4;
	int y = get_global_id(1) * 2;
	int sx = x/4*5;                 // source byte position of this group of 4 samples
	uchar byte_loc = sx & 7;        // offset inside the 8-byte segment

	int dst_y_idx = mad24(y, dst_stride, x);
	int dst_y1_idx = mad24(y+1, dst_stride, x);
	int dst_uv_idx = mad24(y/2, dst_stride, x);

	sx = sx >> 3 << 3;              // round down to the start of the 8-byte segment
	int src_y_idx = get_tile_src_idx(sx, y, src_stride);
	int src_y1_idx = get_tile_src_idx(sx, y + 1, src_stride);
	int src_uv_idx = get_tile_src_idx(sx, y/2, src_stride);

	ten_to_eight(in_y + src_y_idx, out_y + dst_y_idx, byte_loc, x);
	ten_to_eight(in_y + src_y1_idx, out_y + dst_y1_idx, byte_loc, x);
	ten_to_eight(in_uv + in_uv_offset + src_uv_idx, out_uv + out_uv_offset + dst_uv_idx, byte_loc, x);
}
/*
 * I420_TO_NV12: convert planar Y/U/V input into NV12 (Y plane plus an
 * interleaved UV plane).
 *
 * One work-item produces 16 bytes of two consecutive luma rows (row, row+1)
 * and 16 bytes (8 U/V pairs) of chroma row row/2.  The U and V planes are
 * indexed with half the source stride and half the column.
 *
 * in_u + in_u_offset, in_v + in_v_offset : source chroma planes
 * out_uv + out_uv_offset                 : destination interleaved chroma
 * deinterlace : not used by this kernel
 * range       : when non-zero luma is rescaled as v*219/255 + 16 and chroma
 *               as (v-128)*112/127.5 + 128, rounded and saturated to uchar
 */
__kernel void I420_TO_NV12(__global const uchar * in_y,
	__global uchar *in_u, int in_u_offset,
	__global uchar *in_v, int in_v_offset,
	__global uchar *out_y, __global uchar *out_uv, int out_uv_offset,
	int src_stride, int dst_stride, int deinterlace, int range)
{
	const float luma_scale = 0.858824f;   // 219/255
	const float chroma_scale = 0.878431f; // 112/127.5

	const int col = get_global_id(0) << 4;
	const int row = get_global_id(1) << 1;

	// Offset shared by the U and V planes (half resolution in both axes).
	const int chroma_src = mad24(row >> 1, src_stride >> 1, col >> 1);

	uchar16 luma0 = vload16(0, in_y + mad24(row, src_stride, col));
	uchar16 luma1 = vload16(0, in_y + mad24(row + 1, src_stride, col));

	uchar8 u_plane = vload8(0, in_u + in_u_offset + chroma_src);
	uchar8 v_plane = vload8(0, in_v + in_v_offset + chroma_src);

	// Interleave: U goes to the even lanes, V to the odd lanes.
	uchar16 chroma;
	chroma.s02468ace = u_plane;
	chroma.s13579bdf = v_plane;

	if(range){
		luma0 = convert_uchar16_sat_rte(convert_float16(luma0) * luma_scale + 16);
		luma1 = convert_uchar16_sat_rte(convert_float16(luma1) * luma_scale + 16);

		float16 centered = convert_float16(chroma) - 128;
		chroma = convert_uchar16_sat_rte(centered * chroma_scale + 128);
	}

	vstore16(luma0, 0, out_y + mad24(row, dst_stride, col));
	vstore16(luma1, 0, out_y + mad24(row + 1, dst_stride, col));
	vstore16(chroma, 0, out_uv + out_uv_offset + mad24(row >> 1, dst_stride, col));
}
/*
 * RGBA_TO_RGB: drop every fourth byte of a 4-byte-per-pixel image to produce
 * a 3-byte-per-pixel image.
 *
 * One work-item converts 4 pixels on each of two consecutive rows: 16 source
 * bytes become 12 destination bytes per row.  Strides are in pixels (the
 * pixel index is multiplied by 4 for the source and by 3 for the
 * destination).
 */
__kernel void RGBA_TO_RGB(__global const uchar * input, __global uchar *output,
    int src_stride, int dst_stride)
{
	const int px = get_global_id(0)*4;
	const int row = get_global_id(1)*2;

	const int src_row0 = mad24(row, src_stride, px) * 4;
	const int src_row1 = mad24(row+1, src_stride, px) * 4;
	const int dst_row0 = mad24(row, dst_stride, px) * 3;
	const int dst_row1 = mad24(row+1, dst_stride, px) * 3;

	// Both rows are loaded before anything is written.
	const uchar16 rgba0 = vload16(0, input + src_row0);
	const uchar16 rgba1 = vload16(0, input + src_row1);

	// Lanes 3, 7, b and f (every fourth byte) are skipped; the 12 kept
	// bytes are written as an 8-byte store followed by a 4-byte store.
	const uchar8 head0 = rgba0.s01245689;
	const uchar4 tail0 = rgba0.sacde;
	vstore8(head0, 0, output + dst_row0);
	vstore4(tail0, 0, output + dst_row0 + 8);

	const uchar8 head1 = rgba1.s01245689;
	const uchar4 tail1 = rgba1.sacde;
	vstore8(head1, 0, output + dst_row1);
	vstore4(tail1, 0, output + dst_row1 + 8);
}
/*
 * Describes a rectangular region inside an image buffer.  Passed by value to
 * RGBA_TO_RGB_SCALE for both the source and the destination, so the field
 * order and types must match whatever the host side passes.
 */
typedef struct{
	int width;// right - left
	int height;// presumably bottom - top -- TODO confirm against the host code
	int left;// first column of the region; lower clamp bound for sampled columns
	int right;// sampled columns are clamped to at most right - 1
	int top;// first row of the region; lower clamp bound for sampled rows
	int bottom;// sampled rows are clamped to at most bottom - 1
	int stride;// row pitch in pixels: byte offset = (row * stride + col) * bytes-per-pixel
}BufInfo;

/*
 * RGBA_TO_RGB_SCALE: bilinear resize of a 4-byte-per-pixel source region
 * into a 3-byte-per-pixel destination region.
 *
 * One work-item produces one destination pixel at
 * (x + dst_info.left, y + dst_info.top).  The source position is
 *     src_w / dst_w * x + src_left - 0.5   (and likewise vertically),
 * the four neighbouring source pixels are clamped to
 * [left, right-1] x [top, bottom-1], and their first three channels are
 * blended with the fractional parts as weights.
 *
 * NOTE(review): the common pixel-centre convention is
 * scale * (x + 0.5) - 0.5 + origin; the mapping used here differs from it
 * by scale/2 -- confirm this is intended.
 */
__kernel void RGBA_TO_RGB_SCALE(__global const uchar * src,
	__global uchar *dst,
	const BufInfo src_info, const BufInfo dst_info)
{
	const int x = get_global_id(0);
	const int y = get_global_id(1);

	// Source-space sampling position.
	float src_x = (float)src_info.width / dst_info.width * x + src_info.left;
	float src_y = (float)src_info.height / dst_info.height * y + src_info.top;
	src_x -= 0.5f;
	src_y -= 0.5f;

	const float base_x = floor(src_x);
	const float base_y = floor(src_y);

	// Neighbour coordinates, kept inside the source region.
	const int col0 = clamp((int)base_x, src_info.left, src_info.right - 1);
	const int row0 = clamp((int)base_y, src_info.top, src_info.bottom - 1);
	const int col1 = clamp((int)base_x + 1, src_info.left, src_info.right - 1);
	const int row1 = clamp((int)base_y + 1, src_info.top, src_info.bottom - 1);

	// Fractional parts, taken from the unclamped position.
	const float fx = src_x - base_x;
	const float fy = src_y - base_y;

	// Fetch the four neighbours (4 bytes per source pixel) and keep 3 channels.
	const uchar4 p00 = vload4(0, src + mad24(row0, src_info.stride, col0) * 4);
	const uchar4 p10 = vload4(0, src + mad24(row0, src_info.stride, col1) * 4);
	const uchar4 p01 = vload4(0, src + mad24(row1, src_info.stride, col0) * 4);
	const uchar4 p11 = vload4(0, src + mad24(row1, src_info.stride, col1) * 4);

	const float3 c00 = convert_float3(p00.xyz);
	const float3 c10 = convert_float3(p10.xyz);
	const float3 c01 = convert_float3(p01.xyz);
	const float3 c11 = convert_float3(p11.xyz);

	// Bilinear weights.
	const float w00 = (1 - fx) * (1 - fy);
	const float w10 = fx * (1 - fy);
	const float w01 = (1 - fx) * fy;
	const float w11 = fx * fy;

	const float3 blended = w00 * c00 + w10 * c10 +
			w01 * c01 + w11 * c11;

	// Round, saturate and write 3 bytes per destination pixel.
	const int dst_idx = mad24(y + dst_info.top, dst_info.stride, x + dst_info.left) * 3;
	vstore3(convert_uchar3_sat_rte(blended), 0, dst + dst_idx);
}
/*
 * Gather three bytes around position `index` of a packed buffer.
 *
 * .x is always src[index].  When bit 1 of index is set, .y and .z come from
 * index-1 and index+1; otherwise from index+1 and index+3.  For a
 * Y0 U Y1 V byte order with `index` pointing at a Y byte this yields
 * (Y, U, V) -- presumably the intended use, as the name suggests.
 */
uchar3 read_yuv_from_yuyv(__global const uchar * src, int index){
	// Offsets of the second and third component relative to index.
	const int second_off = (index & 2) ? -1 : 1;
	const int third_off = (index & 2) ? 1 : 3;

	return (uchar3)(src[index], src[index + second_off], src[index + third_off]);
}