err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg);
//Run the program
- size_t work_size[] = {rowsR, colsR, 4};
- err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 3, NULL, work_size, NULL, 0, NULL, NULL);
+ size_t work_size[] = {rowsR, colsR};
+ err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
gpuerr(clEnqueueNDRangeKernel);
//Wait for completion
{
int rowR = get_global_id(0);
int colR = get_global_id(1);
- int oper = get_global_id(2);
int rowA = rowR / rowsB;
int colA = colR / colsB;
r1 = first + last;
i1 = outer + inner;
- switch (oper)
- {
- case 0: ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; break;
- case 1: ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; break;
- case 2: ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; break;
- case 3: ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; break;
- }
+ ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
+ ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
+ ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
+ ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
}