// Bind kernel argument index 8 to the host int colsB.
// NOTE(review): gpuerr(...) presumably inspects `err` for the named call; its definition is not visible here - confirm.
err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg);
//Run the program
// Diff view: the removed ("-") lines enqueue a 2-D global range of rowsR x colsR work-items;
// the added ("+") lines enqueue a 3-D global range of rowsR x colsR x 4.
// The local work size argument is NULL in both versions, so the work-group size is left to the OpenCL implementation.
// No global offset (NULL) and no event wait list / output event (0, NULL, NULL) are passed.
- size_t work_size[] = {rowsR, colsR};
- err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
+ size_t work_size[] = {rowsR, colsR, 4};
+ err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 3, NULL, work_size, NULL, 0, NULL, NULL);
gpuerr(clEnqueueNDRangeKernel);
//Wait for completion
{
// Global ids in dimensions 0 and 1 select the (rowR, colR) pair this work-item handles.
int rowR = get_global_id(0);
int colR = get_global_id(1);
// Added line: the global id in dimension 2 selects which one of the four stores below this work-item performs.
+ int oper = get_global_id(2);
// Integer division of the result coordinates by rowsB / colsB.
// NOTE(review): presumably these are the indices into A; their use is not visible in this excerpt - confirm.
int rowA = rowR / rowsB;
int colA = colR / colsB;
// first/last/outer/inner are computed in code that is not shown in this excerpt.
// These two sums are evaluated by every work-item before any store is selected.
r1 = first + last;
i1 = outer + inner;
// ptrR is indexed with a row stride of (colsR * 2). The four targets form the 2x2 cell at
// rows 2*rowR .. 2*rowR+1 and columns 2*colR .. 2*colR+1, filled as:
//     [ r1  -i1 ]
//     [ i1   r1 ]
// NOTE(review): this looks like the real 2x2 matrix form of a complex value r1 + i*i1 - confirm.
// Removed ("-") lines: a single work-item stores all four entries of the cell.
- ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
- ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
- ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
- ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
// Added ("+") lines: each work-item stores exactly one entry of the cell, chosen by oper.
// There is no default case, so an oper value outside 0..3 stores nothing.
// NOTE(review): this relies on the host enqueueing exactly 4 work-items in dimension 2 so every entry is written - verify against the launch code.
+ switch (oper)
+ {
+ case 0: ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; break;
+ case 1: ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; break;
+ case 2: ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; break;
+ case 3: ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; break;
+ }
}