{
int rowsR = rowsA * rowsB;
int colsR = colsA * colsB;
- for (int i = 0; i < rowsR / 8; i++)
+ for (int i = 0; i < rowsR / (2 * 128); i++)
{
kernel_knk_2x2_Rx4(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i);
}
err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg);
//Run the program
- err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 8}, NULL, 0, NULL, NULL);
+ err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / (2 * 128)}, NULL, 0, NULL, NULL);
gpuerr(clEnqueueNDRangeKernel);
//Wait for completion
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x62, 0x6c,
0x6f, 0x63, 0x6b, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c,
0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x20, 0x2a,
- 0x20, 0x38, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f,
- 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f,
- 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20,
- 0x3d, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x20, 0x72, 0x6f, 0x77,
- 0x52, 0x20, 0x3c, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x20, 0x2b, 0x20,
+ 0x20, 0x32, 0x20, 0x2a, 0x20, 0x31, 0x32, 0x38, 0x3b, 0x20, 0x2f, 0x2f,
+ 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x0a,
+ 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74,
+ 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x62, 0x6c, 0x6f, 0x63,
+ 0x6b, 0x3b, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3c, 0x20, 0x62, 0x6c,
+ 0x6f, 0x63, 0x6b, 0x20, 0x2b, 0x20, 0x32, 0x20, 0x2a, 0x20, 0x31, 0x32,
0x38, 0x3b, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x3d, 0x20, 0x32,
0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74,
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
0x7d, 0x00
};
-unsigned int kernel_gpu_len = 8930;
+unsigned int kernel_gpu_len = 8942;