free(cpx_mtx_cache);
}
-void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared)
+void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
- int colsA = shared;
- int rowsB = shared;
int rowsR = rowsA;
int colsR = colsB;
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg);
err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 4, sizeof(int), &colsB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 5, sizeof(int), &shared); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 4, sizeof(int), &colsA); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 5, sizeof(int), &rowsB); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 6, sizeof(int), &colsB); gpuerr(clSetKernelArg);
//Run the program
err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, (size_t[]){rowsR, colsR}, NULL, 0, NULL, NULL);
__global float* ptrA,
__global float* ptrB,
const int rowsA,
- const int colsB,
- const int shared
+ const int colsA,
+ const int rowsB,
+ const int colsB
)
{
- const int colsA = shared;
- const int rowsB = shared;
const int rowsR = rowsA;
const int colsR = colsB;
const int rowR = get_global_id(0);
const int posR = rowR * (colsR * 2) + (colR * 2);
- for (int i = 0; i < shared; i++)
+ for (int i = 0; i < colsA; i++)
{
int posA = rowR * (colsA * 2) + (i * 2);
int posB = i * (colsB * 2) + (colR * 2);