//Create buffers
size_t sizeA = rowsA * shared;
size_t sizeB = shared * colsB;
- size_t sizeR = shared * shared;
+ size_t sizeR = rowsA * colsB;
cl_int err;
cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeA, NULL, &err);
if (err != CL_SUCCESS)
exit(1);
}
//Run the program
- size_t work_size[] = {shared, shared};
+ size_t work_size[] = {rowsA, colsB};
err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
int row = get_global_id(0);
int col = get_global_id(1);
float sum = 0;
- for (int i = 0; i < shared; i++)
+ // Accumulate the dot product of row `row` of A with column `col` of B.
+ // i is the column index into A (stride colsA) and the row index into B
+ // (stride colsB), so it must run over the inner dimension colsA; using
+ // rowsA here reads out of bounds or truncates the sum when rowsA != colsA.
+ for (int i = 0; i < colsA; i++)
{
sum += ptrA[row * colsA + i] * ptrB[i * colsB + col];
}
- ptrR[row * shared + col] = sum;
+ ptrR[row * colsB + col] = sum;
}
\ No newline at end of file