void* cpx_mtx_dot_threads_run(void *context)
{
cpx_thread_context* ctx = (cpx_thread_context*)context;
- for (int i = 0; i < (ctx->rowsA); i++)
- {
+ for (int i = 0; i < (ctx->rowsA); i++)
+ {
for (int j = 0; j < (ctx->delimeterCount); j++)
{
kernel_dot(ctx->ptrR, ctx->ptrA, ctx->ptrB, ctx->rowsA, ctx->colsA, ctx->rowsB, ctx->colsB, i, j + (ctx->delimeterStart));
- }
- }
+ }
+ }
}
void cpx_mtx_dot_threads(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
#ifdef __PTHREAD__
- int delimeter = colsB;
+ int delimeter = colsB;
int cores = qansel_get_core_count();
int threadCount = cores;
if (threadCount > delimeter) threadCount = delimeter;
int leftOvers = delimeter % threadCount;
cpx_thread_context ctx = {ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, 0, 0};
cpx_thread_context ctxs[threadCount];
- pthread_t threads[threadCount];
+ pthread_t threads[threadCount];
for (int i = 0; i < threadCount; i++)
{
ctxs[i].ptrR = ctx.ptrR;
exit(1);
}
}
- for (unsigned int i = 0; i < threadCount; i++)
- {
- if (pthread_join(threads[i], NULL))
- {
- fprintf(stderr, "QAnsel: Thread error. (2)\n");
- }
- }
+ for (unsigned int i = 0; i < threadCount; i++)
+ {
+ if (pthread_join(threads[i], NULL))
+ {
+ fprintf(stderr, "QAnsel: Thread error. (2)\n");
+ }
+ }
#else
cpx_mtx_dot_threads(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB);
#endif
#ifdef __OPENCL__
cl_uint count;
cl_int err;
-
+
err = clGetPlatformIDs(1, &cpx_mtx_platform_id, &count);
if (err != CL_SUCCESS || count == 0)
{
void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
#ifdef __OPENCL__
- int rowsR = rowsA;
- int colsR = colsB;
+ int rowsR = rowsA;
+ int colsR = colsB;
//Create buffers
size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err);
cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(err);
cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(err);
-
+
//Populate buffers
err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); gpuerr(err);
err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); gpuerr(err);
err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, (size_t[]){rowsR, colsR}, NULL, 0, NULL, NULL);
gpuerr(err);
- //Wait for completion
- err = clFlush(cpx_mtx_command_queue); gpuerr(err);
- err = clFinish(cpx_mtx_command_queue); gpuerr(err);
-
//Read results
err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
gpuerr(err);
+ //Wait for completion
+ err = clFlush(cpx_mtx_command_queue); gpuerr(err);
+ err = clFinish(cpx_mtx_command_queue); gpuerr(err);
+
//Clean up
err = clReleaseKernel(kernel); gpuerr(err);
err = clReleaseProgram(program); gpuerr(err);
cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err);
cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(err);
cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(err);
-
+
//Populate buffers
unsigned long long int q = qansel_get_time();
err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
- gpuerr(err);
+ gpuerr(err);
err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL);
- gpuerr(err);
-
+ gpuerr(err);
+
//Load and compile program
cl_program program;
if (cpx_mtx_cache == NULL)
err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR}, NULL, 0, NULL, NULL);
gpuerr(err);
- //Wait for completion
- err = clFlush(cpx_mtx_command_queue); gpuerr(err);
- err = clFinish(cpx_mtx_command_queue); gpuerr(err);
-
//Read results
err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
gpuerr(err);
+ //Wait for completion
+ err = clFlush(cpx_mtx_command_queue); gpuerr(err);
+ err = clFinish(cpx_mtx_command_queue); gpuerr(err);
+
//Clean up
err = clReleaseKernel(kernel); gpuerr(err);
err = clReleaseProgram(program); gpuerr(err);
//This only works if ptrA is NxM where both N and M are divisible by two,
// and ptrB is 2x2. If both are true, this is much more efficient than
-// the standard knk_metal() function.
+// the standard knk_metal() function.
void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
#ifdef __OPENCL__
cl_int err;
cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err);
cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(err);
-
+
//Populate buffers
err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); gpuerr(err);
err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 2}, (size_t[]){2}, 0, NULL, NULL);
gpuerr(err);
- //Wait for completion
- err = clFlush(cpx_mtx_command_queue); gpuerr(err);
- err = clFinish(cpx_mtx_command_queue); gpuerr(err);
-
//Read results
err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
gpuerr(err);
+ //Wait for completion
+ err = clFlush(cpx_mtx_command_queue); gpuerr(err);
+ err = clFinish(cpx_mtx_command_queue); gpuerr(err);
+
//Clean up
err = clReleaseKernel(kernel); gpuerr(err);
err = clReleaseProgram(program); gpuerr(err);