From f3ce650274df1e2b4fbcf08b91ecfd9c61af57bb Mon Sep 17 00:00:00 2001 From: miha-q <> Date: Mon, 20 Jan 2025 00:00:08 -0500 Subject: [PATCH] Mon Jan 20 12:00:08 AM EST 2025 --- src/complex.c | 70 +++++++++++++++++++++++++-------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/src/complex.c b/src/complex.c index 8916788..3ad361a 100644 --- a/src/complex.c +++ b/src/complex.c @@ -299,19 +299,19 @@ void cpx_mtx_knk_threads_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, i void* cpx_mtx_dot_threads_run(void *context) { cpx_thread_context* ctx = (cpx_thread_context*)context; - for (int i = 0; i < (ctx->rowsA); i++) - { + for (int i = 0; i < (ctx->rowsA); i++) + { for (int j = 0; j < (ctx->delimeterCount); j++) { kernel_dot(ctx->ptrR, ctx->ptrA, ctx->ptrB, ctx->rowsA, ctx->colsA, ctx->rowsB, ctx->colsB, i, j + (ctx->delimeterStart)); - } - } + } + } } void cpx_mtx_dot_threads(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) { #ifdef __PTHREAD__ - int delimeter = colsB; + int delimeter = colsB; int cores = qansel_get_core_count(); int threadCount = cores; if (threadCount > delimeter) threadCount = delimeter; @@ -319,7 +319,7 @@ void cpx_mtx_dot_threads(float* ptrR, float* ptrA, float* ptrB, int rowsA, int c int leftOvers = delimeter % threadCount; cpx_thread_context ctx = {ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, 0, 0}; cpx_thread_context ctxs[threadCount]; - pthread_t threads[threadCount]; + pthread_t threads[threadCount]; for (int i = 0; i < threadCount; i++) { ctxs[i].ptrR = ctx.ptrR; @@ -338,13 +338,13 @@ void cpx_mtx_dot_threads(float* ptrR, float* ptrA, float* ptrB, int rowsA, int c exit(1); } } - for (unsigned int i = 0; i < threadCount; i++) - { - if (pthread_join(threads[i], NULL)) - { - fprintf(stderr, "QAnsel: Thread error. (2)\n"); - } - } + for (unsigned int i = 0; i < threadCount; i++) + { + if (pthread_join(threads[i], NULL)) + { + fprintf(stderr, "QAnsel: Thread error. (2)\n"); + } + } #else cpx_mtx_dot_threads(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB); #endif @@ -431,7 +431,7 @@ unsigned char cpx_mtx_begin(unsigned char verbose) #ifdef __OPENCL__ cl_uint count; cl_int err; - + err = clGetPlatformIDs(1, &cpx_mtx_platform_id, &count); if (err != CL_SUCCESS || count == 0) { @@ -511,8 +511,8 @@ void cpx_mtx_clean() void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) { #ifdef __OPENCL__ - int rowsR = rowsA; - int colsR = colsB; + int rowsR = rowsA; + int colsR = colsB; //Create buffers size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); @@ -522,7 +522,7 @@ void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err); cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(err); cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(err); - + //Populate buffers err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); gpuerr(err); err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); gpuerr(err); @@ -578,14 +578,14 @@ void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, (size_t[]){rowsR, colsR}, NULL, 0, NULL, NULL); gpuerr(err); - //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(err); - err = clFinish(cpx_mtx_command_queue); gpuerr(err); - //Read results err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); gpuerr(err); + //Wait for completion + err = clFlush(cpx_mtx_command_queue); gpuerr(err); + err = clFinish(cpx_mtx_command_queue); gpuerr(err); + //Clean up err = clReleaseKernel(kernel); gpuerr(err); err = clReleaseProgram(program); gpuerr(err); @@ -610,14 +610,14 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err); cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(err); cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(err); - + //Populate buffers unsigned long long int q = qansel_get_time(); err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); - gpuerr(err); + gpuerr(err); err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); - gpuerr(err); - + gpuerr(err); + //Load and compile program cl_program program; if (cpx_mtx_cache == NULL) @@ -674,14 +674,14 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR}, NULL, 0, NULL, NULL); gpuerr(err); - //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(err); - err = clFinish(cpx_mtx_command_queue); gpuerr(err); - //Read results err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); gpuerr(err); + //Wait for completion + err = clFlush(cpx_mtx_command_queue); gpuerr(err); + err = clFinish(cpx_mtx_command_queue); gpuerr(err); + //Clean up err = clReleaseKernel(kernel); gpuerr(err); err = clReleaseProgram(program); gpuerr(err); @@ -695,7 +695,7 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col //This only works if ptrA is NxM where both N and X are divisible by two, // and ptrB is 2x2. If both are true, this is much more efficient than -// the standard knk_metal() function. +// the standard knk_metal() function. void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) { #ifdef __OPENCL__ @@ -707,7 +707,7 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int cl_int err; cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err); cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(err); - + //Populate buffers err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); gpuerr(err); @@ -781,14 +781,14 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 2}, (size_t[]){2}, 0, NULL, NULL); gpuerr(err); - //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(err); - err = clFinish(cpx_mtx_command_queue); gpuerr(err); - //Read results err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); gpuerr(err); + //Wait for completion + err = clFlush(cpx_mtx_command_queue); gpuerr(err); + err = clFinish(cpx_mtx_command_queue); gpuerr(err); + //Clean up err = clReleaseKernel(kernel); gpuerr(err); err = clReleaseProgram(program); gpuerr(err); -- 2.39.5