From 764ba2e08be53ffdc62ffdbf84adee09e1397efe Mon Sep 17 00:00:00 2001 From: miha-q <> Date: Mon, 4 Mar 2024 11:10:57 -0500 Subject: [PATCH] Mon Mar 4 11:10:57 AM EST 2024 --- src/complex.c | 26 +++++++++++++------------- src/kernel.cl | 39 ++++++++++++++++++++------------------- 2 files changed, 33 insertions(+), 32 deletions(-) diff --git a/src/complex.c b/src/complex.c index 2b2ddd9..e751076 100644 --- a/src/complex.c +++ b/src/complex.c @@ -6,7 +6,6 @@ #include #include #include "cores.c" -#define GPU_ENABLED typedef struct { float real, imaginary; @@ -122,7 +121,6 @@ void cpx_mtx_print(cpx_mtx_t* m) } } -//This is for testing GPU functions on the CPU #define __kernel #define __global int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2; @@ -163,7 +161,12 @@ void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, in } } -#ifdef GPU_ENABLED +/*THREADED*/ + +/*-----------------------------------------------------------------------------------*/ + +/*METAL*/ +/*-----------------------------------------------------------------------------------*/ #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_TARGET_OPENCL_VERSION 300 #include @@ -175,11 +178,9 @@ cl_context cpx_mtx_context; cl_command_queue cpx_mtx_command_queue; unsigned char* cpx_mtx_cache = NULL; size_t cpx_mtx_cache_len = 0; -#endif uint8_t cpx_mtx_begin() { - #ifdef GPU_ENABLED cl_uint count; cl_int err; @@ -219,13 +220,11 @@ uint8_t cpx_mtx_begin() fprintf(stderr, "GPU error: clReleaseContext() failed.\n"); return 0; } - #endif return 1; } void cpx_mtx_clean() { - #ifdef GPU_ENABLED cl_int err; err = clReleaseCommandQueue(cpx_mtx_command_queue); if (err != CL_SUCCESS) @@ -238,10 +237,8 @@ void cpx_mtx_clean() fprintf(stderr, "GPU error: clReleaseContext() failed.\n"); } free(cpx_mtx_cache); - #endif } -#ifdef GPU_ENABLED void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared) { int colsA = shared; @@ -387,8 +384,8 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsR, int col err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg); //Run the program - size_t work_size[] = {rowsR, colsR}; - err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); + size_t work_size[] = {rowsR}; + err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL); gpuerr(clEnqueueNDRangeKernel); //Wait for completion @@ -406,6 +403,9 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsR, int col err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); } -#endif +/*-----------------------------------------------------------------------------------*/ + + + -#endif +#endif \ No newline at end of file diff --git a/src/kernel.cl b/src/kernel.cl index c19c7a5..df3af2e 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -57,27 +57,28 @@ __kernel void kernel_knk ) { int rowR = get_global_id(0); - int colR = get_global_id(1); - - int rowA = rowR / rowsB; - int colA = colR / colsB; - int rowB = rowR % rowsB; - int colB = colR % colsB; + for (int colR = 0; colR < colsR; colR++) + { + int rowA = rowR / rowsB; + int colA = colR / colsB; + int rowB = rowR % rowsB; + int colB = colR % colsB; - int posA = rowA * (colsA * 2) + (colA * 2); - int posB = rowB * (colsB * 2) + (colB * 2); + int posA = rowA * (colsA * 2) + (colA * 2); + int posB = rowB * (colsB * 2) + (colB * 2); - float rA = ptrA[posA]; - float iA = ptrA[posA + 1]; - float rB = ptrB[posB]; - float iB = ptrB[posB + 1]; + float rA = ptrA[posA]; + float iA = ptrA[posA + 1]; + float rB = ptrB[posB]; + float iB = ptrB[posB + 1]; - //(rA + iA)(rB + iB) - float first = rA * rB; - float outer = rA * iB; - float inner = iA * rB; - float lasts = iA * iB; - ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; - ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + //(rA + iA)(rB + iB) + float first = rA * rB; + float outer = rA * iB; + float inner = iA * rB; + float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + } } -- 2.39.5