From: miha-q <> Date: Mon, 4 Mar 2024 22:05:11 +0000 (-0500) Subject: Mon Mar 4 05:05:11 PM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=77dae6b26fb78cb7a5da1a4ebf4fdf9a4cac4ab4;p=QAnsel.git Mon Mar 4 05:05:11 PM EST 2024 --- diff --git a/Makefile b/Makefile index 99b4eae..bcf674e 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ all: bash -c 'echo -ne "$$(cat src/.kernel.cl)\x00" > src/kernel.cl' xxd -i src/kernel.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/kernel.cl.c mv src/.kernel.cl src/kernel.cl + cat src/kernel.cl | grep -vi '{gpu_only}' | sed -e 's/\/\/\(.*\){cpu_only}/\1/' -e 's/__global //' -e 's/__kernel //' > src/kernel_cpu.cl.c gcc src/QAnsel.c -g -o bin/QAnsel -lm -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread - rm -f src/*.cl.c + #rm -f src/*.cl.c diff --git a/src/QAnsel.c b/src/QAnsel.c index 5769cc3..cc87701 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -11,6 +11,7 @@ unsigned char HIDDEN_VARIABLE = 0; FILE* RANDOM_FILE; #define GPU_ENABLED unsigned char USE_GPU = 0; +unsigned char USE_THREADS = 1; typedef struct { @@ -47,7 +48,6 @@ float qansel_rand_t() } } - float qansel_rand() { return HIDDEN_VARIABLE ? qansel_rand_h() : qansel_rand_t(); @@ -59,7 +59,7 @@ void qansel_cnot(cpx_mtx_t* stateVector, unsigned char qubitCount, unsigned char cpx_mtx_t ret; cpx_mtx_init(&ret, 1, retLen); cpx_t n; - for (unsigned int i = 0; i < retLen; i++) //asdfasdfsdfsdf + for (unsigned int i = 0; i < retLen; i++) { unsigned char bitAVal = (i >> bitA) & 1; unsigned char bitBVal = (i >> bitB) & 1; @@ -189,12 +189,10 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr unsigned char qubit = qubitCount - (instr->q0) - 1; if (qubit == 0) { - //memcpy(filter.ptr, gate_ptr, 16 * sizeof(float)); memcpy(filter.ptr, gate_ptr, 8 * sizeof(float)); } else { - //memcpy(filter.ptr, Identity, 16 * sizeof(float)); memcpy(filter.ptr, Identity, 8 * sizeof(float)); } @@ -213,18 +211,18 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr tmp.cols = filter.cols * gate.cols; tmp.ptr = malloc(tmp.rows * (tmp.cols * 2) * sizeof(float)); - #ifdef GPU_ENABLED - if (USE_GPU && (tmp.rows >= 1024 && tmp.cols >= 1024)) + if (USE_GPU && 0) { cpx_mtx_knk_metal(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); } + else if (USE_THREADS) + { + cpx_mtx_knk_threads(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); + } else { cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); } - #else - cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); - #endif free(filter.ptr); @@ -234,22 +232,18 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr } cpx_mtx_init(&tmp, stateVector->rows, stateVector->cols); - #ifdef GPU_ENABLED if (USE_GPU) { cpx_mtx_dot_metal(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols); } + else if (USE_THREADS) + { + cpx_mtx_dot_threads(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols); + } else { cpx_mtx_dot(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols); } - #else - cpx_mtx_dot - ( - tmp.ptr, stateVector->ptr, filter.ptr, - stateVector->rows, filter.cols, stateVector->cols - ); - #endif free(stateVector->ptr); stateVector->ptr = tmp.ptr; free(filter.ptr); @@ -1369,6 +1363,6 @@ void main(int argc, char** argv) if (!RANDOM_FILE) RANDOM_FILE = fopen("/dev/random", "r"); process(argc, argv); fclose(RANDOM_FILE); - - cpx_mtx_clean(); + printf(">%i<\n", get_core_count()); + if (USE_GPU) cpx_mtx_clean(); } \ No newline at end of file diff --git a/src/complex.c b/src/complex.c index d6e5610..3688fae 100644 --- a/src/complex.c +++ b/src/complex.c @@ -6,6 +6,7 @@ #include #include #include "cores.c" +#include "kernel_cpu.cl.c" typedef struct { float real, imaginary; @@ -121,29 +122,13 @@ void cpx_mtx_print(cpx_mtx_t* m) } } -#define __kernel -#define __global -int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2; -int get_global_id(int id) -{ - switch (id) - { - case 0: return GPU_GLOBAL_ID_0; - case 1: return GPU_GLOBAL_ID_1; - case 2: return GPU_GLOBAL_ID_2; - } -} -#include "kernel.cl" - void cpx_mtx_dot(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) { for (int i = 0; i < rowsA; i++) { for (int j = 0; j < colsB; j++) { - GPU_GLOBAL_ID_0 = i; - GPU_GLOBAL_ID_1 = j; - kernel_dot(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB); + kernel_dot(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, i, j); } } } @@ -153,15 +138,125 @@ void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, in int rowsR = rowsA * rowsB; for (int i = 0; i < rowsR; i++) { - GPU_GLOBAL_ID_0 = i; - kernel_knk(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB); + kernel_knk(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, i); } } +/*-----------------------------------------------------------------------------------*/ /*THREADED*/ - /*-----------------------------------------------------------------------------------*/ +typedef struct +{ + float* ptrR; + float* ptrA; + float* ptrB; + int rowsA; + int colsA; + int rowsB; + int colsB; + int delimeterStart; + int delimeterCount; +} cpx_thread_context; + +void* cpx_mtx_knk_threads_run(void *context) +{ + cpx_thread_context* ctx = (cpx_thread_context*)context; + int rowsR = (ctx->rowsA) * (ctx->rowsB); + for (int i = 0; i < (ctx->delimeterCount); i++) + { + kernel_knk(ctx->ptrR, ctx->ptrA, ctx->ptrB, ctx->rowsA, ctx->colsA, ctx->rowsB, ctx->colsB, i + (ctx->delimeterStart)); + } +} + +void cpx_mtx_knk_threads(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) +{ + int delimeter = rowsA * rowsB; + int cores = get_core_count(); + int threadCount = cores; + if (threadCount > delimeter) threadCount = delimeter; + int delimetersPerThread = delimeter / threadCount; + int leftOvers = delimeter % threadCount; + + cpx_thread_context ctx = {ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, 0, 0}; + cpx_thread_context ctxs[threadCount]; + pthread_t threads[threadCount]; + for (int i = 0; i < threadCount; i++) + { + ctxs[i].ptrR = ctx.ptrR; + ctxs[i].ptrA = ctx.ptrA; + ctxs[i].ptrB = ctx.ptrB; + ctxs[i].rowsA = ctx.rowsA; + ctxs[i].colsA = ctx.colsA; + ctxs[i].rowsB = ctx.rowsB; + ctxs[i].colsB = ctx.colsB; + ctxs[i].delimeterStart = i * delimetersPerThread; + ctxs[i].delimeterCount = delimetersPerThread + ((i == threadCount - 1) ? leftOvers : 0); + + if (pthread_create(&(threads[i]), NULL, &cpx_mtx_knk_threads_run, (void*)&(ctxs[i]))) + { + fprintf(stderr, "QAnsel: Thread error. (1)\n"); + exit(1); + } + } + for (uint32_t i = 0; i < threadCount; i++) + { + if (pthread_join(threads[i], NULL)) + { + fprintf(stderr, "QAnsel: Thread error. (2)\n"); + } + } +} +void* cpx_mtx_dot_threads_run(void *context) +{ + cpx_thread_context* ctx = (cpx_thread_context*)context; + for (int i = 0; i < (ctx->rowsA); i++) + { + for (int j = 0; j < (ctx->delimeterCount); j++) + { + kernel_dot(ctx->ptrR, ctx->ptrA, ctx->ptrB, ctx->rowsA, ctx->colsA, ctx->rowsB, ctx->colsB, i, j + (ctx->delimeterStart)); + } + } +} + +void cpx_mtx_dot_threads(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) +{ + int delimeter = colsB; + int cores = get_core_count(); + int threadCount = cores; + if (threadCount > delimeter) threadCount = delimeter; + int delimeterPerThread = delimeter / threadCount; + int leftOvers = delimeter % threadCount; + cpx_thread_context ctx = {ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, 0, 0}; + cpx_thread_context ctxs[threadCount]; + pthread_t threads[threadCount]; + for (int i = 0; i < threadCount; i++) + { + ctxs[i].ptrR = ctx.ptrR; + ctxs[i].ptrA = ctx.ptrA; + ctxs[i].ptrB = ctx.ptrB; + ctxs[i].rowsA = ctx.rowsA; + ctxs[i].colsA = ctx.colsA; + ctxs[i].rowsB = ctx.rowsB; + ctxs[i].colsB = ctx.colsB; + ctxs[i].delimeterStart = i * delimeterPerThread; + ctxs[i].delimeterCount = delimeterPerThread + ((i == threadCount - 1) ? leftOvers : 0); + + if (pthread_create(&(threads[i]), NULL, &cpx_mtx_dot_threads_run, (void*)&(ctxs[i]))) + { + fprintf(stderr, "QAnsel: Thread error. (1)\n"); + exit(1); + } + } + for (uint32_t i = 0; i < threadCount; i++) + { + if (pthread_join(threads[i], NULL)) + { + fprintf(stderr, "QAnsel: Thread error. (2)\n"); + } + } +} +/*-----------------------------------------------------------------------------------*/ /*METAL*/ /*-----------------------------------------------------------------------------------*/ #define CL_USE_DEPRECATED_OPENCL_1_2_APIS @@ -291,7 +386,7 @@ void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col //Setup kernel cl_kernel kernel = clCreateKernel(program, "kernel_dot", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernfelArg); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg); diff --git a/src/kernel.cl b/src/kernel.cl index 2f0c8a6..c5935f1 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -6,13 +6,18 @@ __kernel void kernel_dot const int rowsA, const int colsA, const int rowsB, - const int colsB + const int colsB //{gpu_only} + //const int colsB, {cpu_only} + //const int get_global_id_0, {cpu_only} + //const int get_global_id_1 {cpu_only} ) { const int rowsR = rowsA; const int colsR = colsB; - const int rowR = get_global_id(0); - const int colR = get_global_id(1); + const int rowR = get_global_id(0); //{gpu_only} + const int colR = get_global_id(1); //{gpu_only} + //const int rowR = get_global_id_0; {cpu_only} + //const int colR = get_global_id_1; {cpu_only} int posA, posB; float rR = 0; float iR = 0; @@ -49,34 +54,36 @@ __kernel void kernel_knk const int rowsA, const int colsA, const int rowsB, - const int colsB + const int colsB //{gpu_only} + //const int colsB, {cpu_only} + //const int get_global_id_0 {cpu_only} ) { const int rowsR = rowsA * rowsB; const int colsR = colsA * colsB; - int rowR = get_global_id(0); + const int rowR = get_global_id(0); //{gpu_only} + //const int rowR = get_global_id_0; {cpu_only} for (int colR = 0; colR < colsR; colR++) { - int rowA = rowR / rowsB; - int colA = colR / colsB; - int rowB = rowR % rowsB; - int colB = colR % colsB; + const int rowA = rowR / rowsB; + const int colA = colR / colsB; + const int rowB = rowR % rowsB; + const int colB = colR % colsB; - int posA = rowA * (colsA * 2) + (colA * 2); - int posB = rowB * (colsB * 2) + (colB * 2); + const int posA = rowA * (colsA * 2) + (colA * 2); + const int posB = rowB * (colsB * 2) + (colB * 2); - float rA = ptrA[posA]; - float iA = ptrA[posA + 1]; - float rB = ptrB[posB]; - float iB = ptrB[posB + 1]; + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + const float rB = ptrB[posB]; + const float iB = ptrB[posB + 1]; //(rA + iA)(rB + iB) - float first = rA * rB; - float outer = rA * iB; - float inner = iA * rB; - float lasts = iA * iB; + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; } } - diff --git a/src/kernel.cl.c b/src/kernel.cl.c new file mode 100644 index 0000000..415e3c1 --- /dev/null +++ b/src/kernel.cl.c @@ -0,0 +1,216 @@ +unsigned char src_kernel_cl[] = { + 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x5f, 0x64, 0x6f, 0x74, + 0x0a, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, + 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, + 0x74, 0x72, 0x52, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, + 0x20, 0x70, 0x74, 0x72, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, + 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, + 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, + 0x6f, 0x77, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, + 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, + 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2f, 0x2f, 0x7b, 0x67, + 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, + 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x2c, 0x20, 0x7b, 0x63, 0x70, 0x75, + 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, + 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, + 0x5f, 0x30, 0x2c, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, + 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x5f, 0x31, 0x20, 0x7b, + 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x29, 0x0a, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52, 0x20, 0x3d, 0x20, + 0x72, 0x6f, 0x77, 0x73, 0x41, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, + 0x73, 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, + 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, + 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, + 0x29, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f, 0x6e, + 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3d, + 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, + 0x69, 0x64, 0x28, 0x31, 0x29, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, + 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, + 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x5f, 0x30, 0x3b, 0x20, + 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, + 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, + 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x5f, + 0x31, 0x3b, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, + 0x73, 0x41, 0x2c, 0x20, 0x70, 0x6f, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x52, 0x20, 0x3d, + 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, + 0x74, 0x20, 0x69, 0x52, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, + 0x70, 0x6f, 0x73, 0x52, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, + 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, + 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, + 0x32, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, + 0x20, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, + 0x20, 0x69, 0x20, 0x3c, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x3b, 0x20, + 0x69, 0x2b, 0x2b, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, + 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, + 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x20, 0x2b, 0x20, 0x28, 0x69, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, + 0x70, 0x6f, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x69, 0x20, 0x2a, 0x20, 0x28, + 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, + 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, + 0x6f, 0x61, 0x74, 0x20, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, + 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, + 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, + 0x41, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, + 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, + 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, + 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, + 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, 0x5d, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29, 0x28, 0x72, + 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, + 0x72, 0x73, 0x74, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, + 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, + 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x3d, + 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, + 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, + 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, + 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x52, 0x20, + 0x2b, 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, + 0x61, 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x69, 0x52, 0x20, 0x2b, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, + 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, + 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, + 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, + 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, + 0x20, 0x72, 0x52, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, + 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, + 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, + 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, + 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x69, 0x52, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, + 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, 0x6f, 0x69, + 0x64, 0x20, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x5f, 0x6b, 0x6e, 0x6b, + 0x0a, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, + 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, + 0x74, 0x72, 0x52, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, + 0x20, 0x70, 0x74, 0x72, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, + 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, + 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, + 0x6f, 0x77, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, + 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, + 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2f, 0x2f, 0x7b, 0x67, + 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, + 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x2c, 0x20, 0x7b, 0x63, 0x70, 0x75, + 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, + 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, + 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, + 0x5f, 0x30, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, + 0x7d, 0x0a, 0x29, 0x0a, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, + 0x52, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x20, 0x2a, 0x20, + 0x72, 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, + 0x73, 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, + 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, + 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, + 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x20, 0x2f, + 0x2f, 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, + 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, + 0x5f, 0x30, 0x3b, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, + 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, + 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3d, 0x20, 0x30, + 0x3b, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3c, 0x20, 0x63, 0x6f, 0x6c, + 0x73, 0x52, 0x3b, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x2b, 0x2b, 0x29, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, + 0x72, 0x6f, 0x77, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, + 0x2f, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, + 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x3d, 0x20, 0x63, 0x6f, + 0x6c, 0x52, 0x20, 0x2f, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x3d, + 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x25, 0x20, 0x72, 0x6f, 0x77, 0x73, + 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, + 0x42, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x25, 0x20, 0x63, + 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, + 0x20, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x41, + 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, + 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x2a, + 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, + 0x6f, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x2a, + 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x42, 0x20, 0x2a, 0x20, 0x32, + 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, + 0x72, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, + 0x73, 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, + 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, + 0x6f, 0x73, 0x41, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70, + 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, + 0x70, 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, + 0x31, 0x5d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29, + 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, + 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, + 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e, 0x65, + 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6c, 0x61, 0x73, + 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, + 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, + 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, + 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, + 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, + 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, + 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, + 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x6f, 0x75, 0x74, + 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x00 +}; +unsigned int src_kernel_cl_len = 2552; diff --git a/src/kernel_cpu.cl.c b/src/kernel_cpu.cl.c new file mode 100644 index 0000000..e47ec30 --- /dev/null +++ b/src/kernel_cpu.cl.c @@ -0,0 +1,84 @@ +void kernel_dot +( + float* ptrR, + float* ptrA, + float* ptrB, + const int rowsA, + const int colsA, + const int rowsB, + const int colsB, + const int get_global_id_0, + const int get_global_id_1 +) +{ + const int rowsR = rowsA; + const int colsR = colsB; + const int rowR = get_global_id_0; + const int colR = get_global_id_1; + int posA, posB; + float rR = 0; + float iR = 0; + const int posR = rowR * (colsR * 2) + (colR * 2); + + for (int i = 0; i < colsA; i++) + { + int posA = rowR * (colsA * 2) + (i * 2); + int posB = i * (colsB * 2) + (colR * 2); + + float rA = ptrA[posA]; + float iA = ptrA[posA + 1]; + float rB = ptrB[posB]; + float iB = ptrB[posB + 1]; + + //(rA + iA)(rB + iB) + float first = rA * rB; + float outer = rA * iB; + float inner = iA * rB; + float lasts = iA * iB; + + rR += first + lasts; + iR += outer + inner; + } + ptrR[rowR * (colsR * 2) + (colR * 2)] = rR; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR; +} + +void kernel_knk +( + float* ptrR, + float* ptrA, + float* ptrB, + const int rowsA, + const int colsA, + const int rowsB, + const int colsB, + const int get_global_id_0 +) +{ + const int rowsR = rowsA * rowsB; + const int colsR = colsA * colsB; + const int rowR = get_global_id_0; + for (int colR = 0; colR < colsR; colR++) + { + const int rowA = rowR / rowsB; + const int colA = colR / colsB; + const int rowB = rowR % rowsB; + const int colB = colR % colsB; + + const int posA = rowA * (colsA * 2) + (colA * 2); + const int posB = rowB * (colsB * 2) + (colB * 2); + + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + const float rB = ptrB[posB]; + const float iB = ptrB[posB + 1]; + + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + } +}