From 94f8d8523bc4891df5d1d9222581b27e1f6f3eaa Mon Sep 17 00:00:00 2001 From: miha-q <> Date: Mon, 4 Mar 2024 18:33:23 -0500 Subject: [PATCH] Mon Mar 4 06:33:23 PM EST 2024 --- src/complex.c | 12 +++- src/kernel.cl | 42 +++++++------- src/kernel.cl.c | 136 ++++++++++++++++++++++---------------------- src/kernel_cpu.cl.c | 41 +++++++------ 4 files changed, 120 insertions(+), 111 deletions(-) diff --git a/src/complex.c b/src/complex.c index 4aca6ec..7bab0f8 100644 --- a/src/complex.c +++ b/src/complex.c @@ -136,9 +136,13 @@ void cpx_mtx_dot(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, in void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) { int rowsR = rowsA * rowsB; + int colsR = colsA * colsB; for (int i = 0; i < rowsR; i++) { - kernel_knk(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, i); + for (int j = 0; j < colsR; j++) + { + kernel_knk(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, i, j); + } } } @@ -162,9 +166,13 @@ void* cpx_mtx_knk_threads_run(void *context) { cpx_thread_context* ctx = (cpx_thread_context*)context; int rowsR = (ctx->rowsA) * (ctx->rowsB); + int colsR = (ctx->colsA) * (ctx->colsB); for (int i = 0; i < (ctx->delimeterCount); i++) { - kernel_knk(ctx->ptrR, ctx->ptrA, ctx->ptrB, ctx->rowsA, ctx->colsA, ctx->rowsB, ctx->colsB, i + (ctx->delimeterStart)); + for (int j = 0; j < colsR; j++) + { + kernel_knk(ctx->ptrR, ctx->ptrA, ctx->ptrB, ctx->rowsA, ctx->colsA, ctx->rowsB, ctx->colsB, i + (ctx->delimeterStart), j); + } } } diff --git a/src/kernel.cl b/src/kernel.cl index c5935f1..852b538 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -56,34 +56,34 @@ __kernel void kernel_knk const int rowsB, const int colsB //{gpu_only} //const int colsB, {cpu_only} - //const int get_global_id_0 {cpu_only} + //const int get_global_id_0, {cpu_only} + //const int get_global_id_1 {cpu_only} ) { const int rowsR = rowsA * rowsB; const int colsR = colsA * colsB; const int rowR = get_global_id(0); //{gpu_only} + const int colR = get_global_id(1); //{gpu_only} //const int rowR = get_global_id_0; {cpu_only} - for (int colR = 0; colR < colsR; colR++) - { - const int rowA = rowR / rowsB; - const int colA = colR / colsB; - const int rowB = rowR % rowsB; - const int colB = colR % colsB; + //const int colR = get_global_id_1; {cpu_only} + const int rowA = rowR / rowsB; + const int colA = colR / colsB; + const int rowB = rowR % rowsB; + const int colB = colR % colsB; - const int posA = rowA * (colsA * 2) + (colA * 2); - const int posB = rowB * (colsB * 2) + (colB * 2); + const int posA = rowA * (colsA * 2) + (colA * 2); + const int posB = rowB * (colsB * 2) + (colB * 2); - const float rA = ptrA[posA]; - const float iA = ptrA[posA + 1]; - const float rB = ptrB[posB]; - const float iB = ptrB[posB + 1]; + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + const float rB = ptrB[posB]; + const float iB = ptrB[posB + 1]; - //(rA + iA)(rB + iB) - const float first = rA * rB; - const float outer = rA * iB; - const float inner = iA * rB; - const float lasts = iA * iB; - ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; - ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; - } + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; } diff --git a/src/kernel.cl.c b/src/kernel.cl.c index 415e3c1..02a9724 100644 --- a/src/kernel.cl.c +++ b/src/kernel.cl.c @@ -129,88 +129,90 @@ unsigned char src_kernel_cl[] = { 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, - 0x5f, 0x30, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, - 0x7d, 0x0a, 0x29, 0x0a, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, - 0x52, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x20, 0x2a, 0x20, - 0x72, 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, - 0x73, 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, - 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, - 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, - 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x20, 0x2f, + 0x5f, 0x30, 0x2c, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, + 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x5f, 0x31, 0x20, 0x7b, + 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x29, 0x0a, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52, 0x20, 0x3d, 0x20, + 0x72, 0x6f, 0x77, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x6f, 0x77, 0x73, + 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x3d, + 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x63, 0x6f, 0x6c, + 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, + 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, + 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, + 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, + 0x6c, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, + 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x5f, 0x30, 0x3b, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, - 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, - 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3d, 0x20, 0x30, - 0x3b, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3c, 0x20, 0x63, 0x6f, 0x6c, - 0x73, 0x52, 0x3b, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x2b, 0x2b, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, - 0x72, 0x6f, 0x77, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, - 0x2f, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, - 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x3d, 0x20, 0x63, 0x6f, - 0x6c, 0x52, 0x20, 0x2f, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, + 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, + 0x5f, 0x69, 0x64, 0x5f, 0x31, 0x3b, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, + 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x41, + 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2f, 0x20, 0x72, 0x6f, + 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x41, 0x20, + 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2f, 0x20, 0x63, 0x6f, 0x6c, + 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x25, 0x20, 0x72, 0x6f, 0x77, 0x73, - 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, - 0x42, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x25, 0x20, 0x63, - 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, - 0x20, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x41, - 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, - 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x2a, - 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x42, 0x20, 0x3d, 0x20, + 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x25, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, + 0x72, 0x6f, 0x77, 0x41, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, + 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, + 0x6c, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x42, 0x20, 0x2a, 0x20, 0x32, - 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, - 0x72, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, - 0x73, 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, - 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, - 0x6f, 0x73, 0x41, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x41, 0x20, 0x3d, + 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x5d, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, + 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, + 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x2b, 0x20, 0x31, 0x5d, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, - 0x70, 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, - 0x31, 0x5d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29, - 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, + 0x61, 0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x42, + 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, + 0x20, 0x69, 0x41, 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, + 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, - 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e, 0x65, - 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, + 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, + 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, + 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, + 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, - 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, - 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, - 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, - 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, - 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, - 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, - 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, - 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x6f, 0x75, 0x74, - 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x00 + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, + 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, + 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, + 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x66, 0x69, + 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, + 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, + 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, + 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, + 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, + 0x65, 0x72, 0x3b, 0x0a, 0x7d, 0x00 }; -unsigned int src_kernel_cl_len = 2552; +unsigned int src_kernel_cl_len = 2574; diff --git a/src/kernel_cpu.cl.c b/src/kernel_cpu.cl.c index e47ec30..f1c8412 100644 --- a/src/kernel_cpu.cl.c +++ b/src/kernel_cpu.cl.c @@ -52,33 +52,32 @@ void kernel_knk const int colsA, const int rowsB, const int colsB, - const int get_global_id_0 + const int get_global_id_0, + const int get_global_id_1 ) { const int rowsR = rowsA * rowsB; const int colsR = colsA * colsB; const int rowR = get_global_id_0; - for (int colR = 0; colR < colsR; colR++) - { - const int rowA = rowR / rowsB; - const int colA = colR / colsB; - const int rowB = rowR % rowsB; - const int colB = colR % colsB; + const int colR = get_global_id_1; + const int rowA = rowR / rowsB; + const int colA = colR / colsB; + const int rowB = rowR % rowsB; + const int colB = colR % colsB; - const int posA = rowA * (colsA * 2) + (colA * 2); - const int posB = rowB * (colsB * 2) + (colB * 2); + const int posA = rowA * (colsA * 2) + (colA * 2); + const int posB = rowB * (colsB * 2) + (colB * 2); - const float rA = ptrA[posA]; - const float iA = ptrA[posA + 1]; - const float rB = ptrB[posB]; - const float iB = ptrB[posB + 1]; + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + const float rB = ptrB[posB]; + const float iB = ptrB[posB + 1]; - //(rA + iA)(rB + iB) - const float first = rA * rB; - const float outer = rA * iB; - const float inner = iA * rB; - const float lasts = iA * iB; - ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; - ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; - } + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; } -- 2.39.5