From: miha-q <> Date: Tue, 5 Mar 2024 01:54:51 +0000 (-0500) Subject: Mon Mar 4 08:54:51 PM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=68f9501f7d381249524ed62bbb91554146bb2b51;p=QAnsel.git Mon Mar 4 08:54:51 PM EST 2024 --- diff --git a/src/QAnsel.c b/src/QAnsel.c index 8284129..99e3f3d 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -228,9 +228,9 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr us2 = get_time(); printf("\tBare: %lu\n", us2 - us1); #else - if (USE_GPU) + if (USE_GPU && 0) //this one's slower for some reason { - cpx_mtx_knk_threads(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); + cpx_mtx_knk_metal(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); } else if (USE_THREADS) { @@ -242,7 +242,6 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr } #endif - free(filter.ptr); filter.ptr = tmp.ptr; filter.rows = tmp.rows; diff --git a/src/complex.c b/src/complex.c index 00eabce..d539e83 100644 --- a/src/complex.c +++ b/src/complex.c @@ -139,10 +139,7 @@ void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, in int colsR = colsA * colsB; for (int i = 0; i < rowsR; i++) { - for (int j = 0; j < colsR; j++) - { - kernel_knk(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, i, j); - } + kernel_knk(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB, i); } } @@ -169,10 +166,7 @@ void* cpx_mtx_knk_threads_run(void *context) int colsR = (ctx->colsA) * (ctx->colsB); for (int i = 0; i < (ctx->delimeterCount); i++) { - for (int j = 0; j < colsR; j++) - { - kernel_knk(ctx->ptrR, ctx->ptrA, ctx->ptrB, ctx->rowsA, ctx->colsA, ctx->rowsB, ctx->colsB, i + (ctx->delimeterStart), j); - } + kernel_knk(ctx->ptrR, ctx->ptrA, ctx->ptrB, ctx->rowsA, ctx->colsA, ctx->rowsB, ctx->colsB, i + (ctx->delimeterStart)); } } diff --git a/src/kernel.cl b/src/kernel.cl index 852b538..2d11a44 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -56,34 +56,34 @@ __kernel void kernel_knk const int rowsB, const int colsB //{gpu_only} //const int colsB, {cpu_only} - //const int get_global_id_0, {cpu_only} - //const int get_global_id_1 {cpu_only} + //const int get_global_id_0{cpu_only} ) { const int rowsR = rowsA * rowsB; const int colsR = colsA * colsB; const int rowR = get_global_id(0); //{gpu_only} - const int colR = get_global_id(1); //{gpu_only} //const int rowR = get_global_id_0; {cpu_only} - //const int colR = get_global_id_1; {cpu_only} - const int rowA = rowR / rowsB; - const int colA = colR / colsB; - const int rowB = rowR % rowsB; - const int colB = colR % colsB; + for (int colR = 0; colR < colsR; colR++) + { + const int rowA = rowR / rowsB; + const int colA = colR / colsB; + const int rowB = rowR % rowsB; + const int colB = colR % colsB; - const int posA = rowA * (colsA * 2) + (colA * 2); - const int posB = rowB * (colsB * 2) + (colB * 2); + const int posA = rowA * (colsA * 2) + (colA * 2); + const int posB = rowB * (colsB * 2) + (colB * 2); - const float rA = ptrA[posA]; - const float iA = ptrA[posA + 1]; - const float rB = ptrB[posB]; - const float iB = ptrB[posB + 1]; + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + const float rB = ptrB[posB]; + const float iB = ptrB[posB + 1]; - //(rA + iA)(rB + iB) - const float first = rA * rB; - const float outer = rA * iB; - const float inner = iA * rB; - const float lasts = iA * iB; - ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; - ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + } } diff --git a/src/kernel.cl.c b/src/kernel.cl.c index 02a9724..34ea4ad 100644 --- a/src/kernel.cl.c +++ b/src/kernel.cl.c @@ -129,90 +129,88 @@ unsigned char src_kernel_cl[] = { 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, - 0x5f, 0x30, 0x2c, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, - 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, - 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x5f, 0x31, 0x20, 0x7b, - 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x29, 0x0a, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52, 0x20, 0x3d, 0x20, - 0x72, 0x6f, 0x77, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x6f, 0x77, 0x73, - 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x3d, - 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x63, 0x6f, 0x6c, - 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, - 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, - 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, - 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, - 0x6c, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, - 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x3b, 0x20, 0x2f, - 0x2f, 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, - 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, - 0x5f, 0x30, 0x3b, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, - 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, - 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, - 0x5f, 0x69, 0x64, 0x5f, 0x31, 0x3b, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, - 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x41, - 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2f, 0x20, 0x72, 0x6f, - 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x41, 0x20, - 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2f, 0x20, 0x63, 0x6f, 0x6c, - 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x3d, - 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x25, 0x20, 0x72, 0x6f, 0x77, 0x73, - 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x42, 0x20, 0x3d, 0x20, - 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x25, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, - 0x72, 0x6f, 0x77, 0x41, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, - 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, - 0x6c, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, - 0x6f, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x2a, - 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, - 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x42, 0x20, 0x2a, 0x20, 0x32, - 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x41, 0x20, 0x3d, - 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x5d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, - 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, - 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x2b, 0x20, 0x31, 0x5d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70, - 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, - 0x61, 0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x42, - 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, - 0x20, 0x69, 0x41, 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, - 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, - 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, - 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, - 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x5f, 0x30, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, + 0x0a, 0x29, 0x0a, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52, + 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x72, + 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, + 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, + 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, + 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, + 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x20, 0x2f, 0x2f, + 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, + 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, + 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x5f, + 0x30, 0x3b, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, + 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, + 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3d, 0x20, 0x30, 0x3b, + 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3c, 0x20, 0x63, 0x6f, 0x6c, 0x73, + 0x52, 0x3b, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x2b, 0x2b, 0x29, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, + 0x6f, 0x77, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2f, + 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, + 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, + 0x52, 0x20, 0x2f, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x3d, 0x20, + 0x72, 0x6f, 0x77, 0x52, 0x20, 0x25, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x42, + 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x25, 0x20, 0x63, 0x6f, + 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, + 0x70, 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x41, 0x20, + 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, + 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x2a, 0x20, + 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, + 0x73, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x2a, 0x20, + 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, + 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, + 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, + 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, - 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, - 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6c, 0x61, 0x73, - 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, - 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, - 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, - 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x66, 0x69, - 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, - 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, - 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, - 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, - 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, - 0x65, 0x72, 0x3b, 0x0a, 0x7d, 0x00 + 0x69, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, + 0x73, 0x41, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, + 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, + 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x70, + 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, + 0x5d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29, 0x28, + 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, + 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x3d, + 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, + 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, + 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6c, 0x61, 0x73, 0x74, + 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, + 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, + 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, + 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, + 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, + 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, + 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, + 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, + 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x00 }; -unsigned int src_kernel_cl_len = 2574; +unsigned int src_kernel_cl_len = 2551; diff --git a/src/kernel_cpu.cl.c b/src/kernel_cpu.cl.c index f1c8412..434782e 100644 --- a/src/kernel_cpu.cl.c +++ b/src/kernel_cpu.cl.c @@ -52,32 +52,33 @@ void kernel_knk const int colsA, const int rowsB, const int colsB, - const int get_global_id_0, - const int get_global_id_1 + const int get_global_id_0 ) { const int rowsR = rowsA * rowsB; const int colsR = colsA * colsB; const int rowR = get_global_id_0; - const int colR = get_global_id_1; - const int rowA = rowR / rowsB; - const int colA = colR / colsB; - const int rowB = rowR % rowsB; - const int colB = colR % colsB; + for (int colR = 0; colR < colsR; colR++) + { + const int rowA = rowR / rowsB; + const int colA = colR / colsB; + const int rowB = rowR % rowsB; + const int colB = colR % colsB; - const int posA = rowA * (colsA * 2) + (colA * 2); - const int posB = rowB * (colsB * 2) + (colB * 2); + const int posA = rowA * (colsA * 2) + (colA * 2); + const int posB = rowB * (colsB * 2) + (colB * 2); - const float rA = ptrA[posA]; - const float iA = ptrA[posA + 1]; - const float rB = ptrB[posB]; - const float iB = ptrB[posB + 1]; + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + const float rB = ptrB[posB]; + const float iB = ptrB[posB + 1]; - //(rA + iA)(rB + iB) - const float first = rA * rB; - const float outer = rA * iB; - const float inner = iA * rB; - const float lasts = iA * iB; - ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; - ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + } }