From: miha-q <> Date: Thu, 7 Mar 2024 23:48:19 +0000 (-0500) Subject: Thu Mar 7 06:48:19 PM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=291ab9a6825ef3a58c565351dea83813576030be;p=QAnsel.git Thu Mar 7 06:48:19 PM EST 2024 --- diff --git a/src/.kernel.tmp.1 b/src/.kernel.tmp.1 index 04c0834..1da9581 100644 --- a/src/.kernel.tmp.1 +++ b/src/.kernel.tmp.1 @@ -135,12 +135,12 @@ __kernel void kernel_knk_2x2 ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; break; case 2: - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; break; case 3: - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; break; } diff --git a/src/.kernel.tmp.2 b/src/.kernel.tmp.2 index 32bfc81..1e791cc 100644 Binary files a/src/.kernel.tmp.2 and b/src/.kernel.tmp.2 differ diff --git a/src/complex.c b/src/complex.c index a9ca17c..d6cbb4d 100644 --- a/src/complex.c +++ b/src/complex.c @@ -227,7 +227,22 @@ void* cpx_mtx_knk_threads_2x2_run(void *context) int colsR = (ctx->colsA) * (ctx->colsB); for (int i = 0; i < (ctx->delimeterCount); i++) { - kernel_knk_2x2(ctx->ptrR, ctx->ptrA, ctx->rowsA, ctx->colsA, ctx->ptrB[0], ctx->ptrB[1], ctx->ptrB[2], ctx->ptrB[3], ctx->ptrB[4], ctx->ptrB[5], ctx->ptrB[6], ctx->ptrB[7], i + (ctx->delimeterStart)); + kernel_knk_2x2 + ( + ctx->ptrR, + ctx->ptrA, + ctx->rowsA, + ctx->colsA, + ctx->ptrB[0], + ctx->ptrB[1], + ctx->ptrB[2], + ctx->ptrB[3], + ctx->ptrB[4], + ctx->ptrB[5], + ctx->ptrB[6], + ctx->ptrB[7], + i + (ctx->delimeterStart) + ); } } @@ -681,7 +696,8 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg); //Run the program - err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 2}, NULL, 0, NULL, NULL); + size_t q = 16; + err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 2}, &q, 0, NULL, NULL); gpuerr(clEnqueueNDRangeKernel); //Wait for completion diff --git a/src/kernel_cpu.cl b/src/kernel_cpu.cl index 7376958..f8a12f2 100644 --- a/src/kernel_cpu.cl +++ b/src/kernel_cpu.cl @@ -139,12 +139,12 @@ void kernel_knk_2x2 ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; break; case 2: - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; break; case 3: - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; break; } diff --git a/src/kernel_gpu.cl b/src/kernel_gpu.cl index 6345e20..e5e8787 100644 --- a/src/kernel_gpu.cl +++ b/src/kernel_gpu.cl @@ -332,14 +332,14 @@ unsigned char kernel_gpu[] = { 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x32, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x28, 0x72, - 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x28, + 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2b, 0x20, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, - 0x5b, 0x28, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x30, 0x29, 0x20, + 0x5b, 0x28, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2b, 0x20, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, @@ -351,14 +351,14 @@ unsigned char kernel_gpu[] = { 0x73, 0x65, 0x20, 0x33, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x28, 0x72, 0x6f, 0x77, 0x52, - 0x20, 0x2b, 0x20, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, + 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x28, 0x72, - 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x28, + 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d,