From: miha-q <> Date: Thu, 7 Mar 2024 23:10:18 +0000 (-0500) Subject: Thu Mar 7 06:10:18 PM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=91f697972a105735ec4624a77548e5a24207e48a;p=QAnsel.git Thu Mar 7 06:10:18 PM EST 2024 --- diff --git a/src/.kernel.tmp.1 b/src/.kernel.tmp.1 index 253271d..31e3818 100644 --- a/src/.kernel.tmp.1 +++ b/src/.kernel.tmp.1 @@ -235,9 +235,9 @@ __kernel void kernel_knk_2x2_Rx4 { const int rowsR = rowsA * 2; const int colsR = colsA * 2; - const int block = get_global_id(0) * 8; //{gpu_only} + const int block = get_global_id(0) * 2 * 128; //{gpu_only} - for (int rowR = block; rowR < block + 8; rowR += 2) + for (int rowR = block; rowR < block + 2 * 128; rowR += 2) { for (int colR = 0; colR < colsR; colR += 2) { diff --git a/src/.kernel.tmp.2 b/src/.kernel.tmp.2 index 7745e0b..37c5ce4 100644 --- a/src/.kernel.tmp.2 +++ b/src/.kernel.tmp.2 @@ -235,9 +235,9 @@ __kernel void kernel_knk_2x2_Rx4 { const int rowsR = rowsA * 2; const int colsR = colsA * 2; - const int block = get_global_id(0) * 8; //{gpu_only} + const int block = get_global_id(0) * 2 * 128; //{gpu_only} - for (int rowR = block; rowR < block + 8; rowR += 2) + for (int rowR = block; rowR < block + 2 * 128; rowR += 2) { for (int colR = 0; colR < colsR; colR += 2) { diff --git a/src/QAnsel.c b/src/QAnsel.c index 72625f4..b024b7e 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -228,7 +228,7 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr us2 = get_time(); printf("\tMetal2x2_R: %lu\n", us2 - us1); - if (filter.rows / 8 == 0) + if (filter.rows / 128 == 0) { printf("\tMetal2x2_Rx4: Invalid\n"); } diff --git a/src/complex.c b/src/complex.c index 52508c8..3545ace 100644 --- a/src/complex.c +++ b/src/complex.c @@ -172,7 +172,7 @@ void cpx_mtx_knk_2x2_Rx4(float* ptrR, float* ptrA, float* ptrB, int rowsA, int c { int rowsR = rowsA * rowsB; int colsR = colsA * colsB; - for (int i = 0; i < rowsR / 8; i++) + for (int i = 0; i < rowsR / (2 * 128); i++) { kernel_knk_2x2_Rx4(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i); } @@ -862,7 +862,7 @@ void cpx_mtx_knk_metal_2x2_Rx4(float* ptrR, float* ptrA, float* ptrB, int rowsA, err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg); //Run the program - err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 8}, NULL, 0, NULL, NULL); + err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / (2 * 128)}, NULL, 0, NULL, NULL); gpuerr(clEnqueueNDRangeKernel); //Wait for completion diff --git a/src/kernel.cl b/src/kernel.cl index 9da5ec6..34f291d 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -248,10 +248,10 @@ __kernel void kernel_knk_2x2_Rx4 { const int rowsR = rowsA * 2; const int colsR = colsA * 2; - const int block = get_global_id(0) * 8; //{gpu_only} - const int block = get_global_id_0 * 8; //{cpu_only} + const int block = get_global_id(0) * 2 * 128; //{gpu_only} + const int block = get_global_id_0 * 2 * 128; //{cpu_only} - for (int rowR = block; rowR < block + 8; rowR += 2) + for (int rowR = block; rowR < block + 2 * 128; rowR += 2) { for (int colR = 0; colR < colsR; colR += 2) { diff --git a/src/kernel_cpu.cl b/src/kernel_cpu.cl index f4e692d..e44323f 100644 --- a/src/kernel_cpu.cl +++ b/src/kernel_cpu.cl @@ -242,9 +242,9 @@ void kernel_knk_2x2_Rx4 { const int rowsR = rowsA * 2; const int colsR = colsA * 2; - const int block = get_global_id_0 * 8; //{cpu_only} + const int block = get_global_id_0 * 2 * 128; //{cpu_only} - for (int rowR = block; rowR < block + 8; rowR += 2) + for (int rowR = block; rowR < block + 2 * 128; rowR += 2) { for (int colR = 0; colR < colsR; colR += 2) { diff --git a/src/kernel_gpu.cl b/src/kernel_gpu.cl index 1a494c3..3927dfe 100644 --- a/src/kernel_gpu.cl +++ b/src/kernel_gpu.cl @@ -574,11 +574,12 @@ unsigned char kernel_gpu[] = { 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x20, 0x2a, - 0x20, 0x38, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f, - 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, - 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, - 0x3d, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x20, 0x72, 0x6f, 0x77, - 0x52, 0x20, 0x3c, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x20, 0x2b, 0x20, + 0x20, 0x32, 0x20, 0x2a, 0x20, 0x31, 0x32, 0x38, 0x3b, 0x20, 0x2f, 0x2f, + 0x7b, 0x67, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74, + 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x62, 0x6c, 0x6f, 0x63, + 0x6b, 0x3b, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3c, 0x20, 0x62, 0x6c, + 0x6f, 0x63, 0x6b, 0x20, 0x2b, 0x20, 0x32, 0x20, 0x2a, 0x20, 0x31, 0x32, 0x38, 0x3b, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x3d, 0x20, 0x32, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74, @@ -745,4 +746,4 @@ unsigned char kernel_gpu[] = { 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x00 }; -unsigned int kernel_gpu_len = 8930; +unsigned int kernel_gpu_len = 8942;