From 21a961a0faad4c94c3bc094e439b3509efecaeed Mon Sep 17 00:00:00 2001 From: miha-q <> Date: Tue, 5 Mar 2024 21:59:26 -0500 Subject: [PATCH] Tue Mar 5 09:59:26 PM EST 2024 --- src/QAnsel.c | 21 +++++--- src/complex.c | 102 +++++++++++++++++++++++++++++++++++ src/kernel.cl | 50 ++++++++++++++++++ src/kernel.cl.c | 126 +++++++++++++++++++++++++++++++++++++++++++- src/kernel_cpu.cl.c | 49 +++++++++++++++++ 5 files changed, 340 insertions(+), 8 deletions(-) diff --git a/src/QAnsel.c b/src/QAnsel.c index 833970a..8c2fc6d 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -213,26 +213,35 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr tmp.ptr = malloc(tmp.rows * (tmp.cols * 2) * sizeof(float)); #ifdef SPEED_TEST - printf("%ix%i (knk)\n", tmp.rows, tmp.cols); + printf("(%ix%i);(%ix%i) (knk)\n", tmp.rows, tmp.cols, gate.rows, gate.cols); unsigned long int us1, us2; us1 = get_time(); cpx_mtx_knk_metal(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); us2 = get_time(); printf("\tMetal: %lu\n", us2 - us1); us1 = get_time(); + cpx_mtx_knk_metal_2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); + us2 = get_time(); + printf("\tMetal2: %lu\n", us2 - us1); + us1 = get_time(); cpx_mtx_knk_threads(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); us2 = get_time(); printf("\tThreads: %lu\n", us2 - us1); us1 = get_time(); cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); us2 = get_time(); - printf("\tBare(1): %lu\n", us2 - us1); + printf("\tBare: %lu\n", us2 - us1); + + //us1 = get_time(); + //cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); + //us2 = get_time(); + //printf("\tTranspose: %lu\n", us2 - us1); #else - if (USE_GPU) //this one's slower for some reason + if (USE_GPU && tmp.rows >= 512) { - cpx_mtx_knk_metal(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); + cpx_mtx_knk_metal_2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); } - else if (USE_THREADS) + else if (USE_THREADS && tmp.rows >= 512) { cpx_mtx_knk_threads(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); } @@ -270,7 +279,7 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr { cpx_mtx_dot_metal(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols); } - else if (USE_THREADS) + else if (USE_THREADS && tmp.rows >= 512) { cpx_mtx_dot_threads(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols); } diff --git a/src/complex.c b/src/complex.c index e1eb993..e4af56b 100644 --- a/src/complex.c +++ b/src/complex.c @@ -548,6 +548,108 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); } + + + +void cpx_mtx_knk_metal_2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) +{ + int rowsR = rowsA * rowsB; + int colsR = colsA * colsB; + //Create buffers + size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); + size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); + cl_int err; + cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer); + cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer); + + //Populate buffers + err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); + gpuerr(clEnqueueWriteBuffer); + + //Load and compile program + cl_program program; + if (cpx_mtx_cache == NULL) + { + program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel_cl}, NULL, &err); + gpuerr(clCreateProgramWithSource); + err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); + size_t log_size; + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* log = malloc(log_size); + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("%s", log); + free(log); + exit(1); + } + err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); + gpuerr(clGetProgramInfo); + cpx_mtx_cache = malloc(cpx_mtx_cache_len); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); + gpuerr(clGetProgramInfo); + } + else + { + program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); + gpuerr(clCreateProgramWithBinary); + err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); + size_t log_size; + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* log = malloc(log_size); + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("%s", log); + free(log); + exit(1); + } + } + + //Setup kernel + + float gate0 = ptrB[0]; + float gate1 = ptrB[1]; + float gate2 = ptrB[2]; + float gate3 = ptrB[3]; + float gate4 = ptrB[4]; + float gate5 = ptrB[5]; + float gate6 = ptrB[6]; + float gate7 = ptrB[7]; + cl_kernel kernel = clCreateKernel(program, "kernel_knk_3", &err); gpuerr(clCreateKernel); + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg); + + //Run the program + err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR}, NULL, 0, NULL, NULL); + gpuerr(clEnqueueNDRangeKernel); + + //Wait for completion + err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); + err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); + + //Read results + err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); + gpuerr(clEnqueueReadBuffer); + + //Clean up + err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); + err = clReleaseProgram(program); gpuerr(clReleaseProgram); + err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); + err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); +} /*-----------------------------------------------------------------------------------*/ diff --git a/src/kernel.cl b/src/kernel.cl index 37d178f..864c1b8 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -167,3 +167,53 @@ __kernel void kernel_knk_2 ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR; } + +__kernel void kernel_knk_3 +( + __global float* ptrR, + __global float* ptrA, + const int rowsA, + const int colsA, + const float gate0, + const float gate1, + const float gate2, + const float gate3, + const float gate4, + const float gate5, + const float gate6, + const float gate7 + //, const int get_global_id_0 {cpu_only} +) +{ + const int rowsR = rowsA * 2; + const int colsR = colsA * 2; + const int rowR = get_global_id(0); //{gpu_only} + //const int rowR = get_global_id_0; {cpu_only} + for (int colR = 0; colR < colsR; colR++) + { + const int rowA = rowR / 2; + const int colA = colR / 2; + const int rowB = rowR % 2; + const int colB = colR % 2; + float rB, iB; + switch ((rowB << 1) | colB) + { + case 0b00: rB = gate0; iB = gate1; break; + case 0b01: rB = gate2; iB = gate3; break; + case 0b10: rB = gate4; iB = gate5; break; + case 0b11: rB = gate6; iB = gate7; break; + } + + const int posA = rowA * (colsA * 2) + (colA * 2); + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + } +} \ No newline at end of file diff --git a/src/kernel.cl.c b/src/kernel.cl.c index bcc822b..64b873a 100644 --- a/src/kernel.cl.c +++ b/src/kernel.cl.c @@ -422,6 +422,128 @@ unsigned char src_kernel_cl[] = { 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x69, 0x52, 0x3b, 0x0a, 0x0a, - 0x7d, 0x00 + 0x7d, 0x0a, 0x0a, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x20, + 0x76, 0x6f, 0x69, 0x64, 0x20, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x5f, + 0x6b, 0x6e, 0x6b, 0x5f, 0x33, 0x0a, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, + 0x61, 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x52, 0x2c, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, + 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x41, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, + 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, + 0x6f, 0x6c, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, + 0x74, 0x65, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, + 0x65, 0x31, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, + 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x33, + 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x34, 0x2c, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, + 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x35, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, + 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x36, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, + 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x37, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x2f, 0x2f, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, + 0x6e, 0x74, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x5f, 0x69, 0x64, 0x5f, 0x30, 0x20, 0x7b, 0x63, 0x70, 0x75, 0x5f, + 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x29, 0x0a, 0x7b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, + 0x72, 0x6f, 0x77, 0x73, 0x52, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, + 0x41, 0x20, 0x2a, 0x20, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, + 0x73, 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, + 0x20, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, + 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, + 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, + 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, + 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x5f, 0x30, 0x3b, 0x20, + 0x7b, 0x63, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74, 0x20, + 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x20, 0x63, 0x6f, + 0x6c, 0x52, 0x20, 0x3c, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x3b, 0x20, + 0x63, 0x6f, 0x6c, 0x52, 0x2b, 0x2b, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x41, + 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2f, 0x20, 0x32, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x41, 0x20, + 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2f, 0x20, 0x32, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x3d, + 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x25, 0x20, 0x32, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x42, 0x20, 0x3d, 0x20, + 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x25, 0x20, 0x32, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, + 0x72, 0x42, 0x2c, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x73, 0x77, 0x69, 0x74, 0x63, 0x68, 0x20, 0x28, + 0x28, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x3c, 0x3c, 0x20, 0x31, 0x29, 0x20, + 0x7c, 0x20, 0x63, 0x6f, 0x6c, 0x42, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x30, + 0x62, 0x30, 0x30, 0x3a, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, + 0x74, 0x65, 0x30, 0x3b, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, + 0x74, 0x65, 0x31, 0x3b, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x61, 0x73, 0x65, 0x20, 0x30, 0x62, 0x30, 0x31, 0x3a, 0x20, 0x72, + 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, 0x32, 0x3b, 0x20, 0x69, + 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, 0x33, 0x3b, 0x20, 0x62, + 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x30, + 0x62, 0x31, 0x30, 0x3a, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, + 0x74, 0x65, 0x34, 0x3b, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, + 0x74, 0x65, 0x35, 0x3b, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x61, 0x73, 0x65, 0x20, 0x30, 0x62, 0x31, 0x31, 0x3a, 0x20, 0x72, + 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, 0x36, 0x3b, 0x20, 0x69, + 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, 0x37, 0x3b, 0x20, 0x62, + 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, + 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x41, 0x20, 0x2a, + 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x2a, 0x20, 0x32, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, + 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, + 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, + 0x69, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, + 0x73, 0x41, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, + 0x2b, 0x20, 0x69, 0x41, 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, + 0x42, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, + 0x69, 0x72, 0x73, 0x74, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, + 0x72, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, + 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, + 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, + 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, + 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, + 0x74, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, + 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, + 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, + 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, + 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, + 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, + 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, + 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, + 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, + 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, + 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, + 0x00 }; -unsigned int src_kernel_cl_len = 5078; +unsigned int src_kernel_cl_len = 6541; diff --git a/src/kernel_cpu.cl.c b/src/kernel_cpu.cl.c index 908a689..ca9f605 100644 --- a/src/kernel_cpu.cl.c +++ b/src/kernel_cpu.cl.c @@ -159,3 +159,52 @@ void kernel_knk_2 ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR; } + +void kernel_knk_3 +( + float* ptrR, + float* ptrA, + const int rowsA, + const int colsA, + const float gate0, + const float gate1, + const float gate2, + const float gate3, + const float gate4, + const float gate5, + const float gate6, + const float gate7 + , const int get_global_id_0 +) +{ + const int rowsR = rowsA * 2; + const int colsR = colsA * 2; + const int rowR = get_global_id_0; + for (int colR = 0; colR < colsR; colR++) + { + const int rowA = rowR / 2; + const int colA = colR / 2; + const int rowB = rowR % 2; + const int colB = colR % 2; + float rB, iB; + switch ((rowB << 1) | colB) + { + case 0b00: rB = gate0; iB = gate1; break; + case 0b01: rB = gate2; iB = gate3; break; + case 0b10: rB = gate4; iB = gate5; break; + case 0b11: rB = gate6; iB = gate7; break; + } + + const int posA = rowA * (colsA * 2) + (colA * 2); + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; + } +} -- 2.39.5