From: miha-q <> Date: Thu, 7 Mar 2024 23:16:15 +0000 (-0500) Subject: Thu Mar 7 06:16:15 PM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=169afd37e117e83c47804895e4e25cf4728b8d04;p=QAnsel.git Thu Mar 7 06:16:15 PM EST 2024 --- diff --git a/src/QAnsel.c b/src/QAnsel.c index 5f4ef13..ac7e82b 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -223,23 +223,7 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr cpx_mtx_knk_metal_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); us2 = get_time(); printf("\tMetal2x2: %lu\n", us2 - us1); - us1 = get_time(); - cpx_mtx_knk_metal_2x2_R(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); - us2 = get_time(); - printf("\tMetal2x2_R: %lu\n", us2 - us1); - if (filter.rows / 1024 == 0) - { - printf("\tMetal2x2_Rx4: Invalid\n"); - } - else - { - us1 = get_time(); - cpx_mtx_knk_metal_2x2_Rx4(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); - us2 = get_time(); - printf("\tMetal2x2_Rx4: %lu\n", us2 - us1); - } - us1 = get_time(); for (int i = 0; i < filter.rows; i++) { diff --git a/src/complex.c b/src/complex.c index 79e1e42..02f8825 100644 --- a/src/complex.c +++ b/src/complex.c @@ -144,37 +144,13 @@ void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, in } } - void cpx_mtx_knk_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) { int rowsR = rowsA * rowsB; int colsR = colsA * colsB; for (int i = 0; i < rowsR / 2; i++) { - for (int j = 0; j < colsR / 2; j++) - { - kernel_knk_2x2(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i, j); - } - } -} - -void cpx_mtx_knk_2x2_R(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) -{ - int rowsR = rowsA * rowsB; - int colsR = colsA * colsB; - for (int i = 0; i < rowsR / 2; i++) - { - kernel_knk_2x2_R(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i); - } -} - -void cpx_mtx_knk_2x2_Rx4(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) -{ - int rowsR = rowsA * rowsB; - int colsR = colsA * colsB; - for (int i = 0; i < rowsR / (2 * 1024); i++) - { - kernel_knk_2x2_Rx4(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i); + kernel_knk_2x2(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i); } } @@ -632,109 +608,6 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int //Setup kernel - float gate0 = ptrB[0]; - float gate1 = ptrB[1]; - float gate2 = ptrB[2]; - float gate3 = ptrB[3]; - float gate4 = ptrB[4]; - float gate5 = ptrB[5]; - float gate6 = ptrB[6]; - float gate7 = ptrB[7]; - cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg); - - //Run the program - err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, (size_t[]){rowsR / 2, colsR / 2}, NULL, 0, NULL, NULL); - gpuerr(clEnqueueNDRangeKernel); - - //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); - err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); - - //Read results - err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); - - //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); -} - -//This only works if ptrA is NxM where both N and X are divisible by two, -// and ptrB is 2x2. If both are true, this is much more efficient than -// the standard knk_metal() function. -void cpx_mtx_knk_metal_2x2_R(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) -{ - int rowsR = rowsA * rowsB; - int colsR = colsA * colsB; - //Create buffers - size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); - size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); - cl_int err; - cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer); - - //Populate buffers - err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - - //Load and compile program - cl_program program; - if (cpx_mtx_cache == NULL) - { - program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err); - gpuerr(clCreateProgramWithSource); - err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); - size_t log_size; - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char* log = malloc(log_size); - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - printf("%s", log); - free(log); - exit(1); - } - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); - gpuerr(clGetProgramInfo); - cpx_mtx_cache = malloc(cpx_mtx_cache_len); - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); - gpuerr(clGetProgramInfo); - } - else - { - program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); - err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); - size_t log_size; - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char* log = malloc(log_size); - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - printf("%s", log); - free(log); - exit(1); - } - } - - //Setup kernel - float gate0 = ptrB[0]; float gate1 = ptrB[1]; float gate2 = ptrB[2]; @@ -777,113 +650,4 @@ void cpx_mtx_knk_metal_2x2_R(float* ptrR, float* ptrA, float* ptrB, int rowsA, i } -//This only works if ptrA is NxM where both N and X are divisible by two, -// and ptrB is 2x2. If both are true, this is much more efficient than -// the standard knk_metal() function. -void cpx_mtx_knk_metal_2x2_Rx4(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) -{ - int rowsR = rowsA * rowsB; - int colsR = colsA * colsB; - //Create buffers - size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); - size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); - cl_int err; - cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer); - - //Populate buffers - err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - - //Load and compile program - cl_program program; - if (cpx_mtx_cache == NULL) - { - program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err); - gpuerr(clCreateProgramWithSource); - err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); - size_t log_size; - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char* log = malloc(log_size); - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - printf("%s", log); - free(log); - exit(1); - } - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); - gpuerr(clGetProgramInfo); - cpx_mtx_cache = malloc(cpx_mtx_cache_len); - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); - gpuerr(clGetProgramInfo); - } - else - { - program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); - err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); - size_t log_size; - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char* log = malloc(log_size); - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - printf("%s", log); - free(log); - exit(1); - } - } - - //Setup kernel - - float gate0 = ptrB[0]; - float gate1 = ptrB[1]; - float gate2 = ptrB[2]; - float gate3 = ptrB[3]; - float gate4 = ptrB[4]; - float gate5 = ptrB[5]; - float gate6 = ptrB[6]; - float gate7 = ptrB[7]; - cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2_Rx4", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg); - - //Run the program - err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / (2 * 1024)}, NULL, 0, NULL, NULL); - gpuerr(clEnqueueNDRangeKernel); - - //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); - err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); - - //Read results - err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); - - //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); -} - - -/*-----------------------------------------------------------------------------------*/ - - - - #endif \ No newline at end of file diff --git a/src/kernel.cl b/src/kernel.cl index 12646ed..d11e287 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -87,78 +87,6 @@ __kernel void kernel_knk } __kernel void kernel_knk_2x2 -( - __global float* ptrR, - __global float* ptrA, - const int rowsA, - const int colsA, - const float gate0, - const float gate1, - const float gate2, - const float gate3, - const float gate4, - const float gate5, - const float gate6, - const float gate7 - , const int get_global_id_0 //{cpu_only} - , const int get_global_id_1 //{cpu_only} -) -{ - const int rowsR = rowsA * 2; - const int colsR = colsA * 2; - const int rowR = get_global_id(0) * 2; //{gpu_only} - const int colR = get_global_id(1) * 2; //{gpu_only} - const int rowR = get_global_id_0 * 2; //{cpu_only} - const int colR = get_global_id_1 * 2; //{cpu_only} - - const int rowA = rowR / 2; - const int colA = colR / 2; - const int posA = rowA * (colsA * 2) + (colA * 2); - const float rA = ptrA[posA]; - const float iA = ptrA[posA + 1]; - - for (int i = 0; i < 4; i++) - { - float rB, iB; - switch (i) - { - case 0: rB = gate0; iB = gate1; break; - case 1: rB = gate2; iB = gate3; break; - case 2: rB = gate4; iB = gate5; break; - case 3: rB = gate6; iB = gate7; break; - } - - //(rA + iA)(rB + iB) - const float first = rA * rB; - const float outer = rA * iB; - const float inner = iA * rB; - const float lasts = iA * iB; - switch (i) - { - case 0: - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; - break; - case 1: - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; - break; - case 2: - ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; - ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; - break; - case 3: - ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; - ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; - break; - } - - } - -} - - -__kernel void kernel_knk_2x2_R ( __global float* ptrR, __global float* ptrA, @@ -227,77 +155,3 @@ __kernel void kernel_knk_2x2_R } } } - - -__kernel void kernel_knk_2x2_Rx4 -( - __global float* ptrR, - __global float* ptrA, - const int rowsA, - const int colsA, - const float gate0, - const float gate1, - const float gate2, - const float gate3, - const float gate4, - const float gate5, - const float gate6, - const float gate7 - , const int get_global_id_0 //{cpu_only} -) -{ - const int rowsR = rowsA * 2; - const int colsR = colsA * 2; - const int block = get_global_id(0) * 2 * 1024; //{gpu_only} - const int block = get_global_id_0 * 2 * 1024; //{cpu_only} - - for (int rowR = block; rowR < block + 2 * 1024; rowR += 2) - { - for (int colR = 0; colR < colsR; colR += 2) - { - const int rowA = rowR / 2; - const int colA = colR / 2; - const int posA = rowA * (colsA * 2) + (colA * 2); - const float rA = ptrA[posA]; - const float iA = ptrA[posA + 1]; - - for (int i = 0; i < 4; i++) - { - float rB, iB; - switch (i) - { - case 0: rB = gate0; iB = gate1; break; - case 1: rB = gate2; iB = gate3; break; - case 2: rB = gate4; iB = gate5; break; - case 3: rB = gate6; iB = gate7; break; - } - - //(rA + iA)(rB + iB) - const float first = rA * rB; - const float outer = rA * iB; - const float inner = iA * rB; - const float lasts = iA * iB; - switch (i) - { - case 0: - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; - break; - case 1: - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; - ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; - break; - case 2: - ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; - ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; - break; - case 3: - ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; - ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; - break; - } - - } - } - } -} \ No newline at end of file