From 09de4a99a6a5e683b2269dd34737749a0bb0e9c5 Mon Sep 17 00:00:00 2001 From: miha-q <> Date: Sun, 3 Mar 2024 22:57:48 -0500 Subject: [PATCH] Sun Mar 3 10:57:48 PM EST 2024 --- Makefile | 9 ++ src/QAnsel.c | 24 +++++- src/complex.c | 1 - src/gpu/gpu.c | 211 +++++++++++++++++++++++++++++++++++++++++++++ src/gpu/gpu_knk.cl | 51 +++++------ src/gpu/gpu_test.c | 13 +++ 6 files changed, 278 insertions(+), 31 deletions(-) diff --git a/Makefile b/Makefile index 66dac57..65d6226 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,17 @@ all: + + #gpu mmul mv src/gpu/gpu_mmul.cl src/gpu/.gpu_mmul.cl bash -c 'echo -ne "$$(cat src/gpu/.gpu_mmul.cl)\x00" > src/gpu/gpu_mmul.cl' xxd -i src/gpu/gpu_mmul.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/gpu/gpu_mmul.cl.c mv src/gpu/.gpu_mmul.cl src/gpu/gpu_mmul.cl + + #gpu knk + mv src/gpu/gpu_knk.cl src/gpu/.gpu_knk.cl + bash -c 'echo -ne "$$(cat src/gpu/.gpu_knk.cl)\x00" > src/gpu/gpu_knk.cl' + xxd -i src/gpu/gpu_knk.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/gpu/gpu_knk.cl.c + mv src/gpu/.gpu_knk.cl src/gpu/gpu_knk.cl + gcc src/QAnsel.c -g -o bin/QAnsel -lm -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread rm -f src/gpu/*.cl.c diff --git a/src/QAnsel.c b/src/QAnsel.c index dc3ab57..cb5f0c3 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -218,12 +218,35 @@ void qansel_instruction(cpx_mtx_t* stateVector, uint8_t qubitCount, QInstr* inst tmp.rows = filter.rows * gate.rows; tmp.cols = filter.cols * gate.cols; tmp.ptr = malloc((tmp.rows * 2) * (tmp.cols * 2) * sizeof(float)); + + #ifdef GPU_ENABLED + if (USE_GPU) + { + GPU_knk + ( + tmp.ptr, tmp.rows, tmp.cols, + filter.ptr, filter.rows, filter.cols, + gate.ptr, gate.rows, gate.cols + ); + } + else + { + cpx_ncpx_knk_mt + ( + tmp.ptr, tmp.rows, tmp.cols, + filter.ptr, filter.rows, filter.cols, + gate.ptr, gate.rows, gate.cols + ); + } + #else cpx_ncpx_knk_mt ( tmp.ptr, tmp.rows, tmp.cols, filter.ptr, filter.rows, filter.cols, gate.ptr, gate.rows, gate.cols ); + #endif + free(filter.ptr); filter.ptr = tmp.ptr; @@ -1373,7 +1396,6 @@ void main(int argc, char** argv) #ifdef GPU_ENABLED USE_GPU = GPU_init(); #endif - USE_GPU = 0; RANDOM_FILE = fopen("/dev/TrueRNG0", "r"); if (!RANDOM_FILE) RANDOM_FILE = fopen("/dev/random", "r"); process(argc, argv); diff --git a/src/complex.c b/src/complex.c index 54548e3..453af0b 100644 --- a/src/complex.c +++ b/src/complex.c @@ -377,7 +377,6 @@ void* cpx_ncpx_knk_mtc(void *context) ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; - } } } diff --git a/src/gpu/gpu.c b/src/gpu/gpu.c index 2930596..adcefe3 100644 --- a/src/gpu/gpu.c +++ b/src/gpu/gpu.c @@ -1,10 +1,13 @@ #include "gpu_mmul.cl.c" +#include "gpu_knk.cl.c" cl_platform_id GPU_platform_id; cl_device_id GPU_device_id; cl_context GPU_context; cl_command_queue GPU_command_queue; unsigned char* GPU_mmul_cache = NULL; size_t GPU_mmul_cache_len = 0; +unsigned char* GPU_knk_cache = NULL; +size_t GPU_knk_cache_len = 0; uint8_t GPU_init() { @@ -254,4 +257,212 @@ void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, fprintf(stderr, "GPU fatal error: clReleaseMemObject() failed.\n"); exit(1); } +} + +void GPU_knk(float* ptrR, size_t rowsR, size_t colsR, float* ptrA, size_t rowsA, size_t colsA, float* ptrB, size_t rowsB, size_t colsB) +{ + //Create buffers + size_t sizeA = rowsA * colsA; + size_t sizeB = rowsB * colsB; + size_t sizeR = rowsR * colsR; + cl_int err; + cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeA, NULL, &err); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n"); + exit(1); + } + cl_mem memB = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeB, NULL, &err); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n"); + exit(1); + } + cl_mem memR = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(float) * sizeR, NULL, &err); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n"); + exit(1); + } + //Populate buffers + err = clEnqueueWriteBuffer(GPU_command_queue, memA, CL_TRUE, 0, sizeof(float) * sizeA, ptrA, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n"); + exit(1); + } + err = clEnqueueWriteBuffer(GPU_command_queue, memB, CL_TRUE, 0, sizeof(float) * sizeB, ptrB, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n"); + exit(1); + } + + cl_program program; + if (GPU_knk_cache == NULL) + { + //Load and compile program + program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_knk_cl}, NULL, &err); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clCreateProgramWithSource() failed.\n"); + exit(1); + } + err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); + size_t log_size; + clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* log = malloc(log_size); + clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("%s", log); + free(log); + exit(1); + } + err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &GPU_knk_cache_len, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clGetProgramInfo() failed.\n"); + exit(1); + } + GPU_knk_cache = malloc(GPU_knk_cache_len); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &GPU_knk_cache, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clGetProgramInfo() failed.\n"); + exit(1); + } + } + else + { + program = clCreateProgramWithBinary(GPU_context, 1, &GPU_device_id, &GPU_knk_cache_len, (const unsigned char**)&GPU_knk_cache, NULL, &err); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clCreateProgramWithBinary() failed.\n"); + exit(1); + } + } + + //Setup kernel + cl_kernel kernel = clCreateKernel(program, "gpu_knk", &err); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clCreateKernel() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 1, sizeof(int), &rowsR); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 2, sizeof(int), &colsR); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&memA); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 4, sizeof(int), &rowsB); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 5, sizeof(int), &colsB); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&memA); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 7, sizeof(int), &rowsA); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + err = clSetKernelArg(kernel, 8, sizeof(int), &colsA); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n"); + exit(1); + } + + //Run the program + size_t work_size[] = {rowsR, colsR}; + err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clEnqueueNDRangeKernel() failed.\n"); + exit(1); + } + //Wait for completion + err = clFlush(GPU_command_queue); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clFlush() failed.\n"); + exit(1); + } + err = clFinish(GPU_command_queue); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clFinish() failed.\n"); + exit(1); + } + //Read results + err = clEnqueueReadBuffer(GPU_command_queue, memR, CL_TRUE, 0, sizeof(float) * sizeR, ptrR, 0, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n"); + exit(1); + } + //Clean up + err = clReleaseKernel(kernel); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clReleaseKernel() failed.\n"); + exit(1); + } + err = clReleaseProgram(program); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clReleaseProgram() failed.\n"); + exit(1); + } + err = clReleaseMemObject(memA); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clReleaseMemObject() failed.\n"); + exit(1); + } + err = clReleaseMemObject(memB); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clReleaseMemObject() failed.\n"); + exit(1); + } + err = clReleaseMemObject(memR); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clReleaseMemObject() failed.\n"); + exit(1); + } } \ No newline at end of file diff --git a/src/gpu/gpu_knk.cl b/src/gpu/gpu_knk.cl index 3be09b8..79f5795 100644 --- a/src/gpu/gpu_knk.cl +++ b/src/gpu/gpu_knk.cl @@ -11,36 +11,29 @@ __kernel void gpu_knk const int colsB ) { + int rowR = get_global_id(0); + int colR = get_global_id(1); - for (size_t rowR = 0; rowR < rowsR; rowR++) - { - size_t a = data->ID * data->BlockSize; - size_t b = (data->ID + 1) * data->BlockSize; - if (data->ID == data->Last) b += data->Continue; - for (size_t colR = a; colR < b; colR++) - { - size_t rowA = rowR / rowsB; - size_t colA = colR / colsB; - size_t rowB = rowR % rowsB; - size_t colB = colR % colsB; + int rowA = rowR / rowsB; + int colA = colR / colsB; + int rowB = rowR % rowsB; + int colB = colR % colsB; - float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; - float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; - float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; - float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; + float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; + float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; + float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; + float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; - float first = r1 * r2; //real - float outer = r1 * i2; //imaginary - float inner = i1 * r2; //imaginary - float last = -(i1 * i2); //real - r1 = first + last; - i1 = outer + inner; + float first = r1 * r2; //real + float outer = r1 * i2; //imaginary + float inner = i1 * r2; //imaginary + float last = -(i1 * i2); //real + r1 = first + last; + i1 = outer + inner; + + ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; + ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; + ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; + ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; +} - ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; - ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; - ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; - ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; - - } - } -} \ No newline at end of file diff --git a/src/gpu/gpu_test.c b/src/gpu/gpu_test.c index 6c75205..68d8baa 100644 --- a/src/gpu/gpu_test.c +++ b/src/gpu/gpu_test.c @@ -12,6 +12,7 @@ int get_global_id(int id) } } #include "gpu_mmul.cl" +#include "gpu_knk.cl" void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) { @@ -26,6 +27,18 @@ void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, } } +void GPU_knk(float* ptrR, size_t rowsR, size_t colsR, float* ptrA, size_t rowsA, size_t colsA, float* ptrB, size_t rowsB, size_t colsB) +{ + for (int i = 0; i < rowsR; i++) + { + for (int j = 0; j < colsR; j++) + { + GPU_GLOBAL_ID_0 = i; + GPU_GLOBAL_ID_1 = j; + gpu_knk(ptrR, rowsR, colsR, ptrA, rowsA, colsA, ptrB, rowsB, colsB); + } + } +} uint8_t GPU_init() { -- 2.39.5