From: miha-q <> Date: Sun, 3 Mar 2024 07:12:43 +0000 (-0500) Subject: Sun Mar 3 02:12:43 AM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=e1729541f9b964fd62d686ff665c00a3e1732da5;p=QAnsel.git Sun Mar 3 02:12:43 AM EST 2024 --- diff --git a/Makefile b/Makefile index 2b53049..66dac57 100644 --- a/Makefile +++ b/Makefile @@ -5,4 +5,4 @@ all: xxd -i src/gpu/gpu_mmul.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/gpu/gpu_mmul.cl.c mv src/gpu/.gpu_mmul.cl src/gpu/gpu_mmul.cl gcc src/QAnsel.c -g -o bin/QAnsel -lm -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread - rm -f src/*.cl.c + rm -f src/gpu/*.cl.c diff --git a/src/QAnsel.c b/src/QAnsel.c index 5c9c74e..18c10fc 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -1373,7 +1373,6 @@ void main(int argc, char** argv) #ifdef GPU_ENABLED USE_GPU = GPU_init(); #endif - USE_GPU = 0; RANDOM_FILE = fopen("/dev/TrueRNG0", "r"); if (!RANDOM_FILE) RANDOM_FILE = fopen("/dev/random", "r"); diff --git a/src/gpu/gpu.c b/src/gpu/gpu.c index 9b68ed7..2bba1ce 100644 --- a/src/gpu/gpu.c +++ b/src/gpu/gpu.c @@ -110,18 +110,18 @@ void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, if (GPU_mmul_cache == NULL) { //Load and compile program - char* tmp = malloc(src_gpu_mmul_cl_len); + char tmp[src_gpu_mmul_cl_len]; memcpy(tmp, src_gpu_mmul_cl, src_gpu_mmul_cl_len); const char* ptr = (const char*)src_gpu_mmul_cl; program = clCreateProgramWithSource(GPU_context, 1, (const char**)&tmp, NULL, &err); if (err != CL_SUCCESS) { - free(tmp); + //free(tmp); fprintf(stderr, "GPU fatal error: clCreateProgramWithSource() failed.\n"); exit(1); } err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL); - free(tmp); + //free(tmp); if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); diff --git a/src/gpu/gpu_knk.cl b/src/gpu/gpu_knk.cl new file mode 100644 index 0000000..3be09b8 --- /dev/null +++ b/src/gpu/gpu_knk.cl @@ -0,0 +1,46 @@ +__kernel void gpu_knk +( + __global float* ptrR, + const int rowsR, + const int colsR, + __global float* ptrA, + const int rowsA, + const int colsA, + __global float* ptrB, + const int rowsB, + const int colsB +) +{ + + for (size_t rowR = 0; rowR < rowsR; rowR++) + { + size_t a = data->ID * data->BlockSize; + size_t b = (data->ID + 1) * data->BlockSize; + if (data->ID == data->Last) b += data->Continue; + for (size_t colR = a; colR < b; colR++) + { + size_t rowA = rowR / rowsB; + size_t colA = colR / colsB; + size_t rowB = rowR % rowsB; + size_t colB = colR % colsB; + + float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; + float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; + float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; + float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; + + float first = r1 * r2; //real + float outer = r1 * i2; //imaginary + float inner = i1 * r2; //imaginary + float last = -(i1 * i2); //real + r1 = first + last; + i1 = outer + inner; + + ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; + ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; + ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; + ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; + + } + } +} \ No newline at end of file diff --git a/src/gpu/gpu_mmul.cl.c b/src/gpu/gpu_mmul.cl.c deleted file mode 100644 index 0cc1d45..0000000 --- a/src/gpu/gpu_mmul.cl.c +++ /dev/null @@ -1,58 +0,0 @@ -unsigned char src_gpu_mmul_cl[] = { - 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x67, 0x70, 0x75, 0x5f, 0x6d, 0x6d, 0x75, 0x6c, 0x0a, 0x28, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, - 0x52, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, - 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, - 0x74, 0x72, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, - 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, - 0x20, 0x70, 0x74, 0x72, 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, - 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, - 0x6e, 0x74, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x0a, 0x29, 0x0a, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x3d, 0x20, - 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, - 0x77, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52, 0x20, 0x3d, 0x20, - 0x72, 0x6f, 0x77, 0x73, 0x41, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, - 0x73, 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, - 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, - 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3d, 0x20, - 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, - 0x64, 0x28, 0x31, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, - 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x2c, 0x20, 0x70, 0x6f, 0x73, 0x42, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, - 0x73, 0x75, 0x6d, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, - 0x70, 0x6f, 0x73, 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, - 0x2b, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x63, 0x6f, 0x6c, - 0x73, 0x52, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, - 0x20, 0x28, 0x69, 0x6e, 0x74, 0x20, 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, - 0x20, 0x69, 0x20, 0x3c, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x3b, - 0x20, 0x69, 0x2b, 0x2b, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, - 0x70, 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x69, 0x20, 0x2b, 0x20, 0x72, - 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, - 0x20, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, - 0x20, 0x2b, 0x20, 0x69, 0x20, 0x2a, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, - 0x6d, 0x20, 0x2b, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, - 0x73, 0x41, 0x5d, 0x20, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x42, 0x5b, 0x70, - 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, 0x77, - 0x52, 0x20, 0x2a, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2b, 0x20, - 0x63, 0x6f, 0x6c, 0x52, 0x5d, 0x20, 0x3d, 0x20, 0x73, 0x75, 0x6d, 0x3b, - 0x0a, 0x7d, 0x00 -}; -unsigned int src_gpu_mmul_cl_len = 651;