#include "gpu_mmul.cl.c"
+#include "gpu_knk.cl.c"
cl_platform_id GPU_platform_id;
cl_device_id GPU_device_id;
cl_context GPU_context;
cl_command_queue GPU_command_queue;
unsigned char* GPU_mmul_cache = NULL;
size_t GPU_mmul_cache_len = 0;
+unsigned char* GPU_knk_cache = NULL;
+size_t GPU_knk_cache_len = 0;
uint8_t GPU_init()
{
fprintf(stderr, "GPU fatal error: clReleaseMemObject() failed.\n");
exit(1);
}
+}
+
+void GPU_knk(float* ptrR, size_t rowsR, size_t colsR, float* ptrA, size_t rowsA, size_t colsA, float* ptrB, size_t rowsB, size_t colsB)
+{
+ //Create buffers
+ size_t sizeA = rowsA * colsA;
+ size_t sizeB = rowsB * colsB;
+ size_t sizeR = rowsR * colsR;
+ cl_int err;
+ cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeA, NULL, &err);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n");
+ exit(1);
+ }
+ cl_mem memB = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeB, NULL, &err);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n");
+ exit(1);
+ }
+ cl_mem memR = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(float) * sizeR, NULL, &err);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n");
+ exit(1);
+ }
+ //Populate buffers
+ err = clEnqueueWriteBuffer(GPU_command_queue, memA, CL_TRUE, 0, sizeof(float) * sizeA, ptrA, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n");
+ exit(1);
+ }
+ err = clEnqueueWriteBuffer(GPU_command_queue, memB, CL_TRUE, 0, sizeof(float) * sizeB, ptrB, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n");
+ exit(1);
+ }
+
+ cl_program program;
+ if (GPU_knk_cache == NULL)
+ {
+ //Load and compile program
+ program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_knk_cl}, NULL, &err);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clCreateProgramWithSource() failed.\n");
+ exit(1);
+ }
+ err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
+ size_t log_size;
+ clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+ char* log = malloc(log_size);
+ clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
+ printf("%s", log);
+ free(log);
+ exit(1);
+ }
+ err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &GPU_knk_cache_len, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clGetProgramInfo() failed.\n");
+ exit(1);
+ }
+ GPU_knk_cache = malloc(GPU_knk_cache_len);
+ clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &GPU_knk_cache, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clGetProgramInfo() failed.\n");
+ exit(1);
+ }
+ }
+ else
+ {
+ program = clCreateProgramWithBinary(GPU_context, 1, &GPU_device_id, &GPU_knk_cache_len, (const unsigned char**)&GPU_knk_cache, NULL, &err);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clCreateProgramWithBinary() failed.\n");
+ exit(1);
+ }
+ }
+
+ //Setup kernel
+ cl_kernel kernel = clCreateKernel(program, "gpu_knk", &err);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clCreateKernel() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 1, sizeof(int), &rowsR);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 2, sizeof(int), &colsR);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&memA);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 4, sizeof(int), &rowsB);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 5, sizeof(int), &colsB);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&memA);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 7, sizeof(int), &rowsA);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+ err = clSetKernelArg(kernel, 8, sizeof(int), &colsA);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clSetKernelArg() failed.\n");
+ exit(1);
+ }
+
+ //Run the program
+ size_t work_size[] = {rowsR, colsR};
+ err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clEnqueueNDRangeKernel() failed.\n");
+ exit(1);
+ }
+ //Wait for completion
+ err = clFlush(GPU_command_queue);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clFlush() failed.\n");
+ exit(1);
+ }
+ err = clFinish(GPU_command_queue);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clFinish() failed.\n");
+ exit(1);
+ }
+ //Read results
+ err = clEnqueueReadBuffer(GPU_command_queue, memR, CL_TRUE, 0, sizeof(float) * sizeR, ptrR, 0, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n");
+ exit(1);
+ }
+ //Clean up
+ err = clReleaseKernel(kernel);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clReleaseKernel() failed.\n");
+ exit(1);
+ }
+ err = clReleaseProgram(program);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clReleaseProgram() failed.\n");
+ exit(1);
+ }
+ err = clReleaseMemObject(memA);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clReleaseMemObject() failed.\n");
+ exit(1);
+ }
+ err = clReleaseMemObject(memB);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clReleaseMemObject() failed.\n");
+ exit(1);
+ }
+ err = clReleaseMemObject(memR);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clReleaseMemObject() failed.\n");
+ exit(1);
+ }
}
\ No newline at end of file
const int colsB
)
{
+ int rowR = get_global_id(0);
+ int colR = get_global_id(1);
- for (size_t rowR = 0; rowR < rowsR; rowR++)
- {
- size_t a = data->ID * data->BlockSize;
- size_t b = (data->ID + 1) * data->BlockSize;
- if (data->ID == data->Last) b += data->Continue;
- for (size_t colR = a; colR < b; colR++)
- {
- size_t rowA = rowR / rowsB;
- size_t colA = colR / colsB;
- size_t rowB = rowR % rowsB;
- size_t colB = colR % colsB;
+ int rowA = rowR / rowsB;
+ int colA = colR / colsB;
+ int rowB = rowR % rowsB;
+ int colB = colR % colsB;
- float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
- float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
- float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
- float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
+ float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
+ float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
+ float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
+ float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
- float first = r1 * r2; //real
- float outer = r1 * i2; //imaginary
- float inner = i1 * r2; //imaginary
- float last = -(i1 * i2); //real
- r1 = first + last;
- i1 = outer + inner;
+ float first = r1 * r2; //real
+ float outer = r1 * i2; //imaginary
+ float inner = i1 * r2; //imaginary
+ float last = -(i1 * i2); //real
+ r1 = first + last;
+ i1 = outer + inner;
+
+ ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
+ ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
+ ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
+ ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
+}
- ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
- ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
- ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
- ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
-
- }
- }
-}
\ No newline at end of file