From: miha-q <> Date: Fri, 8 Mar 2024 00:20:51 +0000 (-0500) Subject: Thu Mar 7 07:20:51 PM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=683d7aaf53b164d0512dade5f9729af8fb729d57;p=QAnsel.git Thu Mar 7 07:20:51 PM EST 2024 --- diff --git a/src/complex.c b/src/complex.c index 0ae576f..8fcdfbe 100644 --- a/src/complex.c +++ b/src/complex.c @@ -337,10 +337,11 @@ void cpx_mtx_dot_threads(float* ptrR, float* ptrA, float* ptrB, int rowsA, int c /*-----------------------------------------------------------------------------------*/ /*METAL*/ /*-----------------------------------------------------------------------------------*/ + #define CL_USE_DEPRECATED_OPENCL_1_2_APIS #define CL_TARGET_OPENCL_VERSION 300 #include -#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: " #x "() failed.\n"); exit(1); } +#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU Error on line %i: %s.\n", __LINE__, clGetErrorString(x)); exit(1); } cl_platform_id cpx_mtx_platform_id; cl_device_id cpx_mtx_device_id; cl_context cpx_mtx_context; @@ -348,6 +349,81 @@ cl_command_queue cpx_mtx_command_queue; unsigned char* cpx_mtx_cache = NULL; size_t cpx_mtx_cache_len = 0; +#include +#include + +const char* clGetErrorString(cl_int err) +{ + switch (err) + { + case CL_SUCCESS: return "Success"; + case CL_DEVICE_NOT_FOUND: return "Device not found"; + case CL_DEVICE_NOT_AVAILABLE: return "Device not available"; + case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available"; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure"; + case CL_OUT_OF_RESOURCES: return "Out of resources"; + case CL_OUT_OF_HOST_MEMORY: return "Out of host memory"; + case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available"; + case CL_MEM_COPY_OVERLAP: return "Memory copy overlap"; + case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch"; + case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported"; + case CL_BUILD_PROGRAM_FAILURE: return "Program build failure"; + case CL_MAP_FAILURE: return "Map failure"; + case CL_MISALIGNED_SUB_BUFFER_OFFSET: return "Misaligned sub buffer offset"; + case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: return "Execution status error for events in wait list"; + case CL_COMPILE_PROGRAM_FAILURE: return "Compile program failure"; + case CL_LINKER_NOT_AVAILABLE: return "Linker not available"; + case CL_LINK_PROGRAM_FAILURE: return "Link program failure"; + case CL_DEVICE_PARTITION_FAILED: return "Device partition failed"; + case CL_KERNEL_ARG_INFO_NOT_AVAILABLE: return "Kernel argument information not available"; + case CL_INVALID_VALUE: return "Invalid value"; + case CL_INVALID_DEVICE_TYPE: return "Invalid device type"; + case CL_INVALID_PLATFORM: return "Invalid platform"; + case CL_INVALID_DEVICE: return "Invalid device"; + case CL_INVALID_CONTEXT: return "Invalid context"; + case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties"; + case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue"; + case CL_INVALID_HOST_PTR: return "Invalid host pointer"; + case CL_INVALID_MEM_OBJECT: return "Invalid memory object"; + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor"; + case CL_INVALID_IMAGE_SIZE: return "Invalid image size"; + case CL_INVALID_SAMPLER: return "Invalid sampler"; + case CL_INVALID_BINARY: return "Invalid binary"; + case CL_INVALID_BUILD_OPTIONS: return "Invalid build options"; + case CL_INVALID_PROGRAM: return "Invalid program"; + case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable"; + case CL_INVALID_KERNEL_NAME: return "Invalid kernel name"; + case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition"; + case CL_INVALID_KERNEL: return "Invalid kernel"; + case CL_INVALID_ARG_INDEX: return "Invalid argument index"; + case CL_INVALID_ARG_VALUE: return "Invalid argument value"; + case CL_INVALID_ARG_SIZE: return "Invalid argument size"; + case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments"; + case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension"; + case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size"; + case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size"; + case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset"; + case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list"; + case CL_INVALID_EVENT: return "Invalid event"; + case CL_INVALID_OPERATION: return "Invalid operation"; + case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object"; + case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size"; + case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level"; + case CL_INVALID_GLOBAL_WORK_SIZE: return "Invalid global work size"; + case CL_INVALID_PROPERTY: return "Invalid property"; + case CL_INVALID_IMAGE_DESCRIPTOR: return "Invalid image descriptor"; + case CL_INVALID_COMPILER_OPTIONS: return "Invalid compiler options"; + case CL_INVALID_LINKER_OPTIONS: return "Invalid linker options"; + case CL_INVALID_DEVICE_PARTITION_COUNT: return "Invalid device partition count"; + case CL_INVALID_PIPE_SIZE: return "Invalid pipe size"; + case CL_INVALID_DEVICE_QUEUE: return "Invalid device queue"; + case CL_INVALID_SPEC_ID: return "Invalid spec ID"; + case CL_MAX_SIZE_RESTRICTION_EXCEEDED: return "Max size restriction exceeded"; + default: return "Unknown error code"; + } +} + + uint8_t cpx_mtx_begin() { cl_uint count; @@ -424,25 +500,19 @@ void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float); cl_int err; - cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); - gpuerr(clCreateBuffer); - cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); - gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); - gpuerr(clCreateBuffer); + cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err); + cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(err); + cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(err); //Populate buffers - err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); + err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); gpuerr(err); + err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); gpuerr(err); //Load and compile program cl_program program; if (cpx_mtx_cache == NULL) { - program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err); - gpuerr(clCreateProgramWithSource); + program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err); gpuerr(err); err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); if (err != CL_SUCCESS) { @@ -455,16 +525,14 @@ void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col free(log); exit(1); } - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); - gpuerr(clGetProgramInfo); + err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); gpuerr(err); cpx_mtx_cache = malloc(cpx_mtx_cache_len); - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); - gpuerr(clGetProgramInfo); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); gpuerr(err); } else { program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); + gpuerr(err); err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); if (err != CL_SUCCESS) { @@ -479,45 +547,33 @@ void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col } } - //Setup kernel - cl_kernel kernel = clCreateKernel(program, "kernel_dot", &err); - //printf("%i\n", err); - //printf("\t%s: %i\n", "CL_SUCCESS", CL_SUCCESS); - //printf("\t%s: %i\n", "CL_INVALID_PROGRAM", CL_INVALID_PROGRAM); - //printf("\t%s: %i\n", "CL_INVALID_PROGRAM_EXECUTABLE", CL_INVALID_PROGRAM_EXECUTABLE); - //printf("\t%s: %i\n", "CL_INVALID_KERNEL_NAME", CL_INVALID_KERNEL_NAME); - //printf("\t%s: %i\n", "CL_INVALID_KERNEL_DEFINITION", CL_INVALID_KERNEL_DEFINITION); - //printf("\t%s: %i\n", "CL_INVALID_VALUE", CL_INVALID_VALUE); - //printf("\t%s: %i\n", "CL_OUT_OF_RESOURCES", CL_OUT_OF_RESOURCES); - //printf("\t%s: %i\n", "CL_OUT_OF_HOST_MEMORY", CL_OUT_OF_HOST_MEMORY); - gpuerr(clCreateKernel); - - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(int), &colsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(int), &rowsB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 6, sizeof(int), &colsB); gpuerr(clSetKernelArg); + cl_kernel kernel = clCreateKernel(program, "kernel_dot", &err); gpuerr(err); + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(err); + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(err); + err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(err); + err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(err); + err = clSetKernelArg(kernel, 4, sizeof(int), &colsA); gpuerr(err); + err = clSetKernelArg(kernel, 5, sizeof(int), &rowsB); gpuerr(err); + err = clSetKernelArg(kernel, 6, sizeof(int), &colsB); gpuerr(err); //Run the program err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, (size_t[]){rowsR, colsR}, NULL, 0, NULL, NULL); - gpuerr(clEnqueueNDRangeKernel); + gpuerr(err); //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); - err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); + err = clFlush(cpx_mtx_command_queue); gpuerr(err); + err = clFinish(cpx_mtx_command_queue); gpuerr(err); //Read results err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); + gpuerr(err); //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); + err = clReleaseKernel(kernel); gpuerr(err); + err = clReleaseProgram(program); gpuerr(err); + err = clReleaseMemObject(memA); gpuerr(err); + err = clReleaseMemObject(memB); gpuerr(err); + err = clReleaseMemObject(memR); gpuerr(err); } void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) @@ -529,22 +585,22 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float); cl_int err; - cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer); - cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer); + cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err); + cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(err); + cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(err); //Populate buffers err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); + gpuerr(err); err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); + gpuerr(err); //Load and compile program cl_program program; if (cpx_mtx_cache == NULL) { program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err); - gpuerr(clCreateProgramWithSource); + gpuerr(err); err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); if (err != CL_SUCCESS) { @@ -558,15 +614,15 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col exit(1); } err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); - gpuerr(clGetProgramInfo); + gpuerr(err); cpx_mtx_cache = malloc(cpx_mtx_cache_len); clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); - gpuerr(clGetProgramInfo); + gpuerr(err); } else { program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); + gpuerr(err); err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); if (err != CL_SUCCESS) { @@ -582,33 +638,33 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col } //Setup kernel - cl_kernel kernel = clCreateKernel(program, "kernel_knk", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(int), &colsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(int), &rowsB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 6, sizeof(int), &colsB); gpuerr(clSetKernelArg); + cl_kernel kernel = clCreateKernel(program, "kernel_knk", &err); gpuerr(err); + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(err); + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(err); + err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(err); + err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(err); + err = clSetKernelArg(kernel, 4, sizeof(int), &colsA); gpuerr(err); + err = clSetKernelArg(kernel, 5, sizeof(int), &rowsB); gpuerr(err); + err = clSetKernelArg(kernel, 6, sizeof(int), &colsB); gpuerr(err); //Run the program err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR}, NULL, 0, NULL, NULL); - gpuerr(clEnqueueNDRangeKernel); + gpuerr(err); //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); - err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); + err = clFlush(cpx_mtx_command_queue); gpuerr(err); + err = clFinish(cpx_mtx_command_queue); gpuerr(err); //Read results err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); + gpuerr(err); //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); + err = clReleaseKernel(kernel); gpuerr(err); + err = clReleaseProgram(program); gpuerr(err); + err = clReleaseMemObject(memA); gpuerr(err); + err = clReleaseMemObject(memB); gpuerr(err); + err = clReleaseMemObject(memR); gpuerr(err); } //This only works if ptrA is NxM where both N and X are divisible by two, @@ -622,19 +678,19 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); cl_int err; - cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer); + cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(err); + cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(err); //Populate buffers err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); + gpuerr(err); //Load and compile program cl_program program; if (cpx_mtx_cache == NULL) { program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err); - gpuerr(clCreateProgramWithSource); + gpuerr(err); err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); if (err != CL_SUCCESS) { @@ -648,15 +704,15 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int exit(1); } err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); - gpuerr(clGetProgramInfo); + gpuerr(err); cpx_mtx_cache = malloc(cpx_mtx_cache_len); clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); - gpuerr(clGetProgramInfo); + gpuerr(err); } else { program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); + gpuerr(err); err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); if (err != CL_SUCCESS) { @@ -681,49 +737,39 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int float gate5 = ptrB[5]; float gate6 = ptrB[6]; float gate7 = ptrB[7]; - cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg); + cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2", &err); gpuerr(err); + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(err); + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(err); + err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(err); + err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(err); + err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(err); + err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(err); + err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(err); + err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(err); + err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(err); + err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(err); + err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(err); + err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(err); size_t q = 2; - size_t max_work_group_size; - err = clGetDeviceInfo(cpx_mtx_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group_size, NULL); - if (err != CL_SUCCESS) { - printf("Error getting device info\n"); - exit(1); - } - printf(">%lu<\n", max_work_group_size); - exit(1); - //Run the program err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 2}, &q, 0, NULL, NULL); - - gpuerr(clEnqueueNDRangeKernel); + gpuerr(err); //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); - err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); + err = clFlush(cpx_mtx_command_queue); gpuerr(err); + err = clFinish(cpx_mtx_command_queue); gpuerr(err); //Read results err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); + gpuerr(err); //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); + err = clReleaseKernel(kernel); gpuerr(err); + err = clReleaseProgram(program); gpuerr(err); + err = clReleaseMemObject(memA); gpuerr(err); + err = clReleaseMemObject(memR); gpuerr(err); }