}
}
-
void cpx_mtx_knk_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
int rowsR = rowsA * rowsB;
int colsR = colsA * colsB;
for (int i = 0; i < rowsR / 2; i++)
{
- for (int j = 0; j < colsR / 2; j++)
- {
- kernel_knk_2x2(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i, j);
- }
- }
-}
-
-void cpx_mtx_knk_2x2_R(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
-{
- int rowsR = rowsA * rowsB;
- int colsR = colsA * colsB;
- for (int i = 0; i < rowsR / 2; i++)
- {
- kernel_knk_2x2_R(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i);
- }
-}
-
-void cpx_mtx_knk_2x2_Rx4(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
-{
- int rowsR = rowsA * rowsB;
- int colsR = colsA * colsB;
- for (int i = 0; i < rowsR / (2 * 1024); i++)
- {
- kernel_knk_2x2_Rx4(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i);
+ kernel_knk_2x2(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i);
}
}
//Setup kernel
- float gate0 = ptrB[0];
- float gate1 = ptrB[1];
- float gate2 = ptrB[2];
- float gate3 = ptrB[3];
- float gate4 = ptrB[4];
- float gate5 = ptrB[5];
- float gate6 = ptrB[6];
- float gate7 = ptrB[7];
- cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2", &err); gpuerr(clCreateKernel);
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg);
-
- //Run the program
- err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, (size_t[]){rowsR / 2, colsR / 2}, NULL, 0, NULL, NULL);
- gpuerr(clEnqueueNDRangeKernel);
-
- //Wait for completion
- err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush);
- err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish);
-
- //Read results
- err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
- gpuerr(clEnqueueReadBuffer);
-
- //Clean up
- err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
- err = clReleaseProgram(program); gpuerr(clReleaseProgram);
- err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
-}
-
-//This only works if ptrA is NxM where both N and X are divisible by two,
-// and ptrB is 2x2. If both are true, this is much more efficient than
-// the standard knk_metal() function.
-void cpx_mtx_knk_metal_2x2_R(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
-{
- int rowsR = rowsA * rowsB;
- int colsR = colsA * colsB;
- //Create buffers
- size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
- size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);
- cl_int err;
- cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer);
- cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer);
-
- //Populate buffers
- err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
-
- //Load and compile program
- cl_program program;
- if (cpx_mtx_cache == NULL)
- {
- program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err);
- gpuerr(clCreateProgramWithSource);
- err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
- size_t log_size;
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- char* log = malloc(log_size);
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
- printf("%s", log);
- free(log);
- exit(1);
- }
- err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL);
- gpuerr(clGetProgramInfo);
- cpx_mtx_cache = malloc(cpx_mtx_cache_len);
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL);
- gpuerr(clGetProgramInfo);
- }
- else
- {
- program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err);
- gpuerr(clCreateProgramWithBinary);
- err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
- size_t log_size;
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- char* log = malloc(log_size);
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
- printf("%s", log);
- free(log);
- exit(1);
- }
- }
-
- //Setup kernel
-
float gate0 = ptrB[0];
float gate1 = ptrB[1];
float gate2 = ptrB[2];
}
-//This only works if ptrA is NxM where both N and X are divisible by two,
-// and ptrB is 2x2. If both are true, this is much more efficient than
-// the standard knk_metal() function.
-void cpx_mtx_knk_metal_2x2_Rx4(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
-{
- int rowsR = rowsA * rowsB;
- int colsR = colsA * colsB;
- //Create buffers
- size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
- size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);
- cl_int err;
- cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer);
- cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer);
-
- //Populate buffers
- err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
-
- //Load and compile program
- cl_program program;
- if (cpx_mtx_cache == NULL)
- {
- program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err);
- gpuerr(clCreateProgramWithSource);
- err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
- size_t log_size;
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- char* log = malloc(log_size);
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
- printf("%s", log);
- free(log);
- exit(1);
- }
- err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL);
- gpuerr(clGetProgramInfo);
- cpx_mtx_cache = malloc(cpx_mtx_cache_len);
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL);
- gpuerr(clGetProgramInfo);
- }
- else
- {
- program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err);
- gpuerr(clCreateProgramWithBinary);
- err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
- size_t log_size;
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- char* log = malloc(log_size);
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
- printf("%s", log);
- free(log);
- exit(1);
- }
- }
-
- //Setup kernel
-
- float gate0 = ptrB[0];
- float gate1 = ptrB[1];
- float gate2 = ptrB[2];
- float gate3 = ptrB[3];
- float gate4 = ptrB[4];
- float gate5 = ptrB[5];
- float gate6 = ptrB[6];
- float gate7 = ptrB[7];
- cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2_Rx4", &err); gpuerr(clCreateKernel);
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg);
-
- //Run the program
- err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / (2 * 1024)}, NULL, 0, NULL, NULL);
- gpuerr(clEnqueueNDRangeKernel);
-
- //Wait for completion
- err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush);
- err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish);
-
- //Read results
- err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
- gpuerr(clEnqueueReadBuffer);
-
- //Clean up
- err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
- err = clReleaseProgram(program); gpuerr(clReleaseProgram);
- err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
-}
-
-
-/*-----------------------------------------------------------------------------------*/
-
-
-
-
#endif
\ No newline at end of file
}
__kernel void kernel_knk_2x2
-(
- __global float* ptrR,
- __global float* ptrA,
- const int rowsA,
- const int colsA,
- const float gate0,
- const float gate1,
- const float gate2,
- const float gate3,
- const float gate4,
- const float gate5,
- const float gate6,
- const float gate7
- , const int get_global_id_0 //{cpu_only}
- , const int get_global_id_1 //{cpu_only}
-)
-{
- const int rowsR = rowsA * 2;
- const int colsR = colsA * 2;
- const int rowR = get_global_id(0) * 2; //{gpu_only}
- const int colR = get_global_id(1) * 2; //{gpu_only}
- const int rowR = get_global_id_0 * 2; //{cpu_only}
- const int colR = get_global_id_1 * 2; //{cpu_only}
-
- const int rowA = rowR / 2;
- const int colA = colR / 2;
- const int posA = rowA * (colsA * 2) + (colA * 2);
- const float rA = ptrA[posA];
- const float iA = ptrA[posA + 1];
-
- for (int i = 0; i < 4; i++)
- {
- float rB, iB;
- switch (i)
- {
- case 0: rB = gate0; iB = gate1; break;
- case 1: rB = gate2; iB = gate3; break;
- case 2: rB = gate4; iB = gate5; break;
- case 3: rB = gate6; iB = gate7; break;
- }
-
- //(rA + iA)(rB + iB)
- const float first = rA * rB;
- const float outer = rA * iB;
- const float inner = iA * rB;
- const float lasts = iA * iB;
- switch (i)
- {
- case 0:
- ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts;
- ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner;
- break;
- case 1:
- ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts;
- ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner;
- break;
- case 2:
- ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts;
- ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner;
- break;
- case 3:
- ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts;
- ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner;
- break;
- }
-
- }
-
-}
-
-
-__kernel void kernel_knk_2x2_R
(
__global float* ptrR,
__global float* ptrA,
}
}
}
-
-
-__kernel void kernel_knk_2x2_Rx4
-(
- __global float* ptrR,
- __global float* ptrA,
- const int rowsA,
- const int colsA,
- const float gate0,
- const float gate1,
- const float gate2,
- const float gate3,
- const float gate4,
- const float gate5,
- const float gate6,
- const float gate7
- , const int get_global_id_0 //{cpu_only}
-)
-{
- const int rowsR = rowsA * 2;
- const int colsR = colsA * 2;
- const int block = get_global_id(0) * 2 * 1024; //{gpu_only}
- const int block = get_global_id_0 * 2 * 1024; //{cpu_only}
-
- for (int rowR = block; rowR < block + 2 * 1024; rowR += 2)
- {
- for (int colR = 0; colR < colsR; colR += 2)
- {
- const int rowA = rowR / 2;
- const int colA = colR / 2;
- const int posA = rowA * (colsA * 2) + (colA * 2);
- const float rA = ptrA[posA];
- const float iA = ptrA[posA + 1];
-
- for (int i = 0; i < 4; i++)
- {
- float rB, iB;
- switch (i)
- {
- case 0: rB = gate0; iB = gate1; break;
- case 1: rB = gate2; iB = gate3; break;
- case 2: rB = gate4; iB = gate5; break;
- case 3: rB = gate6; iB = gate7; break;
- }
-
- //(rA + iA)(rB + iB)
- const float first = rA * rB;
- const float outer = rA * iB;
- const float inner = iA * rB;
- const float lasts = iA * iB;
- switch (i)
- {
- case 0:
- ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts;
- ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner;
- break;
- case 1:
- ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts;
- ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner;
- break;
- case 2:
- ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts;
- ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner;
- break;
- case 3:
- ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts;
- ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner;
- break;
- }
-
- }
- }
- }
-}
\ No newline at end of file