/*
 * gpuerr(x): abort the program if the preceding OpenCL call failed.
 *
 * Reads a variable named `err` from the invoking scope and compares it with
 * CL_SUCCESS. On mismatch it prints "GPU fatal error: x() failed." to stderr
 * and exits with status 1. The argument is only stringized for the message;
 * it is never evaluated.
 *
 * The body is wrapped in do { ... } while (0) so the macro expands to a
 * single statement: `if (c) gpuerr(f); else ...` compiles and the macro
 * cannot capture a following `else`. Invoke it with a trailing semicolon.
 */
#define gpuerr(x) \
	do { \
		if (err != CL_SUCCESS) { \
			fprintf(stderr, "GPU fatal error: " #x "() failed.\n"); \
			exit(1); \
		} \
	} while (0)
-#include "gpu_mmul.cl.c"
-#include "gpu_knk.cl.c"
+#include "kernel.cl.c"
cl_platform_id GPU_platform_id;
cl_device_id GPU_device_id;
cl_context GPU_context;
void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared)
{
- printf("a\n");
//Create buffers
size_t sizeA = rowsA * shared;
size_t sizeB = shared * colsB;
cl_program program;
if (GPU_cache == NULL)
{
- program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_mmul_cl}, NULL, &err);
+ program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_kernel_cl}, NULL, &err);
gpuerr(clCreateProgramWithSource);
err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL);
if (err != CL_SUCCESS)
void GPU_knk(float* ptrR, int rowsR, int colsR, float* ptrA, int rowsA, int colsA, float* ptrB, int rowsB, int colsB)
{
- printf("b\n");
//Create buffers
size_t sizeA = (rowsA * 2) * (colsA * 2);
size_t sizeB = (rowsB * 2) * (colsB * 2);
cl_program program;
if (GPU_cache == NULL)
{
- program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_mmul_cl}, NULL, &err);
+ program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_kernel_cl}, NULL, &err);
gpuerr(clCreateProgramWithSource);
err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL);
if (err != CL_SUCCESS)
+++ /dev/null
-__kernel void gpu_knk
-(
- __global float* ptrR,
- const int rowsR,
- const int colsR,
- __global float* ptrA,
- const int rowsA,
- const int colsA,
- __global float* ptrB,
- const int rowsB,
- const int colsB
-)
-{
- int rowR = get_global_id(0);
- int colR = get_global_id(1);
-
- int rowA = rowR / rowsB;
- int colA = colR / colsB;
- int rowB = rowR % rowsB;
- int colB = colR % colsB;
-
- float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
- float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
- float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
- float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
-
- float first = r1 * r2; //real
- float outer = r1 * i2; //imaginary
- float inner = i1 * r2; //imaginary
- float last = -(i1 * i2); //real
- r1 = first + last;
- i1 = outer + inner;
-
- ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
- ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
- ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
- ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
-}
-
+++ /dev/null
-__kernel void gpu_mmul
-(
- __global float* ptrR,
- __global float* ptrA,
- __global float* ptrB,
- const int rowsA,
- const int colsB,
- const int shared
-)
-{
- const int colsA = shared;
- const int rowsB = shared;
- const int rowsR = rowsA;
- const int colsR = colsB;
- const int rowR = get_global_id(0);
- const int colR = get_global_id(1);
- int posA, posB;
- float sum = 0;
-
- const int posR = colR + rowR * colsR;
-
- for (int i = 0; i < shared; i++)
- {
- int posA = i + rowR * colsA;
- int posB = colR + i * colsB;
- sum += ptrA[posA] * ptrB[posB];
- }
- ptrR[rowR * colsR + colR] = sum;
-}
-
-__kernel void gpu_knk
-(
- __global float* ptrR,
- const int rowsR,
- const int colsR,
- __global float* ptrA,
- const int rowsA,
- const int colsA,
- __global float* ptrB,
- const int rowsB,
- const int colsB
-)
-{
- int rowR = get_global_id(0);
- int colR = get_global_id(1);
-
- int rowA = rowR / rowsB;
- int colA = colR / colsB;
- int rowB = rowR % rowsB;
- int colB = colR % colsB;
-
- float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
- float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
- float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
- float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
-
- float first = r1 * r2; //real
- float outer = r1 * i2; //imaginary
- float inner = i1 * r2; //imaginary
- float last = -(i1 * i2); //real
- r1 = first + last;
- i1 = outer + inner;
-
- ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
- ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
- ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
- ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
-}
-
--- /dev/null
+// Matrix product R = A * B, one result element per work-item.
+//
+// The indexing treats all three buffers as row-major:
+//   A is rowsA x shared, B is shared x colsB, R is rowsA x colsB.
+// Work-item (get_global_id(0), get_global_id(1)) computes R[row][col].
+__kernel void gpu_mmul
+(
+    __global float* ptrR,
+    __global float* ptrA,
+    __global float* ptrB,
+    const int rowsA,
+    const int colsB,
+    const int shared
+)
+{
+    const int rowR = get_global_id(0);
+    const int colR = get_global_id(1);
+
+    // Work-items outside the result matrix (possible when the host pads the
+    // global work size) must not touch any buffer.
+    if (rowR >= rowsA || colR >= colsB)
+        return;
+
+    // Dot product of row rowR of A with column colR of B.
+    float sum = 0;
+    for (int i = 0; i < shared; i++)
+    {
+        sum += ptrA[i + rowR * shared] * ptrB[colR + i * colsB];
+    }
+    ptrR[colR + rowR * colsB] = sum;
+}
+
+// Per work-item: combine one entry of A with one entry of B and store the
+// result as a 2x2 float block in R (looks like a Kronecker product, given
+// the name and the divide/modulo index split -- confirm against the host).
+//
+// Each logical entry occupies a 2x2 block of floats. Only the bottom row of
+// a source block is read: its right element is used as the real part and
+// its left element as the imaginary part. The output block is written as
+//     [ re  -im ]
+//     [ im   re ]
+// rowsR and rowsA are accepted but not used by the kernel body.
+__kernel void gpu_knk
+(
+    __global float* ptrR,
+    const int rowsR,
+    const int colsR,
+    __global float* ptrA,
+    const int rowsA,
+    const int colsA,
+    __global float* ptrB,
+    const int rowsB,
+    const int colsB
+)
+{
+    const int outRow = get_global_id(0);
+    const int outCol = get_global_id(1);
+
+    // Row widths of each buffer, measured in floats (two floats per entry).
+    const int strideA = colsA * 2;
+    const int strideB = colsB * 2;
+    const int strideR = colsR * 2;
+
+    // Offset of the bottom-left float of the source block in A and in B.
+    // The A entry is selected by the quotient, the B entry by the remainder.
+    const int baseA = ((outCol / colsB) * 2) + (((outRow / rowsB) * 2) + 1) * strideA;
+    const int baseB = ((outCol % colsB) * 2) + (((outRow % rowsB) * 2) + 1) * strideB;
+
+    const float imA = ptrA[baseA];
+    const float reA = ptrA[baseA + 1];
+    const float imB = ptrB[baseB];
+    const float reB = ptrB[baseB + 1];
+
+    // Complex multiplication, expanded term by term.
+    const float realReal = reA * reB;    // contributes to the real part
+    const float realImag = reA * imB;    // contributes to the imaginary part
+    const float imagReal = imA * reB;    // contributes to the imaginary part
+    const float imagImag = -(imA * imB); // contributes to the real part
+    const float re = realReal + imagImag;
+    const float im = realImag + imagReal;
+
+    // Offsets of the left float on the top and bottom rows of the output block.
+    const int top = (outCol * 2) + (outRow * 2) * strideR;
+    const int bottom = (outCol * 2) + ((outRow * 2) + 1) * strideR;
+
+    ptrR[top] = re;
+    ptrR[top + 1] = -im;
+    ptrR[bottom] = im;
+    ptrR[bottom + 1] = re;
+}
+