+++ /dev/null
-// kernel_dot: complex matrix product R = A * B.
-//
-// Every matrix is stored row-major as interleaved float pairs, so element
-// (row, col) of a matrix with `cols` columns has its real part at
-// row * (cols * 2) + col * 2 and its imaginary part one float later.
-//
-// Each work-item produces one element of R: global id 0 selects the row and
-// global id 1 selects the column. Ids are not range-checked here, so the
-// launch size has to match the dimensions of R.
-//
-//   ptrR          output, rowsA x colsB complex elements
-//   ptrA          left operand, colsA complex elements per row
-//   ptrB          right operand, colsB complex elements per row
-//   rowsA, rowsB  accepted for a uniform signature; not read by this kernel
-//   colsA         length of the summed dimension (row i of B is paired with
-//                 column i of A; presumably rowsB == colsA - not checked)
-//   colsB         number of columns of B and of R
-//
-// NOTE(review): the //{gpu_only} tags look like markers consumed by external
-// tooling; they are kept verbatim on their original statements.
-__kernel void kernel_dot
-(
-    __global float* ptrR,
-    __global float* ptrA,
-    __global float* ptrB,
-    const int rowsA,
-    const int colsA,
-    const int rowsB,
-    const int colsB
-)
-{
-    const int colsR = colsB;
-    const int rowR = get_global_id(0); //{gpu_only}
-    const int colR = get_global_id(1); //{gpu_only}
-
-    // Offsets are built in size_t so large matrices cannot overflow int.
-    const size_t strideA = (size_t)colsA * (size_t)2;
-    const size_t strideB = (size_t)colsB * (size_t)2;
-    const size_t strideR = (size_t)colsR * (size_t)2;
-    const size_t baseA = (size_t)rowR * strideA;
-    const size_t offsetB = (size_t)colR * (size_t)2;
-    const size_t posR = (size_t)rowR * strideR + (size_t)colR * (size_t)2;
-
-    float rR = 0;
-    float iR = 0;
-
-    for (int i = 0; i < colsA; i++)
-    {
-        const size_t posA = baseA + (size_t)i * (size_t)2;
-        const size_t posB = (size_t)i * strideB + offsetB;
-
-        const float rA = ptrA[posA];
-        const float iA = ptrA[posA + (size_t)1];
-        const float rB = ptrB[posB];
-        const float iB = ptrB[posB + (size_t)1];
-
-        // (rA + i*iA)(rB + i*iB) = (rA*rB - iA*iB) + i*(rA*iB + iA*rB)
-        const float first = rA * rB;
-        const float outer = rA * iB;
-        const float inner = iA * rB;
-        const float lasts = iA * iB;
-
-        // i*i == -1, so the product of the imaginary parts is subtracted.
-        rR += first - lasts;
-        iR += outer + inner;
-    }
-
-    ptrR[posR] = rR;
-    ptrR[posR + (size_t)1] = iR;
-}
-
-// kernel_knk: Kronecker product R = A (x) B of two complex matrices.
-//
-// Every matrix is stored row-major as interleaved float pairs, so element
-// (row, col) of a matrix with `cols` columns has its real part at
-// row * (cols * 2) + col * 2 and its imaginary part one float later.
-//
-// R has colsA * colsB columns, and
-//   R[rowA * rowsB + rowB][colA * colsB + colB] = A[rowA][colA] * B[rowB][colB].
-//
-// Each work-item (global id 0) fills one complete row of R. The id is not
-// range-checked here, so the launch size has to match the row count of R.
-//
-//   ptrR   output matrix
-//   ptrA   left operand, colsA complex elements per row
-//   ptrB   right operand, rowsB x colsB complex elements
-//   rowsA  accepted for a uniform signature; not read by this kernel
-//
-// NOTE(review): the //{gpu_only} tag looks like a marker consumed by external
-// tooling; it is kept verbatim on its original statement.
-__kernel void kernel_knk
-(
-    __global float* ptrR,
-    __global float* ptrA,
-    __global float* ptrB,
-    const int rowsA,
-    const int colsA,
-    const int rowsB,
-    const int colsB
-)
-{
-    const int colsR = colsA * colsB;
-    const int rowR = get_global_id(0); //{gpu_only}
-
-    // The source rows depend only on rowR, so resolve them once.
-    const int rowA = rowR / rowsB;
-    const int rowB = rowR % rowsB;
-
-    // Offsets are built in size_t so large matrices cannot overflow int.
-    const size_t baseA = (size_t)rowA * ((size_t)colsA * (size_t)2);
-    const size_t baseB = (size_t)rowB * ((size_t)colsB * (size_t)2);
-    const size_t baseR = (size_t)rowR * ((size_t)colsA * (size_t)colsB * (size_t)2);
-
-    for (int colR = 0; colR < colsR; colR++)
-    {
-        const int colA = colR / colsB;
-        const int colB = colR % colsB;
-
-        const size_t posA = baseA + (size_t)colA * (size_t)2;
-        const size_t posB = baseB + (size_t)colB * (size_t)2;
-        const size_t posR = baseR + (size_t)colR * (size_t)2;
-
-        const float rA = ptrA[posA];
-        const float iA = ptrA[posA + (size_t)1];
-        const float rB = ptrB[posB];
-        const float iB = ptrB[posB + (size_t)1];
-
-        // (rA + i*iA)(rB + i*iB) = (rA*rB - iA*iB) + i*(rA*iB + iA*rB)
-        const float first = rA * rB;
-        const float outer = rA * iB;
-        const float inner = iA * rB;
-        const float lasts = iA * iB;
-
-        // i*i == -1, so the product of the imaginary parts is subtracted.
-        ptrR[posR] = first - lasts;
-        ptrR[posR + (size_t)1] = outer + inner;
-    }
-}
-
-// kernel_knk_2x2: Kronecker product R = A (x) G of a complex matrix A with a
-// 2x2 complex matrix G whose entries are passed by value.
-//
-// A and R are stored row-major as interleaved float pairs, so element
-// (row, col) of a matrix with `cols` columns has its real part at
-// row * (cols * 2) + col * 2 and its imaginary part one float later.
-// R has 2 * colsA columns.
-//
-// G is given as (real, imaginary) pairs in row-major order:
-//   G[0][0] = gate0 + i*gate1      G[0][1] = gate2 + i*gate3
-//   G[1][0] = gate4 + i*gate5      G[1][1] = gate6 + i*gate7
-//
-// Each work-item (global id 0) handles one row of A and writes the two
-// matching rows of R. The id is not range-checked here, so the launch size
-// has to match the row count of A.
-//
-//   ptrR   output matrix
-//   ptrA   input matrix, colsA complex elements per row
-//   rowsA  accepted for a uniform signature; not read by this kernel
-//
-// NOTE(review): the //{gpu_only} tag looks like a marker consumed by external
-// tooling; it is kept verbatim on its original statement.
-__kernel void kernel_knk_2x2
-(
-    __global float* ptrR,
-    __global float* ptrA,
-    const int rowsA,
-    const int colsA,
-    const float gate0,
-    const float gate1,
-    const float gate2,
-    const float gate3,
-    const float gate4,
-    const float gate5,
-    const float gate6,
-    const float gate7
-)
-{
-    const int rowR = get_global_id(0) * 2; //{gpu_only}
-
-    const int rowA = rowR / 2;
-
-    // Offsets are built in size_t so large matrices cannot overflow int.
-    const size_t strideA = (size_t)colsA * (size_t)2;
-    const size_t strideR = strideA * (size_t)2; // R has 2 * colsA columns
-    const size_t baseA = (size_t)rowA * strideA;
-    const size_t top = (size_t)rowR * strideR;  // row rowR of R
-    const size_t bottom = top + strideR;        // row rowR + 1 of R
-
-    for (int colA = 0; colA < colsA; colA++)
-    {
-        const size_t posA = baseA + (size_t)colA * (size_t)2;
-        const float rA = ptrA[posA];
-        const float iA = ptrA[posA + (size_t)1];
-
-        // Element A[rowA][colA] expands into a 2x2 block of R that starts at
-        // column 2 * colA, i.e. 4 floats per source column.
-        const size_t left = (size_t)colA * (size_t)4;
-        const size_t right = left + (size_t)2;
-
-        // Complex product: (rA + i*iA)(rG + i*iG)
-        //                = (rA*rG - iA*iG) + i*(rA*iG + iA*rG)
-
-        // A * G[0][0]
-        ptrR[top + left] = rA * gate0 - iA * gate1;
-        ptrR[top + left + (size_t)1] = rA * gate1 + iA * gate0;
-
-        // A * G[0][1]
-        ptrR[top + right] = rA * gate2 - iA * gate3;
-        ptrR[top + right + (size_t)1] = rA * gate3 + iA * gate2;
-
-        // A * G[1][0]
-        ptrR[bottom + left] = rA * gate4 - iA * gate5;
-        ptrR[bottom + left + (size_t)1] = rA * gate5 + iA * gate4;
-
-        // A * G[1][1]
-        ptrR[bottom + right] = rA * gate6 - iA * gate7;
-        ptrR[bottom + right + (size_t)1] = rA * gate7 + iA * gate6;
-    }
-}