tmp.rows = filter.rows * gate.rows;
tmp.cols = filter.cols * gate.cols;
- //tmp.ptr = malloc((tmp.rows * 2) * (tmp.cols * 2) * sizeof(float));
tmp.ptr = malloc(tmp.rows * (tmp.cols * 2) * sizeof(float));
#ifdef GPU_ENABLED
if (USE_GPU/* && (tmp.rows >= 512 || tmp.cols >= 512)*/)
{
- cpx_mtx_knk
- (
- tmp.ptr, filter.ptr, gate.ptr,
- tmp.rows, tmp.cols,
- filter.rows, filter.cols,
- gate.rows, gate.cols
- );
- //cpx_ncpx_knk_mt
- //(
- // tmp.ptr, tmp.rows, tmp.cols,
- // filter.ptr, filter.rows, filter.cols,
- // gate.ptr, gate.rows, gate.cols
- //);
+ cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
}
else
{
- cpx_mtx_knk
- (
- tmp.ptr, filter.ptr, gate.ptr,
- tmp.rows, tmp.cols,
- filter.rows, filter.cols,
- gate.rows, gate.cols
- );
+ cpx_mtx_knk(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
}
#else
- //cpx_ncpx_knk_mt
- //(
- // tmp.ptr, tmp.rows, tmp.cols,
- // filter.ptr, filter.rows, filter.cols,
- // gate.ptr, gate.rows, gate.cols
- //);
cpx_mtx_knk
(
tmp.ptr, filter.ptr, gate.ptr,
}
}
-void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB)
+void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
+ int rowsR = rowsA * rowsB;
+ int colsR = colsA * colsB;
for (int i = 0; i < rowsR; i++)
{
GPU_GLOBAL_ID_0 = i;
err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
}
-void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB)
+void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
+ int rowsR = rowsA * rowsB;
+ int colsR = colsA * colsB;
//Create buffers
size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);