#include <pthread.h>
#include <string.h>
#include "cores.c"
-#define GPU_ENABLED
// Pair of single-precision floats: one field named real, one named imaginary.
typedef struct
{
float real, imaginary;
}
}
-//This is for testing GPU functions on the CPU
// Define the __kernel and __global qualifiers as empty macros so they expand
// to nothing when the code is compiled as ordinary C.
#define __kernel
#define __global
// Three integer globals, one per index 0..2.
// NOTE(review): presumably stand-ins for get_global_id(0..2) when a kernel is
// run on the CPU -- confirm against the code that reads them.
int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2;
}
}
-#ifdef GPU_ENABLED
+/*THREADED*/
+
+/*-----------------------------------------------------------------------------------*/
+
+/*METAL*/
+/*-----------------------------------------------------------------------------------*/
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#define CL_TARGET_OPENCL_VERSION 300
#include <CL/cl.h>
// OpenCL command queue used by this file: cpx_mtx_dot_metal() enqueues its
// kernel on it and cpx_mtx_clean() releases it.
cl_command_queue cpx_mtx_command_queue;
// Byte buffer, initially NULL; cpx_mtx_clean() passes it to free().
unsigned char* cpx_mtx_cache = NULL;
// Starts at 0.
// NOTE(review): presumably the size of cpx_mtx_cache -- confirm where it is written.
size_t cpx_mtx_cache_len = 0;
-#endif
// cpx_mtx_begin: takes no arguments and returns a uint8_t status.
// Returns 0 on the path that prints a "GPU error" message to stderr,
// and 1 when execution reaches the end of the function.
// NOTE(review): only part of the body is visible in this excerpt; the OpenCL
// calls that use `count` and `err` are not shown.
uint8_t cpx_mtx_begin()
{
- #ifdef GPU_ENABLED
cl_uint count;
cl_int err;
// NOTE(review): the message names clReleaseContext(); the call it reports on
// is not visible here -- confirm the text matches the failing call.
fprintf(stderr, "GPU error: clReleaseContext() failed.\n");
return 0;
}
- #endif
return 1;
}
// cpx_mtx_clean: releases the file-level OpenCL command queue and frees
// cpx_mtx_cache. Takes no arguments and returns nothing; failures are only
// reported on stderr.
// NOTE(review): only part of the body is visible in this excerpt.
void cpx_mtx_clean()
{
- #ifdef GPU_ENABLED
cl_int err;
// Release the command queue and check the returned status code.
err = clReleaseCommandQueue(cpx_mtx_command_queue);
if (err != CL_SUCCESS)
// NOTE(review): this message names clReleaseContext(), not the
// clReleaseCommandQueue() call checked just above; lines may be missing
// between the two -- confirm against the full file.
fprintf(stderr, "GPU error: clReleaseContext() failed.\n");
}
// The pointer is not reset after being freed.
free(cpx_mtx_cache);
- #endif
}
-#ifdef GPU_ENABLED
// cpx_mtx_dot_metal: sets up and launches an OpenCL kernel on
// cpx_mtx_command_queue, then releases the device memory objects it used.
//   ptrR, ptrA, ptrB - float buffers; NOTE(review): presumably result and the
//                      two operands, by name -- confirm with the hidden setup code.
//   rowsA, colsB     - integer dimensions; colsB is passed as kernel argument 8.
//   shared           - copied into colsA below.
// NOTE(review): only part of the body is visible in this excerpt; `err`,
// `kernel`, `rowsR`, `colsR`, `memB`, `memR` and the gpuerr() helper are
// declared in lines not shown here.
void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared)
{
int colsA = shared;
err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg);
//Run the program
// The replaced (-) lines launched a 2-D range of rowsR x colsR work-items;
// the added (+) lines launch a 1-D range of rowsR work-items instead.
- size_t work_size[] = {rowsR, colsR};
- err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
+ size_t work_size[] = {rowsR};
+ err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, work_size, NULL, 0, NULL, NULL);
gpuerr(clEnqueueNDRangeKernel);
//Wait for completion
// Release the memory objects for B and R.
err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject);
err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
}
-#endif
+/*-----------------------------------------------------------------------------------*/
+
+
+
-#endif
+#endif
\ No newline at end of file
)
{
int rowR = get_global_id(0);
- int colR = get_global_id(1);
-
- int rowA = rowR / rowsB;
- int colA = colR / colsB;
- int rowB = rowR % rowsB;
- int colB = colR % colsB;
+ for (int colR = 0; colR < colsR; colR++)
+ {
+ int rowA = rowR / rowsB;
+ int colA = colR / colsB;
+ int rowB = rowR % rowsB;
+ int colB = colR % colsB;
- int posA = rowA * (colsA * 2) + (colA * 2);
- int posB = rowB * (colsB * 2) + (colB * 2);
+ int posA = rowA * (colsA * 2) + (colA * 2);
+ int posB = rowB * (colsB * 2) + (colB * 2);
- float rA = ptrA[posA];
- float iA = ptrA[posA + 1];
- float rB = ptrB[posB];
- float iB = ptrB[posB + 1];
+ float rA = ptrA[posA];
+ float iA = ptrA[posA + 1];
+ float rB = ptrB[posB];
+ float iB = ptrB[posB + 1];
- //(rA + iA)(rB + iB)
- float first = rA * rB;
- float outer = rA * iB;
- float inner = iA * rB;
- float lasts = iA * iB;
- ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts;
- ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner;
+ //(rA + iA)(rB + iB)
+ float first = rA * rB;
+ float outer = rA * iB;
+ float inner = iA * rB;
+ float lasts = iA * iB;
+ ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts;
+ ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner;
+ }
}