unsigned char HIDDEN_VARIABLE = 0;
FILE* RANDOM_FILE;
#define GPU_ENABLED
-unsigned char USE_GPU = 1;
unsigned char USE_THREADS = 1;
-#define SPEED_TEST
+#define MODE_BARE 1 // matches none of the Metal/threaded dispatch checks; also the fallback main() picks when cpx_mtx_begin() returns 0
+#define MODE_THREADED 2 // selects cpx_mtx_knk_threads_2x2 and cpx_mtx_dot_threads once tmp.cols >= 64
+#define MODE_METAL 3 // selects cpx_mtx_knk_metal_2x2 and cpx_mtx_dot_metal once tmp.cols >= 64
+#define MODE_METAL_THREADED 4 // mixed: cpx_mtx_knk_threads_2x2 for the Kronecker step, cpx_mtx_dot_metal for the dot step (tmp.cols >= 64)
+unsigned char MODE = MODE_BARE; // active backend selector (one of MODE_*); main() overwrites this default before processing
+//#define SPEED_TEST
typedef struct
{
- char n[128];//2082378
+ char n[128]; // fixed 128-byte text buffer; presumably the instruction name - TODO confirm against the code that fills QInstr
unsigned char q0, q1, q2;
float arg0, arg1, arg2;
} QInstr;
cpx_mtx_knk_metal_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
us2 = get_time();
printf("\tMetal2x2: %lu\n", us2 - us1);
-
- us1 = get_time();
- for (int i = 0; i < filter.rows; i++)
- {
- for (int j = 0; j < filter.cols; j++)
- {
- int x = (j * 2) + (i * filter.cols * 2);
- int y = (i * 2) + (j * filter.cols * 2);
- filter.ptr[x] = filter.ptr[y];
- filter.ptr[x + 1] = filter.ptr[y + 1];
- }
- }
- us2 = get_time();
- printf("\tTranspose: %lu\n", us2 - us1);
-
-
us1 = get_time();
cpx_mtx_knk_threads(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
us2 = get_time();
//us2 = get_time();
//printf("\tTranspose: %lu\n", us2 - us1);
#else
- if (USE_GPU && tmp.rows >= 512)
+ if (MODE == MODE_METAL && tmp.cols >= 64)
{
cpx_mtx_knk_metal_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
}
- else if (USE_THREADS && tmp.rows >= 512)
+ else if ((MODE == MODE_THREADED || MODE == MODE_METAL_THREADED) && tmp.cols >= 64)
{
cpx_mtx_knk_threads_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
}
us2 = get_time();
printf("\tBare: %lu\n", us2 - us1);
#else
- if (USE_GPU && tmp.cols >= 64)
+ if ((MODE == MODE_METAL || MODE == MODE_METAL_THREADED) && tmp.cols >= 64)
{
cpx_mtx_dot_metal(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols);
}
- else if (USE_THREADS && tmp.cols >= 512)
+ else if (MODE == MODE_THREADED && tmp.cols >= 64)
{
cpx_mtx_dot_threads(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols);
}
+// Program entry point: picks the compute backend, opens the randomness source,
+// hands argc/argv to process(), then releases the file and the GPU state.
+// NOTE(review): the return type is void and neither fopen() result is checked
+// before process()/fclose() use RANDOM_FILE - confirm whether that is intended.
void main(int argc, char** argv)
{
- USE_GPU = cpx_mtx_begin();
+ // Ask for the Metal backend first; it is downgraded below if GPU setup reports failure.
+ MODE = MODE_METAL;
+ // Despite the name, a value of 0 here is the failure case: it triggers the downgrade.
+ unsigned char err = cpx_mtx_begin();
+ if (err == 0 && (MODE == MODE_METAL_THREADED || MODE == MODE_METAL))
+ {
+ // Map each Metal mode to its non-Metal counterpart.
+ MODE = MODE == MODE_METAL_THREADED ? MODE_THREADED : MODE_BARE;
+ }
RANDOM_FILE = fopen("/dev/TrueRNG0", "r");
if (!RANDOM_FILE) RANDOM_FILE = fopen("/dev/random", "r");
- USE_GPU = 0;
- USE_THREADS = 1;
process(argc, argv);
fclose(RANDOM_FILE);
- if (USE_GPU) cpx_mtx_clean();
+ // Compare MODE against the constants: the bare macros are nonzero (3 and 4),
+ // so testing them directly would run the cleanup even after the fallback above.
+ // NOTE(review): assumes process() leaves MODE as set here - confirm.
+ if (MODE == MODE_METAL || MODE == MODE_METAL_THREADED) cpx_mtx_clean();
}
\ No newline at end of file
err = clReleaseMemObject(memR); gpuerr(err);
}
+/*typedef struct
+{
+ float* ptr;
+ cl_mem* buff;
+ size_t* buff_size;
+ size_t offset;
+ size_t count;
+} cpx_copy_context;
+
+void* cpx_copy_run(void *context)
+{
+ cpx_copy_context* ccc = (cpx_copy_context*)context;
+ cl_int err = clEnqueueWriteBuffer(cpx_mtx_command_queue, *(ccc->buff), CL_FALSE, ccc->offset, ccc->count, ccc->ptr, 0, NULL, NULL);
+ gpuerr(err);
+}
+
+void cpx_copy(float* ptr, cl_mem* buff, size_t* buff_size)
+{
+ int delimeter = (int)(*buff_size);
+ int cores = get_core_count();
+ int threadCount = cores;
+ if (threadCount > delimeter) threadCount = delimeter;
+ int delimetersPerThread = delimeter / threadCount;
+ int leftOvers = delimeter % threadCount;
+
+ cpx_copy_context ctxs[threadCount];
+ pthread_t threads[threadCount];
+
+ for (int i = 0; i < threadCount; i++)
+ {
+ ctxs[i].ptr = ptr;
+ ctxs[i].buff = buff;
+ ctxs[i].buff_size = buff_size;
+ ctxs[i].offset = i * delimetersPerThread;
+ ctxs[i].count = delimetersPerThread + ((i == threadCount - 1) ? leftOvers : 0);
+ if (pthread_create(&(threads[i]), NULL, &cpx_copy_run, (void*)&(ctxs[i])))
+ {
+ fprintf(stderr, "QAnsel: Thread error. (3)\n");
+ exit(1);
+ }
+ }
+ for (uint32_t i = 0; i < threadCount; i++)
+ {
+ if (pthread_join(threads[i], NULL))
+ {
+ fprintf(stderr, "QAnsel: Thread error. (4)\n");
+ }
+ }
+ clFlush(cpx_mtx_command_queue);
+}*/
+
void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
int rowsR = rowsA * rowsB;
gpuerr(err);
err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL);
gpuerr(err);
- printf("%lu!!!!\n", get_time() - q);
-
+
//Load and compile program
cl_program program;
if (cpx_mtx_cache == NULL)