From: miha-q <>
Date: Fri, 8 Mar 2024 02:22:15 +0000 (-0500)
Subject: Thu Mar  7 09:22:15 PM EST 2024
X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=0e138ded8a218544578d983270edd649fae095c6;p=QAnsel.git

Thu Mar  7 09:22:15 PM EST 2024
---

diff --git a/examples/slow.txt b/examples/slow.txt
index f4126d6..ecdb1f1 100644
--- a/examples/slow.txt
+++ b/examples/slow.txt
@@ -1,5 +1,5 @@
 //designed to be slow
-qreg q[14];
+qreg q[11];
 x q[0];
 x q[1];
 x q[2];
@@ -11,8 +11,8 @@ x q[7];
 x q[8];
 x q[9];
 x q[10];
-x q[11];
-x q[12];
-x q[13];
+//x q[11];
+//x q[12];
+//x q[13];
 
-born;
\ No newline at end of file
+//born;
\ No newline at end of file
diff --git a/src/QAnsel.c b/src/QAnsel.c
index 851ec99..a7ae070 100644
--- a/src/QAnsel.c
+++ b/src/QAnsel.c
@@ -10,13 +10,17 @@
 unsigned char HIDDEN_VARIABLE = 0;
 FILE* RANDOM_FILE;
 #define GPU_ENABLED
-unsigned char USE_GPU = 1;
 unsigned char USE_THREADS = 1;
-#define SPEED_TEST
+#define MODE_BARE 1
+#define MODE_THREADED 2
+#define MODE_METAL 3
+#define MODE_METAL_THREADED 4
+unsigned char MODE = MODE_BARE;
+//#define SPEED_TEST
 
 typedef struct
 {
-	char n[128];//2082378
+	char n[128];
 	unsigned char q0, q1, q2;
 	float arg0, arg1, arg2;
 } QInstr;
@@ -223,22 +227,6 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr
 		cpx_mtx_knk_metal_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
 		us2 = get_time();
 		printf("\tMetal2x2: %lu\n", us2 - us1);
-
-		us1 = get_time();
-		for (int i = 0; i < filter.rows; i++)
-		{
-			for (int j = 0; j < filter.cols; j++)
-			{
-				int x = (j * 2) + (i * filter.cols * 2);
-				int y = (i * 2) + (j * filter.cols * 2);
-				filter.ptr[x] = filter.ptr[y];
-				filter.ptr[x + 1] = filter.ptr[y + 1];
-			}
-		}
-		us2 = get_time();
-		printf("\tTranspose: %lu\n", us2 - us1);
-
-
 		us1 = get_time();
 		cpx_mtx_knk_threads(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
 		us2 = get_time();
@@ -261,11 +249,11 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr
 		//us2 = get_time();
 		//printf("\tTranspose: %lu\n", us2 - us1);
 		#else
-		if (USE_GPU && tmp.rows >= 512)
+		if (MODE == MODE_METAL && tmp.cols >= 64)
 		{
 			cpx_mtx_knk_metal_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
 		}
-		else if (USE_THREADS && tmp.rows >= 512)
+		else if ((MODE == MODE_THREADED || MODE == MODE_METAL_THREADED) && tmp.cols >= 64)
 		{
 			cpx_mtx_knk_threads_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
 		}
@@ -299,11 +287,11 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr
 	us2 = get_time();
 	printf("\tBare: %lu\n", us2 - us1);
 	#else
-	if (USE_GPU && tmp.cols >= 64)
+	if ((MODE == MODE_METAL || MODE == MODE_METAL_THREADED) && tmp.cols >= 64)
 	{
 		cpx_mtx_dot_metal(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols);
 	}
-	else if (USE_THREADS && tmp.cols >= 512)
+	else if (MODE == MODE_THREADED && tmp.cols >= 64)
 	{
 		cpx_mtx_dot_threads(tmp.ptr, stateVector->ptr, filter.ptr, stateVector->rows, stateVector->cols, filter.rows, filter.cols);
 	}
@@ -1434,12 +1422,15 @@ void process(int argc, char** argv)
 
 void main(int argc, char** argv)
 {
-	USE_GPU = cpx_mtx_begin();
+	MODE = MODE_METAL; //requested backend; downgraded below when cpx_mtx_begin() returns 0
+	unsigned char err = cpx_mtx_begin(); //a return of 0 is treated as "Metal backend unavailable" by the check below
+	if (err == 0 && (MODE == MODE_METAL_THREADED || MODE == MODE_METAL))
+	{
+		MODE = MODE == MODE_METAL_THREADED ? MODE_THREADED : MODE_BARE; //keep threading if it was requested, otherwise run bare
+	}
 	RANDOM_FILE = fopen("/dev/TrueRNG0", "r");
 	if (!RANDOM_FILE) RANDOM_FILE = fopen("/dev/random", "r");
-	USE_GPU = 0;
-	USE_THREADS = 1;
 	process(argc, argv);
 	fclose(RANDOM_FILE);
-	if (USE_GPU) cpx_mtx_clean();
+	if (MODE == MODE_METAL || MODE == MODE_METAL_THREADED) cpx_mtx_clean(); //compare MODE: the bare constants (3 || 4) are always true, which ran cleanup even after the fallback
 }
\ No newline at end of file
diff --git a/src/complex.c b/src/complex.c
index 722eef4..f9a57b7 100644
--- a/src/complex.c
+++ b/src/complex.c
@@ -575,6 +575,57 @@ void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col
 	err = clReleaseMemObject(memR); gpuerr(err);
 }
 
+/*typedef struct
+{
+	float* ptr;
+	cl_mem* buff;
+	size_t* buff_size;
+	size_t offset;
+	size_t count;
+} cpx_copy_context;
+
+void* cpx_copy_run(void *context)
+{
+	cpx_copy_context* ccc = (cpx_copy_context*)context;
+	cl_int err = clEnqueueWriteBuffer(cpx_mtx_command_queue, *(ccc->buff), CL_FALSE, ccc->offset, ccc->count, ccc->ptr, 0, NULL, NULL);
+	gpuerr(err);
+}
+
+void cpx_copy(float* ptr, cl_mem* buff, size_t* buff_size)
+{
+	int delimeter = (int)(*buff_size);
+	int cores = get_core_count();
+	int threadCount = cores;
+	if (threadCount > delimeter) threadCount = delimeter;
+	int delimetersPerThread = delimeter / threadCount;
+	int leftOvers = delimeter % threadCount;
+
+	cpx_copy_context ctxs[threadCount];
+    pthread_t threads[threadCount];
+
+	for (int i = 0; i < threadCount; i++)
+	{
+		ctxs[i].ptr = ptr;
+		ctxs[i].buff = buff;
+		ctxs[i].buff_size = buff_size;
+		ctxs[i].offset = i * delimetersPerThread;
+		ctxs[i].count = delimetersPerThread + ((i == threadCount - 1) ? leftOvers : 0);
+        if (pthread_create(&(threads[i]), NULL, &cpx_copy_run, (void*)&(ctxs[i])))
+        {
+            fprintf(stderr, "QAnsel: Thread error. (3)\n");
+            exit(1);
+        }
+	}
+    for (uint32_t i = 0; i < threadCount; i++)
+    {
+        if (pthread_join(threads[i], NULL))
+        {
+            fprintf(stderr, "QAnsel: Thread error. (4)\n");
+        }
+    }
+	clFlush(cpx_mtx_command_queue);
+}*/
+
 void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
 {
 	int rowsR = rowsA * rowsB;
@@ -594,8 +645,7 @@ void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int col
     gpuerr(err);
 	err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL);
     gpuerr(err);
-	printf("%lu!!!!\n", get_time() - q);
-
+	
 	//Load and compile program
 	cl_program program;
 	if (cpx_mtx_cache == NULL)