From: miha-q <>
Date: Thu, 7 Mar 2024 23:16:15 +0000 (-0500)
Subject: Thu Mar  7 06:16:15 PM EST 2024
X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=169afd37e117e83c47804895e4e25cf4728b8d04;p=QAnsel.git

Thu Mar  7 06:16:15 PM EST 2024
---

diff --git a/src/QAnsel.c b/src/QAnsel.c
index 5f4ef13..ac7e82b 100644
--- a/src/QAnsel.c
+++ b/src/QAnsel.c
@@ -223,23 +223,7 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr
 		cpx_mtx_knk_metal_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
 		us2 = get_time();
 		printf("\tMetal2x2: %lu\n", us2 - us1);
-		us1 = get_time();
-		cpx_mtx_knk_metal_2x2_R(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
-		us2 = get_time();
-		printf("\tMetal2x2_R: %lu\n", us2 - us1);
 
-		if (filter.rows / 1024 == 0)
-		{
-			printf("\tMetal2x2_Rx4: Invalid\n");
-		}
-		else
-		{
-			us1 = get_time();
-			cpx_mtx_knk_metal_2x2_Rx4(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols);
-			us2 = get_time();
-			printf("\tMetal2x2_Rx4: %lu\n", us2 - us1);
-		}
-		
 		us1 = get_time();
 		for (int i = 0; i < filter.rows; i++)
 		{
diff --git a/src/complex.c b/src/complex.c
index 79e1e42..02f8825 100644
--- a/src/complex.c
+++ b/src/complex.c
@@ -144,37 +144,13 @@ void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, in
     }
 }
 
-
 void cpx_mtx_knk_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
 {
 	int rowsR = rowsA * rowsB;
 	int colsR = colsA * colsB;
     for (int i = 0; i < rowsR / 2; i++)
     {
-		for (int j = 0; j < colsR / 2; j++)
-		{
-			kernel_knk_2x2(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i, j);
-		}
-    }
-}
-
-void cpx_mtx_knk_2x2_R(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
-{
-	int rowsR = rowsA * rowsB;
-	int colsR = colsA * colsB;
-    for (int i = 0; i < rowsR / 2; i++)
-    {
-			kernel_knk_2x2_R(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i);
-    }
-}
-
-void cpx_mtx_knk_2x2_Rx4(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
-{
-	int rowsR = rowsA * rowsB;
-	int colsR = colsA * colsB;
-    for (int i = 0; i < rowsR / (2 * 1024); i++)
-    {
-			kernel_knk_2x2_Rx4(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i);
+			kernel_knk_2x2(ptrR, ptrA, rowsA, colsA, ptrB[0], ptrB[1], ptrB[2], ptrB[3], ptrB[4], ptrB[5], ptrB[6], ptrB[7], i);
     }
 }
 
@@ -632,109 +608,6 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int
 
 	//Setup kernel
 
-	float gate0 = ptrB[0];
-	float gate1 = ptrB[1];
-	float gate2 = ptrB[2];
-	float gate3 = ptrB[3];
-	float gate4 = ptrB[4];
-	float gate5 = ptrB[5];
-	float gate6 = ptrB[6];
-	float gate7 = ptrB[7];
-	cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2", &err); gpuerr(clCreateKernel);
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg);
-
-	//Run the program
-	err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, (size_t[]){rowsR / 2, colsR / 2}, NULL, 0, NULL, NULL);
-	gpuerr(clEnqueueNDRangeKernel);
-
-	//Wait for completion
-	err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush);
-	err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish);
-
-	//Read results
-	err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
-	gpuerr(clEnqueueReadBuffer);
-
-	//Clean up
-	err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
-	err = clReleaseProgram(program); gpuerr(clReleaseProgram);
-	err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
-	err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
-}
-
-//This only works if ptrA is NxM where both N and X are divisible by two,
-//	and ptrB is 2x2. If both are true, this is much more efficient than
-//	the standard knk_metal() function. 
-void cpx_mtx_knk_metal_2x2_R(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
-{
-	int rowsR = rowsA * rowsB;
-	int colsR = colsA * colsB;
-	//Create buffers
-	size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
-	size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);
-	cl_int err;
-	cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer);
-	cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer);
-	
-	//Populate buffers
-	err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
-    gpuerr(clEnqueueWriteBuffer);
-
-	//Load and compile program
-	cl_program program;
-	if (cpx_mtx_cache == NULL)
-	{
-		program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err);
-        gpuerr(clCreateProgramWithSource);
-		err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
-		if (err != CL_SUCCESS)
-		{
-			fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
-			size_t log_size;
-			clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-			char* log = malloc(log_size);
-			clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
-			printf("%s", log);
-			free(log);
-			exit(1);
-		}
-		err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL);
-        gpuerr(clGetProgramInfo);
-		cpx_mtx_cache = malloc(cpx_mtx_cache_len);
-		clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL);
-        gpuerr(clGetProgramInfo);
-	}
-	else
-	{
-		program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err);
-		gpuerr(clCreateProgramWithBinary);
-		err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
-		if (err != CL_SUCCESS)
-		{
-			fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
-			size_t log_size;
-			clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-			char* log = malloc(log_size);
-			clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
-			printf("%s", log);
-			free(log);
-			exit(1);
-		}
-	}
-
-	//Setup kernel
-
 	float gate0 = ptrB[0];
 	float gate1 = ptrB[1];
 	float gate2 = ptrB[2];
@@ -777,113 +650,4 @@ void cpx_mtx_knk_metal_2x2_R(float* ptrR, float* ptrA, float* ptrB, int rowsA, i
 }
 
 
-//This only works if ptrA is NxM where both N and X are divisible by two,
-//	and ptrB is 2x2. If both are true, this is much more efficient than
-//	the standard knk_metal() function. 
-void cpx_mtx_knk_metal_2x2_Rx4(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
-{
-	int rowsR = rowsA * rowsB;
-	int colsR = colsA * colsB;
-	//Create buffers
-	size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
-	size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);
-	cl_int err;
-	cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer);
-	cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer);
-	
-	//Populate buffers
-	err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
-    gpuerr(clEnqueueWriteBuffer);
-
-	//Load and compile program
-	cl_program program;
-	if (cpx_mtx_cache == NULL)
-	{
-		program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err);
-        gpuerr(clCreateProgramWithSource);
-		err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
-		if (err != CL_SUCCESS)
-		{
-			fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
-			size_t log_size;
-			clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-			char* log = malloc(log_size);
-			clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
-			printf("%s", log);
-			free(log);
-			exit(1);
-		}
-		err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL);
-        gpuerr(clGetProgramInfo);
-		cpx_mtx_cache = malloc(cpx_mtx_cache_len);
-		clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL);
-        gpuerr(clGetProgramInfo);
-	}
-	else
-	{
-		program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err);
-		gpuerr(clCreateProgramWithBinary);
-		err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
-		if (err != CL_SUCCESS)
-		{
-			fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
-			size_t log_size;
-			clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-			char* log = malloc(log_size);
-			clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
-			printf("%s", log);
-			free(log);
-			exit(1);
-		}
-	}
-
-	//Setup kernel
-
-	float gate0 = ptrB[0];
-	float gate1 = ptrB[1];
-	float gate2 = ptrB[2];
-	float gate3 = ptrB[3];
-	float gate4 = ptrB[4];
-	float gate5 = ptrB[5];
-	float gate6 = ptrB[6];
-	float gate7 = ptrB[7];
-	cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2_Rx4", &err); gpuerr(clCreateKernel);
-	err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg);
-	err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg);
-
-	//Run the program
-	err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / (2 * 1024)}, NULL, 0, NULL, NULL);
-	gpuerr(clEnqueueNDRangeKernel);
-
-	//Wait for completion
-	err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush);
-	err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish);
-
-	//Read results
-	err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
-	gpuerr(clEnqueueReadBuffer);
-
-	//Clean up
-	err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
-	err = clReleaseProgram(program); gpuerr(clReleaseProgram);
-	err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
-	err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
-}
-
-
-/*-----------------------------------------------------------------------------------*/
-
-
-
-
 #endif
\ No newline at end of file
diff --git a/src/kernel.cl b/src/kernel.cl
index 12646ed..d11e287 100644
--- a/src/kernel.cl
+++ b/src/kernel.cl
@@ -87,78 +87,6 @@ __kernel void kernel_knk
 }
 
 __kernel void kernel_knk_2x2
-(
-    __global float* ptrR,
-    __global float* ptrA,
-    const int rowsA,
-    const int colsA,
-    const float gate0,
-    const float gate1,
-    const float gate2,
-    const float gate3,
-    const float gate4,
-    const float gate5,
-    const float gate6,
-    const float gate7
-    , const int get_global_id_0 //{cpu_only}
-    , const int get_global_id_1 //{cpu_only}
-)
-{
-    const int rowsR = rowsA * 2;
-    const int colsR = colsA * 2;
-    const int rowR = get_global_id(0) * 2; //{gpu_only}
-    const int colR = get_global_id(1) * 2; //{gpu_only}
-    const int rowR = get_global_id_0 * 2; //{cpu_only}
-    const int colR = get_global_id_1 * 2; //{cpu_only}
-
-    const int rowA = rowR / 2;
-    const int colA = colR / 2;
-    const int posA = rowA * (colsA * 2) + (colA * 2);
-    const float rA = ptrA[posA];
-    const float iA = ptrA[posA + 1];
-
-    for (int i = 0; i < 4; i++)
-    {
-        float rB, iB;
-        switch (i)
-        {
-            case 0: rB = gate0; iB = gate1; break;
-            case 1: rB = gate2; iB = gate3; break;
-            case 2: rB = gate4; iB = gate5; break;
-            case 3: rB = gate6; iB = gate7; break;
-        }
-
-        //(rA + iA)(rB + iB)
-        const float first = rA * rB;
-        const float outer = rA * iB;
-        const float inner = iA * rB;
-        const float lasts = iA * iB;
-        switch (i)
-        {
-            case 0:
-                ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts;
-                ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner;
-            break;
-            case 1:
-                ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts;
-                ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner;
-            break;
-            case 2:
-                ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts;
-                ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner;
-            break;
-            case 3:
-                ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts;
-                ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner;
-            break;
-        }
-
-    }
-
-}
-
-
-__kernel void kernel_knk_2x2_R
 (
     __global float* ptrR,
     __global float* ptrA,
@@ -227,77 +155,3 @@ __kernel void kernel_knk_2x2_R
         }
     }
 }
-
-
-__kernel void kernel_knk_2x2_Rx4
-(
-    __global float* ptrR,
-    __global float* ptrA,
-    const int rowsA,
-    const int colsA,
-    const float gate0,
-    const float gate1,
-    const float gate2,
-    const float gate3,
-    const float gate4,
-    const float gate5,
-    const float gate6,
-    const float gate7
-    , const int get_global_id_0 //{cpu_only}
-)
-{
-    const int rowsR = rowsA * 2;
-    const int colsR = colsA * 2;
-    const int block = get_global_id(0) * 2 * 1024; //{gpu_only}
-    const int block = get_global_id_0 * 2 * 1024; //{cpu_only}
-
-    for (int rowR = block; rowR < block + 2 * 1024; rowR += 2)
-    {
-        for (int colR = 0; colR < colsR; colR += 2)
-        {
-            const int rowA = rowR / 2;
-            const int colA = colR / 2;
-            const int posA = rowA * (colsA * 2) + (colA * 2);
-            const float rA = ptrA[posA];
-            const float iA = ptrA[posA + 1];
-
-            for (int i = 0; i < 4; i++)
-            {
-                float rB, iB;
-                switch (i)
-                {
-                    case 0: rB = gate0; iB = gate1; break;
-                    case 1: rB = gate2; iB = gate3; break;
-                    case 2: rB = gate4; iB = gate5; break;
-                    case 3: rB = gate6; iB = gate7; break;
-                }
-
-                //(rA + iA)(rB + iB)
-                const float first = rA * rB;
-                const float outer = rA * iB;
-                const float inner = iA * rB;
-                const float lasts = iA * iB;
-                switch (i)
-                {
-                    case 0:
-                        ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts;
-                        ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner;
-                    break;
-                    case 1:
-                        ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts;
-                        ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner;
-                    break;
-                    case 2:
-                        ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts;
-                        ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner;
-                    break;
-                    case 3:
-                        ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts;
-                        ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner;
-                    break;
-                }
-
-            }
-        }
-    }
-}
\ No newline at end of file