all:
- #gpu mmul
- mv src/kernel2.cl src/.kernel2.cl
- bash -c 'echo -ne "$$(cat src/.kernel2.cl)\x00" > src/kernel2.cl'
- xxd -i src/kernel2.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/kernel2.cl.c
- mv src/.kernel2.cl src/kernel2.cl
-
- #gpu mmul
- #mv src/gpu/kernel.cl src/gpu/.kernel.cl
- #bash -c 'echo -ne "$$(cat src/gpu/.kernel.cl)\x00" > src/gpu/kernel.cl'
- #xxd -i src/gpu/kernel.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/gpu/kernel.cl.c
- #mv src/gpu/.kernel.cl src/gpu/kernel.cl
+ mv src/kernel.cl src/.kernel.cl
+ bash -c 'echo -ne "$$(cat src/.kernel.cl)\x00" > src/kernel.cl'
+ xxd -i src/kernel.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/kernel.cl.c
+ mv src/.kernel.cl src/kernel.cl
gcc src/QAnsel.c -g -o bin/QAnsel -lm -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread
- rm -f src/gpu/*.cl.c
+ rm -f src/*.cl.c
#include <stdlib.h>
#include <math.h>
#include <time.h>
-#include "complex2.c"
-#include "gates2.c"
+#include "complex.c"
+#include "gates.c"
#include "display.c"
#include "chacha20.c"
#define QUBITS_MAX 14
#include <pthread.h>
#include <string.h>
#include "cores.c"
-
+#define GPU_ENABLED
typedef struct
{
float real, imaginary;
typedef struct
{
float *ptr;
- size_t rows, cols;
+ int rows, cols;
} cpx_mtx_t;
uint8_t* cpx_str(cpx_t* n)
{
uint8_t* r;
- size_t z;
+ int z;
float rl = n->real;
float ig = n->imaginary >= 0 ? n->imaginary : -(n->imaginary);
return n;
}
-void cpx_add(cpx_t* r, cpx_t* a, cpx_t* b)
-{
- r->real = a->real + b->real;
- r->imaginary = a->imaginary + b->imaginary;
-}
-
-void cpx_sub(cpx_t* r, cpx_t* a, cpx_t* b)
-{
- r->real = a->real - b->real;
- r->imaginary = a->imaginary - b->imaginary;
-}
-
void cpx_mul(cpx_t* r, cpx_t* a, cpx_t* b)
{
//FOIL
r->imaginary = outer + inner;
}
-//non-complex matrix multiply
-// shared = colsA = rowsB
-void cpx_ncpx_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared)
-{
- size_t colsA = shared;
- size_t rowsB = shared;
- size_t rowsR = rowsA;
- size_t colsR = colsB;
- for (size_t rowR = 0; rowR < rowsR; rowR++)
- {
- for (size_t colR = 0; colR < colsR; colR++)
- {
- size_t posR = colR + rowR * colsR;
- size_t rowA = rowR;
- size_t colB = colR;
- ptrR[posR] = 0;
- for (size_t i = 0; i < shared; i++)
- {
- size_t posA = i + rowA * colsA;
- size_t posB = colB + i * colsB;
- ptrR[posR] += ptrA[posA] * ptrB[posB];
- }
- }
- }
-}
-
-typedef struct
-{
- size_t ID;
- size_t Threads;
- size_t Last;
- size_t Loops;
- size_t Continue;
- size_t BlockSize;
- float* ptrR;
- float* ptrA;
- float* ptrB;
- size_t rowsA;
- size_t colsB;
- size_t shared;
-} cpx_mul_shared;
-
-void* cpx_ncpx_mmul_mtc(void *context)
-{
- cpx_mul_shared* data = (cpx_mul_shared*)context;
- float* ptrR = data->ptrR;
- float* ptrA = data->ptrA;
- float* ptrB = data->ptrB;
- size_t rowsA = data->rowsA;
- size_t colsB = data->colsB;
- size_t shared = data->shared;
-
- size_t colsA = data->shared;
- size_t rowsB = data->shared;
- size_t rowsR = data->rowsA;
- size_t colsR = data->colsB;
-
- for (size_t rowR = 0; rowR < rowsR; rowR++)
- {
- size_t a = data->ID * data->BlockSize;
- size_t b = (data->ID + 1) * data->BlockSize;
- if (data->ID == data->Last) b += data->Continue;
-
- //printf("%i;%i\n", a, b);
-
- for (size_t colR = a; colR < b; colR++)
- {
- size_t posR = colR + rowR * colsR;
- size_t rowA = rowR;
- size_t colB = colR;
- ptrR[posR] = 0;
- for (size_t i = 0; i < data->shared; i++)
- {
- size_t posA = i + rowA * colsA;
- size_t posB = colB + i * colsB;
- data->ptrR[posR] += data->ptrA[posA] * data->ptrB[posB];
- }
- }
- }
-}
-
-void cpx_ncpx_mmul_mt(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared)
-{
- cpx_mul_shared share;
- share.Threads = get_core_count();
- share.ptrR = ptrR;
- share.ptrA = ptrA;
- share.ptrB = ptrB;
- share.rowsA = rowsA;
- share.colsB = colsB;
- share.shared = shared;
- if (colsB <= share.Threads)
- {
- share.Threads = colsB;
- }
- share.BlockSize = (size_t)floor(((float)colsB) / ((float)share.Threads));
- share.Loops = (size_t)floor(((float)colsB) / ((float)share.BlockSize));
- share.Last = share.Loops - 1;
- share.Continue = (size_t)(((float)colsB) - ((float)share.Loops) * ((float)share.BlockSize));
-
- pthread_t threads[share.Loops];
- cpx_mul_shared contexts[share.Loops];
- for (size_t i = 0; i < share.Loops; i++)
- {
- pthread_t tid;
- threads[i] = tid;
- memcpy(contexts + i, &share, sizeof(cpx_mul_shared));
- contexts[i].ID = i;
- }
-
- for (size_t i = 0; i < share.Loops; i++)
- {
- if (pthread_create(threads + i, NULL, &cpx_ncpx_mmul_mtc, contexts + i))
- {
- fprintf(stderr, "QAnsel: Thread error. (1)\n");
- exit(1);
- }
- }
-
- for (uint32_t i = 0; i < share.Loops; i++)
- {
- pthread_t tid;
- tid = threads[i];
- if (pthread_join(tid, NULL))
- {
- fprintf(stderr, "QAnsel: Thread error. (2)\n");
- }
- }
-}
-
-//non-complex kronecker product
-void cpx_ncpx_mknk(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB)
-{
- size_t rowsR = rowsA * rowsB;
- size_t colsR = colsA * colsB;
- for (size_t rowR = 0; rowR < rowsR; rowR++)
- {
- for (size_t colR = 0; colR < colsR; colR++)
- {
- size_t rowA = rowR / rowsB;
- size_t colA = colR / colsB;
- size_t rowB = rowR % rowsB;
- size_t colB = colR % colsB;
- printf(">%i,%i|%i<\n", colR, rowR, colR + rowR * colsR);
- ptrR[colR + rowR * colsR] =
- ptrA[colA + rowA * colsA]
- * ptrB[colB + rowB * colsB]
- ;
- }
- }
-}
-
float cpx_magsqr(cpx_t* n)
{
return (n->real * n->real) + (n->imaginary * n->imaginary);
return sqrt((n->real * n->real) + (n->imaginary * n->imaginary));
}
-void cpx_mtx_mul(cpx_mtx_t* r, cpx_mtx_t* a, cpx_mtx_t* b)
-{
- r->rows = a->rows;
- r->cols = b->cols;
- cpx_ncpx_mmul(r->ptr, a->ptr, b->ptr, a->rows * 2, b->cols * 2, a->cols * 2);
-}
-
-void cpx_mtx_set(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n)
+void cpx_mtx_set(cpx_mtx_t* m, int row, int col, cpx_t* n)
{
- row *= 2;
- col *= 2;
- size_t cols = m->cols * 2;
- m->ptr[col + row * cols] = n->real;
- m->ptr[(col + 1) + row * cols] = -(n->imaginary);
- m->ptr[col + (row + 1) * cols] = n->imaginary;
- m->ptr[(col + 1) + (row + 1) * cols] = n->real;
+ m->ptr[row * ((m->cols) * 2) + (col * 2)] = n->real;
+ m->ptr[row * ((m->cols) * 2) + (col * 2) + 1] = n->imaginary;
}
-void cpx_mtx_set2(cpx_mtx_t* m, size_t row, size_t col, float real, float imaginary)
+void cpx_mtx_set2(cpx_mtx_t* m, int row, int col, float real, float imaginary)
{
- row *= 2;
- col *= 2;
- size_t cols = m->cols * 2;
- m->ptr[col + row * cols] = real;
- m->ptr[(col + 1) + row * cols] = -imaginary;
- m->ptr[col + (row + 1) * cols] = imaginary;
- m->ptr[(col + 1) + (row + 1) * cols] = real;
+ m->ptr[row * ((m->cols) * 2) + (col * 2)] = real;
+ m->ptr[row * ((m->cols) * 2) + (col * 2) + 1] = imaginary;
}
-void cpx_mtx_get(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n)
+void cpx_mtx_get(cpx_mtx_t* m, int row, int col, cpx_t* n)
{
- row *= 2;
- col *= 2;
- size_t cols = m->cols * 2;
-
- //printf("qqq\n");
- n->real = m->ptr[(col + 1) + (row + 1) * cols];
- //printf("ppp\n");
- n->imaginary = m->ptr[col + (row + 1) * cols];
- //printf("ggg\n");
+ n->real = m->ptr[row * ((m->cols) * 2) + (col * 2)];
+ n->imaginary = m->ptr[row * ((m->cols) * 2) + (col * 2) + 1];
}
-float cpx_mtx_get_real(cpx_mtx_t* m, size_t row, size_t col)
-{
- row *= 2;
- col *= 2;
- size_t cols = m->cols * 2;
- return m->ptr[(col + 1) + (row + 1) * cols];
-}
-
-float cpx_mtx_get_imaginary(cpx_mtx_t* m, size_t row, size_t col)
-{
- row *= 2;
- col *= 2;
- size_t cols = m->cols * 2;
- return m->ptr[col + (row + 1) * cols];
-}
-
-void cpx_mtx_init(cpx_mtx_t* m, size_t rows, size_t cols)
+void cpx_mtx_init(cpx_mtx_t* m, int rows, int cols)
{
+ int z = rows * (cols * 2) * sizeof(float);
+ m->ptr = malloc(z);
m->rows = rows;
m->cols = cols;
- size_t z = (rows * 2) * (cols * 2);
- m->ptr = malloc(z * sizeof(float));
- for (size_t i = 0; i < z; i++) m->ptr[i] = 0;
+ memset(m->ptr, 0, z);
}
-void cpx_mtx_expand_row(cpx_mtx_t* m)
-{
- m->rows += 1;
- size_t z = (m->rows * 2) * (m->cols * 2);
- m->ptr = realloc(m->ptr, z * sizeof(float));
-}
-
-
void cpx_mtx_free(cpx_mtx_t* m)
{
if (m->ptr != NULL) free(m->ptr);
m->cols = 0;
}
-typedef struct
-{
- size_t ID;
- size_t Threads;
- size_t Last;
- size_t Loops;
- size_t Continue;
- size_t BlockSize;
- float* ptrR;
- size_t rowsR;
- size_t colsR;
- float* ptrA;
- size_t rowsA;
- size_t colsA;
- float* ptrB;
- size_t rowsB;
- size_t colsB;
-} cpx_knk_shared;
-
-void* cpx_ncpx_knk_mtc(void *context)
+void cpx_mtx_print(cpx_mtx_t* m)
{
- cpx_knk_shared* data = (cpx_knk_shared*)context;
- float* ptrR = data->ptrR;
- size_t rowsR = data->rowsR;
- size_t colsR = data->colsR;
- float* ptrA = data->ptrA;
- size_t rowsA = data->rowsA;
- size_t colsA = data->colsA;
- float* ptrB = data->ptrB;
- size_t rowsB = data->rowsB;
- size_t colsB = data->colsB;
-
- for (size_t rowR = 0; rowR < rowsR; rowR++)
+ for (int r = 0; r < m->rows; r++)
{
- size_t a = data->ID * data->BlockSize;
- size_t b = (data->ID + 1) * data->BlockSize;
- if (data->ID == data->Last) b += data->Continue;
- for (size_t colR = a; colR < b; colR++)
+ if (r > 0) printf("\n");
+ for (int c = 0; c < m->cols; c++)
{
- size_t rowA = rowR / rowsB;
- size_t colA = colR / colsB;
- size_t rowB = rowR % rowsB;
- size_t colB = colR % colsB;
-
- float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
- float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
- float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
- float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
-
- float first = r1 * r2; //real
- float outer = r1 * i2; //imaginary
- float inner = i1 * r2; //imaginary
- float last = -(i1 * i2); //real
- r1 = first + last;
- i1 = outer + inner;
-
- ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
- ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
- ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
- ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
-
+ cpx_t n;
+ cpx_mtx_get(m, r, c, &n);
+ uint8_t* s = cpx_str(&n);
+ if (c > 0) printf(", ");
+ printf("%s", s);
+ free(s);
}
}
}
-void cpx_ncpx_knk_mt
-(
- float* ptrR,
- size_t rowsR,
- size_t colsR,
- float* ptrA,
- size_t rowsA,
- size_t colsA,
- float* ptrB,
- size_t rowsB,
- size_t colsB
-)
+//This is for testing GPU functions on the CPU
+#define __kernel
+#define __global
+int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2;
+int get_global_id(int id)
{
- cpx_knk_shared share;
- share.Threads = get_core_count();
- share.ptrR = ptrR;
- share.rowsR = rowsR;
- share.colsR = colsR;
- share.ptrA = ptrA;
- share.rowsA = rowsA;
- share.colsA = colsA;
- share.ptrB = ptrB;
- share.rowsB = rowsB;
- share.colsB = colsB;
-
- if (colsR <= share.Threads)
+ switch (id)
{
- share.Threads = colsR;
- }
- share.BlockSize = (size_t)floor(((float)colsR) / ((float)share.Threads));
- share.Loops = (size_t)floor(((float)colsR) / ((float)share.BlockSize));
- share.Last = share.Loops - 1;
- share.Continue = (size_t)(((float)colsR) - ((float)share.Loops) * ((float)share.BlockSize));
-
- pthread_t threads[share.Loops];
- cpx_knk_shared contexts[share.Loops];
- for (size_t i = 0; i < share.Loops; i++)
- {
- pthread_t tid;
- threads[i] = tid;
- memcpy(contexts + i, &share, sizeof(cpx_knk_shared));
- contexts[i].ID = i;
+ case 0: return GPU_GLOBAL_ID_0;
+ case 1: return GPU_GLOBAL_ID_1;
+ case 2: return GPU_GLOBAL_ID_2;
}
+}
+#include "kernel.cl"
- for (size_t i = 0; i < share.Loops; i++)
+void cpx_mtx_dot(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared)
+{
+ for (int i = 0; i < rowsA; i++)
{
- if (pthread_create(threads + i, NULL, &cpx_ncpx_knk_mtc, contexts + i))
+ for (int j = 0; j < colsB; j++)
{
- fprintf(stderr, "QAnsel: Thread error. (1)\n");
- exit(1);
+ GPU_GLOBAL_ID_0 = i;
+ GPU_GLOBAL_ID_1 = j;
+ kernel_dot(ptrR, ptrA, ptrB, rowsA, colsB, shared);
}
}
+}
- for (uint32_t i = 0; i < share.Loops; i++)
+void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB)
+{
+ for (int i = 0; i < rowsR; i++)
{
- pthread_t tid;
- tid = threads[i];
- if (pthread_join(tid, NULL))
+ for (int j = 0; j < colsR; j++)
{
- fprintf(stderr, "QAnsel: Thread error. (2)\n");
+ GPU_GLOBAL_ID_0 = i;
+ GPU_GLOBAL_ID_1 = j;
+ kernel_knk(ptrR, ptrA, ptrB, rowsR, colsR, rowsA, colsA, rowsB, colsB);
}
}
}
-void cpx_ncpx_knk
-(
- float* ptrR,
- size_t rowsR,
- size_t colsR,
- float* ptrA,
- size_t rowsA,
- size_t colsA,
- float* ptrB,
- size_t rowsB,
- size_t colsB
-)
+#ifdef GPU_ENABLED
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#define CL_TARGET_OPENCL_VERSION 300
+#include <CL/cl.h>
+#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: " #x "() failed.\n"); exit(1); }
+#include "kernel.cl.c"
+cl_platform_id cpx_mtx_platform_id;
+cl_device_id cpx_mtx_device_id;
+cl_context cpx_mtx_context;
+cl_command_queue cpx_mtx_command_queue;
+unsigned char* cpx_mtx_cache = NULL;
+size_t cpx_mtx_cache_len = 0;
+#endif
+
+uint8_t cpx_mtx_begin()
{
- for (size_t rowR = 0; rowR < rowsR; rowR++)
- {
- for (size_t colR = 0; colR < colsR; colR++)
- {
- size_t rowA = rowR / rowsB;
- size_t colA = colR / colsB;
- size_t rowB = rowR % rowsB;
- size_t colB = colR % colsB;
-
- float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
- float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
- float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
- float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
-
- float first = r1 * r2; //real
- float outer = r1 * i2; //imaginary
- float inner = i1 * r2; //imaginary
- float last = -(i1 * i2); //real
- r1 = first + last;
- i1 = outer + inner;
-
- ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
- ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
- ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
- ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
-
- }
- }
+ #ifdef GPU_ENABLED
+ cl_uint count;
+ cl_int err;
+
+ err = clGetPlatformIDs(1, &cpx_mtx_platform_id, &count);
+ if (err != CL_SUCCESS || count == 0)
+ {
+ if (err == 0)
+ fprintf(stderr, "GPU error: No supported platforms found.\n");
+ else
+ fprintf(stderr, "GPU error: clGetPlatformIDs() failed.\n");
+ return 0;
+ }
+
+ err = clGetDeviceIDs(cpx_mtx_platform_id, CL_DEVICE_TYPE_GPU, 1, &cpx_mtx_device_id, &count);
+ if (err != CL_SUCCESS || count == 0)
+ {
+ if (count == 0)
+ fprintf(stderr, "GPU error: No supported GPUs found.\n");
+ else
+ fprintf(stderr, "GPU error: clGetDeviceIDs() failed.\n");
+ return 0;
+ }
+
+ cpx_mtx_context = clCreateContext(NULL, 1, &cpx_mtx_device_id, NULL, NULL, &err);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU error: clCreateContext() failed.\n");
+ return 0;
+ }
+
+ cpx_mtx_command_queue = clCreateCommandQueue(cpx_mtx_context, cpx_mtx_device_id, 0, &err);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU error: clCreateCommandQueue() failed.\n");
+ err = clReleaseContext(cpx_mtx_context);
+ if (err != CL_SUCCESS)
+ fprintf(stderr, "GPU error: clReleaseContext() failed.\n");
+ return 0;
+ }
+ #endif
+ return 1;
}
-void cpx_mtx_knk(cpx_mtx_t* r, cpx_mtx_t* a, cpx_mtx_t* b)
+void cpx_mtx_clean()
{
- size_t rowsA = a->rows;
- size_t colsA = a->cols;
- size_t rowsB = b->rows;
- size_t colsB = b->cols;
- size_t rowsR = rowsA * rowsB;
- size_t colsR = colsA * colsB;
- for (size_t rowR = 0; rowR < rowsR; rowR++)
- {
- for (size_t colR = 0; colR < colsR; colR++)
- {
- size_t rowA = rowR / rowsB;
- size_t colA = colR / colsB;
- size_t rowB = rowR % rowsB;
- size_t colB = colR % colsB;
- cpx_t n1, n2;
-
- cpx_mtx_get(a, rowA, colA, &n1);
- cpx_mtx_get(b, rowB, colB, &n2);
- cpx_mul(&n1, &n1, &n2);
-
- cpx_mtx_set(r, rowR, colR, &n1);
- }
- }
+ #ifdef GPU_ENABLED
+ cl_int err;
+ err = clReleaseCommandQueue(cpx_mtx_command_queue);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU error: clReleaseCommandQueue() failed.\n");
+ }
+ err = clReleaseContext(cpx_mtx_context);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU error: clReleaseContext() failed.\n");
+ }
+ free(cpx_mtx_cache);
+ #endif
}
-void cpx_mtx_print(cpx_mtx_t* m)
+#ifdef GPU_ENABLED
+void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared)
{
- for (size_t r = 0; r < m->rows; r++)
- {
- if (r > 0) printf("\n");
- for (size_t c = 0; c < m->cols; c++)
- {
- cpx_t n;
- cpx_mtx_get(m, r, c, &n);
- uint8_t* s = cpx_str(&n);
- if (c > 0) printf(", ");
- printf("%s", s);
- free(s);
- }
- }
+ int colsA = shared;
+ int rowsB = shared;
+ int rowsR = rowsA;
+ int colsR = colsB;
+
+ //Create buffers
+ size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
+ size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);
+ size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float);
+ cl_int err;
+ cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err);
+ gpuerr(clCreateBuffer);
+ cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err);
+ gpuerr(clCreateBuffer);
+ cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err);
+ gpuerr(clCreateBuffer);
+
+ //Populate buffers
+ err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
+ gpuerr(clEnqueueWriteBuffer);
+ err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL);
+ gpuerr(clEnqueueWriteBuffer);
+
+ //Load and compile program
+ cl_program program;
+ if (cpx_mtx_cache == NULL)
+ {
+ program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel_cl}, NULL, &err);
+ gpuerr(clCreateProgramWithSource);
+ err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
+ size_t log_size;
+ clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+ char* log = malloc(log_size);
+ clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
+ printf("%s", log);
+ free(log);
+ exit(1);
+ }
+ err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL);
+ gpuerr(clGetProgramInfo);
+ cpx_mtx_cache = malloc(cpx_mtx_cache_len);
+ clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL);
+ gpuerr(clGetProgramInfo);
+ }
+ else
+ {
+ program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err);
+ gpuerr(clCreateProgramWithBinary);
+ }
+
+ //Setup kernel
+ cl_kernel kernel = clCreateKernel(program, "kernel_dot", &err); gpuerr(clCreateKernel);
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 4, sizeof(int), &colsB); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 5, sizeof(int), &shared); gpuerr(clSetKernelArg);
+
+ //Run the program
+ size_t work_size[] = {rowsR, colsR};
+ err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
+ gpuerr(clEnqueueNDRangeKernel);
+
+ //Wait for completion
+ err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush);
+ err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish);
+
+ //Read results
+ err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
+ gpuerr(clEnqueueReadBuffer);
+
+ //Clean up
+ err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
+ err = clReleaseProgram(program); gpuerr(clReleaseProgram);
+ err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
+ err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject);
+ err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
+}
+
+void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB)
+{
+ //Create buffers
+ size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
+ size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);
+ size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float);
+ cl_int err;
+ cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer);
+ cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(clCreateBuffer);
+ cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer);
+
+ //Populate buffers
+ err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
+ gpuerr(clEnqueueWriteBuffer);
+ err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL);
+ gpuerr(clEnqueueWriteBuffer);
+
+ //Load and compile program
+ cl_program program;
+ if (cpx_mtx_cache == NULL)
+ {
+ program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel_cl}, NULL, &err);
+ gpuerr(clCreateProgramWithSource);
+ err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
+ size_t log_size;
+ clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+ char* log = malloc(log_size);
+ clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
+ printf("%s", log);
+ free(log);
+ exit(1);
+ }
+ err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL);
+ gpuerr(clGetProgramInfo);
+ cpx_mtx_cache = malloc(cpx_mtx_cache_len);
+ clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL);
+ gpuerr(clGetProgramInfo);
+ }
+ else
+ {
+ program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err);
+ gpuerr(clCreateProgramWithBinary);
+ }
+
+ //Setup kernel
+ cl_kernel kernel = clCreateKernel(program, "kernel_knk", &err); gpuerr(clCreateKernel);
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 3, sizeof(int), &rowsR); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 4, sizeof(int), &colsR); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 5, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 6, sizeof(int), &colsA); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 7, sizeof(int), &rowsB); gpuerr(clSetKernelArg);
+ err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg);
+
+ //Run the program
+ size_t work_size[] = {rowsR, colsR};
+ err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
+ gpuerr(clEnqueueNDRangeKernel);
+
+ //Wait for completion
+ err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush);
+ err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish);
+
+ //Read results
+ err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
+ gpuerr(clEnqueueReadBuffer);
+
+ //Clean up
+ err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
+ err = clReleaseProgram(program); gpuerr(clReleaseProgram);
+ err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
+ err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject);
+ err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
}
+#endif
#endif
--- /dev/null
+#ifndef __cpx__
+#define __cpx__
+#include <stdint.h>
+#include <stddef.h>
+#include <math.h>
+#include <pthread.h>
+#include <string.h>
+#include "cores.c"
+
+typedef struct
+{
+ float real, imaginary;
+} cpx_t;
+
+typedef struct
+{
+ float *ptr;
+ size_t rows, cols;
+} cpx_mtx_t;
+
+uint8_t* cpx_str(cpx_t* n)
+{
+ uint8_t* r;
+ size_t z;
+
+ float rl = n->real;
+ float ig = n->imaginary >= 0 ? n->imaginary : -(n->imaginary);
+ if (ig == 0)
+ {
+ z = snprintf(NULL, 0, "%f", rl);
+ r = malloc(z + 1);
+ sprintf(r, "%f", rl);
+
+ }
+ else
+ {
+ uint8_t op = n->imaginary >= 0 ? '+' : '-';
+ z = snprintf(NULL, 0, "%f %c %fi", rl, op, ig);
+ r = malloc(z + 1);
+ sprintf(r, "%f %c %fi", rl, op, ig);
+ }
+ return r;
+}
+
+cpx_t cpx_new(float r, float i)
+{
+ cpx_t n;
+ n.real = r;
+ n.imaginary = i;
+ return n;
+}
+
+void cpx_add(cpx_t* r, cpx_t* a, cpx_t* b)
+{
+ r->real = a->real + b->real;
+ r->imaginary = a->imaginary + b->imaginary;
+}
+
+void cpx_sub(cpx_t* r, cpx_t* a, cpx_t* b)
+{
+ r->real = a->real - b->real;
+ r->imaginary = a->imaginary - b->imaginary;
+}
+
+void cpx_mul(cpx_t* r, cpx_t* a, cpx_t* b)
+{
+ //FOIL
+ float first = a->real * b->real; //real
+ float outer = a->real * b->imaginary; //imaginary
+ float inner = a->imaginary * b->real; //imaginary
+ float last = -(a->imaginary * b->imaginary); //real
+ r->real = first + last;
+ r->imaginary = outer + inner;
+}
+
+//non-complex matrix multiply
+// shared = colsA = rowsB
+void cpx_ncpx_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared)
+{
+ size_t colsA = shared;
+ size_t rowsB = shared;
+ size_t rowsR = rowsA;
+ size_t colsR = colsB;
+ for (size_t rowR = 0; rowR < rowsR; rowR++)
+ {
+ for (size_t colR = 0; colR < colsR; colR++)
+ {
+ size_t posR = colR + rowR * colsR;
+ size_t rowA = rowR;
+ size_t colB = colR;
+ ptrR[posR] = 0;
+ for (size_t i = 0; i < shared; i++)
+ {
+ size_t posA = i + rowA * colsA;
+ size_t posB = colB + i * colsB;
+ ptrR[posR] += ptrA[posA] * ptrB[posB];
+ }
+ }
+ }
+}
+
+typedef struct
+{
+ size_t ID;
+ size_t Threads;
+ size_t Last;
+ size_t Loops;
+ size_t Continue;
+ size_t BlockSize;
+ float* ptrR;
+ float* ptrA;
+ float* ptrB;
+ size_t rowsA;
+ size_t colsB;
+ size_t shared;
+} cpx_mul_shared;
+
+void* cpx_ncpx_mmul_mtc(void *context)
+{
+ cpx_mul_shared* data = (cpx_mul_shared*)context;
+ float* ptrR = data->ptrR;
+ float* ptrA = data->ptrA;
+ float* ptrB = data->ptrB;
+ size_t rowsA = data->rowsA;
+ size_t colsB = data->colsB;
+ size_t shared = data->shared;
+
+ size_t colsA = data->shared;
+ size_t rowsB = data->shared;
+ size_t rowsR = data->rowsA;
+ size_t colsR = data->colsB;
+
+ for (size_t rowR = 0; rowR < rowsR; rowR++)
+ {
+ size_t a = data->ID * data->BlockSize;
+ size_t b = (data->ID + 1) * data->BlockSize;
+ if (data->ID == data->Last) b += data->Continue;
+
+ //printf("%i;%i\n", a, b);
+
+ for (size_t colR = a; colR < b; colR++)
+ {
+ size_t posR = colR + rowR * colsR;
+ size_t rowA = rowR;
+ size_t colB = colR;
+ ptrR[posR] = 0;
+ for (size_t i = 0; i < data->shared; i++)
+ {
+ size_t posA = i + rowA * colsA;
+ size_t posB = colB + i * colsB;
+ data->ptrR[posR] += data->ptrA[posA] * data->ptrB[posB];
+ }
+ }
+ }
+}
+
+void cpx_ncpx_mmul_mt(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared)
+{
+ cpx_mul_shared share;
+ share.Threads = get_core_count();
+ share.ptrR = ptrR;
+ share.ptrA = ptrA;
+ share.ptrB = ptrB;
+ share.rowsA = rowsA;
+ share.colsB = colsB;
+ share.shared = shared;
+ if (colsB <= share.Threads)
+ {
+ share.Threads = colsB;
+ }
+ share.BlockSize = (size_t)floor(((float)colsB) / ((float)share.Threads));
+ share.Loops = (size_t)floor(((float)colsB) / ((float)share.BlockSize));
+ share.Last = share.Loops - 1;
+ share.Continue = (size_t)(((float)colsB) - ((float)share.Loops) * ((float)share.BlockSize));
+
+ pthread_t threads[share.Loops];
+ cpx_mul_shared contexts[share.Loops];
+ for (size_t i = 0; i < share.Loops; i++)
+ {
+ pthread_t tid;
+ threads[i] = tid;
+ memcpy(contexts + i, &share, sizeof(cpx_mul_shared));
+ contexts[i].ID = i;
+ }
+
+ for (size_t i = 0; i < share.Loops; i++)
+ {
+ if (pthread_create(threads + i, NULL, &cpx_ncpx_mmul_mtc, contexts + i))
+ {
+ fprintf(stderr, "QAnsel: Thread error. (1)\n");
+ exit(1);
+ }
+ }
+
+ for (uint32_t i = 0; i < share.Loops; i++)
+ {
+ pthread_t tid;
+ tid = threads[i];
+ if (pthread_join(tid, NULL))
+ {
+ fprintf(stderr, "QAnsel: Thread error. (2)\n");
+ }
+ }
+}
+
+//non-complex kronecker product
+void cpx_ncpx_mknk(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB)
+{
+ size_t rowsR = rowsA * rowsB;
+ size_t colsR = colsA * colsB;
+ for (size_t rowR = 0; rowR < rowsR; rowR++)
+ {
+ for (size_t colR = 0; colR < colsR; colR++)
+ {
+ size_t rowA = rowR / rowsB;
+ size_t colA = colR / colsB;
+ size_t rowB = rowR % rowsB;
+ size_t colB = colR % colsB;
+ printf(">%i,%i|%i<\n", colR, rowR, colR + rowR * colsR);
+ ptrR[colR + rowR * colsR] =
+ ptrA[colA + rowA * colsA]
+ * ptrB[colB + rowB * colsB]
+ ;
+ }
+ }
+}
+
+float cpx_magsqr(cpx_t* n)
+{
+ return (n->real * n->real) + (n->imaginary * n->imaginary);
+}
+
+float cpx_mag(cpx_t* n)
+{
+ return sqrt((n->real * n->real) + (n->imaginary * n->imaginary));
+}
+
+void cpx_mtx_mul(cpx_mtx_t* r, cpx_mtx_t* a, cpx_mtx_t* b)
+{
+ r->rows = a->rows;
+ r->cols = b->cols;
+ cpx_ncpx_mmul(r->ptr, a->ptr, b->ptr, a->rows * 2, b->cols * 2, a->cols * 2);
+}
+
+void cpx_mtx_set(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n)
+{
+ row *= 2;
+ col *= 2;
+ size_t cols = m->cols * 2;
+ m->ptr[col + row * cols] = n->real;
+ m->ptr[(col + 1) + row * cols] = -(n->imaginary);
+ m->ptr[col + (row + 1) * cols] = n->imaginary;
+ m->ptr[(col + 1) + (row + 1) * cols] = n->real;
+}
+
+void cpx_mtx_set2(cpx_mtx_t* m, size_t row, size_t col, float real, float imaginary)
+{
+ row *= 2;
+ col *= 2;
+ size_t cols = m->cols * 2;
+ m->ptr[col + row * cols] = real;
+ m->ptr[(col + 1) + row * cols] = -imaginary;
+ m->ptr[col + (row + 1) * cols] = imaginary;
+ m->ptr[(col + 1) + (row + 1) * cols] = real;
+}
+
+void cpx_mtx_get(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n)
+{
+ row *= 2;
+ col *= 2;
+ size_t cols = m->cols * 2;
+
+ //printf("qqq\n");
+ n->real = m->ptr[(col + 1) + (row + 1) * cols];
+ //printf("ppp\n");
+ n->imaginary = m->ptr[col + (row + 1) * cols];
+ //printf("ggg\n");
+}
+
+float cpx_mtx_get_real(cpx_mtx_t* m, size_t row, size_t col)
+{
+ row *= 2;
+ col *= 2;
+ size_t cols = m->cols * 2;
+ return m->ptr[(col + 1) + (row + 1) * cols];
+}
+
+float cpx_mtx_get_imaginary(cpx_mtx_t* m, size_t row, size_t col)
+{
+ row *= 2;
+ col *= 2;
+ size_t cols = m->cols * 2;
+ return m->ptr[col + (row + 1) * cols];
+}
+
+void cpx_mtx_init(cpx_mtx_t* m, size_t rows, size_t cols)
+{
+ m->rows = rows;
+ m->cols = cols;
+ size_t z = (rows * 2) * (cols * 2);
+ m->ptr = malloc(z * sizeof(float));
+ for (size_t i = 0; i < z; i++) m->ptr[i] = 0;
+}
+
+void cpx_mtx_expand_row(cpx_mtx_t* m)
+{
+ m->rows += 1;
+ size_t z = (m->rows * 2) * (m->cols * 2);
+ m->ptr = realloc(m->ptr, z * sizeof(float));
+}
+
+
+void cpx_mtx_free(cpx_mtx_t* m)
+{
+ if (m->ptr != NULL) free(m->ptr);
+ m->rows = 0;
+ m->cols = 0;
+}
+
+typedef struct
+{
+ size_t ID;
+ size_t Threads;
+ size_t Last;
+ size_t Loops;
+ size_t Continue;
+ size_t BlockSize;
+ float* ptrR;
+ size_t rowsR;
+ size_t colsR;
+ float* ptrA;
+ size_t rowsA;
+ size_t colsA;
+ float* ptrB;
+ size_t rowsB;
+ size_t colsB;
+} cpx_knk_shared;
+
+void* cpx_ncpx_knk_mtc(void *context)
+{
+ cpx_knk_shared* data = (cpx_knk_shared*)context;
+ float* ptrR = data->ptrR;
+ size_t rowsR = data->rowsR;
+ size_t colsR = data->colsR;
+ float* ptrA = data->ptrA;
+ size_t rowsA = data->rowsA;
+ size_t colsA = data->colsA;
+ float* ptrB = data->ptrB;
+ size_t rowsB = data->rowsB;
+ size_t colsB = data->colsB;
+
+ for (size_t rowR = 0; rowR < rowsR; rowR++)
+ {
+ size_t a = data->ID * data->BlockSize;
+ size_t b = (data->ID + 1) * data->BlockSize;
+ if (data->ID == data->Last) b += data->Continue;
+ for (size_t colR = a; colR < b; colR++)
+ {
+ size_t rowA = rowR / rowsB;
+ size_t colA = colR / colsB;
+ size_t rowB = rowR % rowsB;
+ size_t colB = colR % colsB;
+
+ float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
+ float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
+ float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
+ float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
+
+ float first = r1 * r2; //real
+ float outer = r1 * i2; //imaginary
+ float inner = i1 * r2; //imaginary
+ float last = -(i1 * i2); //real
+ r1 = first + last;
+ i1 = outer + inner;
+
+ ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
+ ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
+ ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
+ ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
+
+ }
+ }
+}
+
+void cpx_ncpx_knk_mt
+(
+ float* ptrR,
+ size_t rowsR,
+ size_t colsR,
+ float* ptrA,
+ size_t rowsA,
+ size_t colsA,
+ float* ptrB,
+ size_t rowsB,
+ size_t colsB
+)
+{
+ cpx_knk_shared share;
+ share.Threads = get_core_count();
+ share.ptrR = ptrR;
+ share.rowsR = rowsR;
+ share.colsR = colsR;
+ share.ptrA = ptrA;
+ share.rowsA = rowsA;
+ share.colsA = colsA;
+ share.ptrB = ptrB;
+ share.rowsB = rowsB;
+ share.colsB = colsB;
+
+ if (colsR <= share.Threads)
+ {
+ share.Threads = colsR;
+ }
+ share.BlockSize = (size_t)floor(((float)colsR) / ((float)share.Threads));
+ share.Loops = (size_t)floor(((float)colsR) / ((float)share.BlockSize));
+ share.Last = share.Loops - 1;
+ share.Continue = (size_t)(((float)colsR) - ((float)share.Loops) * ((float)share.BlockSize));
+
+ pthread_t threads[share.Loops];
+ cpx_knk_shared contexts[share.Loops];
+ for (size_t i = 0; i < share.Loops; i++)
+ {
+ pthread_t tid;
+ threads[i] = tid;
+ memcpy(contexts + i, &share, sizeof(cpx_knk_shared));
+ contexts[i].ID = i;
+ }
+
+ for (size_t i = 0; i < share.Loops; i++)
+ {
+ if (pthread_create(threads + i, NULL, &cpx_ncpx_knk_mtc, contexts + i))
+ {
+ fprintf(stderr, "QAnsel: Thread error. (1)\n");
+ exit(1);
+ }
+ }
+
+ for (uint32_t i = 0; i < share.Loops; i++)
+ {
+ pthread_t tid;
+ tid = threads[i];
+ if (pthread_join(tid, NULL))
+ {
+ fprintf(stderr, "QAnsel: Thread error. (2)\n");
+ }
+ }
+}
+
+void cpx_ncpx_knk
+(
+ float* ptrR,
+ size_t rowsR,
+ size_t colsR,
+ float* ptrA,
+ size_t rowsA,
+ size_t colsA,
+ float* ptrB,
+ size_t rowsB,
+ size_t colsB
+)
+{
+ for (size_t rowR = 0; rowR < rowsR; rowR++)
+ {
+ for (size_t colR = 0; colR < colsR; colR++)
+ {
+ size_t rowA = rowR / rowsB;
+ size_t colA = colR / colsB;
+ size_t rowB = rowR % rowsB;
+ size_t colB = colR % colsB;
+
+ float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
+ float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
+ float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
+ float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
+
+ float first = r1 * r2; //real
+ float outer = r1 * i2; //imaginary
+ float inner = i1 * r2; //imaginary
+ float last = -(i1 * i2); //real
+ r1 = first + last;
+ i1 = outer + inner;
+
+ ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
+ ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
+ ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
+ ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
+
+ }
+ }
+}
+
+void cpx_mtx_knk(cpx_mtx_t* r, cpx_mtx_t* a, cpx_mtx_t* b)
+{
+ size_t rowsA = a->rows;
+ size_t colsA = a->cols;
+ size_t rowsB = b->rows;
+ size_t colsB = b->cols;
+ size_t rowsR = rowsA * rowsB;
+ size_t colsR = colsA * colsB;
+ for (size_t rowR = 0; rowR < rowsR; rowR++)
+ {
+ for (size_t colR = 0; colR < colsR; colR++)
+ {
+ size_t rowA = rowR / rowsB;
+ size_t colA = colR / colsB;
+ size_t rowB = rowR % rowsB;
+ size_t colB = colR % colsB;
+ cpx_t n1, n2;
+
+ cpx_mtx_get(a, rowA, colA, &n1);
+ cpx_mtx_get(b, rowB, colB, &n2);
+ cpx_mul(&n1, &n1, &n2);
+
+ cpx_mtx_set(r, rowR, colR, &n1);
+ }
+ }
+}
+
+void cpx_mtx_print(cpx_mtx_t* m)
+{
+ for (size_t r = 0; r < m->rows; r++)
+ {
+ if (r > 0) printf("\n");
+ for (size_t c = 0; c < m->cols; c++)
+ {
+ cpx_t n;
+ cpx_mtx_get(m, r, c, &n);
+ uint8_t* s = cpx_str(&n);
+ if (c > 0) printf(", ");
+ printf("%s", s);
+ free(s);
+ }
+ }
+}
+
+#endif
+++ /dev/null
-#ifndef __cpx__
-#define __cpx__
-#include <stdint.h>
-#include <stddef.h>
-#include <math.h>
-#include <pthread.h>
-#include <string.h>
-#include "cores.c"
-#define GPU_ENABLED
-typedef struct
-{
- float real, imaginary;
-} cpx_t;
-
-typedef struct
-{
- float *ptr;
- int rows, cols;
-} cpx_mtx_t;
-
-uint8_t* cpx_str(cpx_t* n)
-{
- uint8_t* r;
- int z;
-
- float rl = n->real;
- float ig = n->imaginary >= 0 ? n->imaginary : -(n->imaginary);
- if (ig == 0)
- {
- z = snprintf(NULL, 0, "%f", rl);
- r = malloc(z + 1);
- sprintf(r, "%f", rl);
-
- }
- else
- {
- uint8_t op = n->imaginary >= 0 ? '+' : '-';
- z = snprintf(NULL, 0, "%f %c %fi", rl, op, ig);
- r = malloc(z + 1);
- sprintf(r, "%f %c %fi", rl, op, ig);
- }
- return r;
-}
-
-cpx_t cpx_new(float r, float i)
-{
- cpx_t n;
- n.real = r;
- n.imaginary = i;
- return n;
-}
-
-void cpx_mul(cpx_t* r, cpx_t* a, cpx_t* b)
-{
- //FOIL
- float first = a->real * b->real; //real
- float outer = a->real * b->imaginary; //imaginary
- float inner = a->imaginary * b->real; //imaginary
- float last = -(a->imaginary * b->imaginary); //real
- r->real = first + last;
- r->imaginary = outer + inner;
-}
-
-float cpx_magsqr(cpx_t* n)
-{
- return (n->real * n->real) + (n->imaginary * n->imaginary);
-}
-
-float cpx_mag(cpx_t* n)
-{
- return sqrt((n->real * n->real) + (n->imaginary * n->imaginary));
-}
-
-void cpx_mtx_set(cpx_mtx_t* m, int row, int col, cpx_t* n)
-{
- m->ptr[row * ((m->cols) * 2) + (col * 2)] = n->real;
- m->ptr[row * ((m->cols) * 2) + (col * 2) + 1] = n->imaginary;
-}
-
-void cpx_mtx_set2(cpx_mtx_t* m, int row, int col, float real, float imaginary)
-{
- m->ptr[row * ((m->cols) * 2) + (col * 2)] = real;
- m->ptr[row * ((m->cols) * 2) + (col * 2) + 1] = imaginary;
-}
-
-void cpx_mtx_get(cpx_mtx_t* m, int row, int col, cpx_t* n)
-{
- n->real = m->ptr[row * ((m->cols) * 2) + (col * 2)];
- n->imaginary = m->ptr[row * ((m->cols) * 2) + (col * 2) + 1];
-}
-
-void cpx_mtx_init(cpx_mtx_t* m, int rows, int cols)
-{
- int z = rows * (cols * 2) * sizeof(float);
- m->ptr = malloc(z);
- m->rows = rows;
- m->cols = cols;
- memset(m->ptr, 0, z);
-}
-
-void cpx_mtx_free(cpx_mtx_t* m)
-{
- if (m->ptr != NULL) free(m->ptr);
- m->rows = 0;
- m->cols = 0;
-}
-
-void cpx_mtx_print(cpx_mtx_t* m)
-{
- for (int r = 0; r < m->rows; r++)
- {
- if (r > 0) printf("\n");
- for (int c = 0; c < m->cols; c++)
- {
- cpx_t n;
- cpx_mtx_get(m, r, c, &n);
- uint8_t* s = cpx_str(&n);
- if (c > 0) printf(", ");
- printf("%s", s);
- free(s);
- }
- }
-}
-
-//This is for testing GPU functions on the CPU
-#define __kernel
-#define __global
-int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2;
-int get_global_id(int id)
-{
- switch (id)
- {
- case 0: return GPU_GLOBAL_ID_0;
- case 1: return GPU_GLOBAL_ID_1;
- case 2: return GPU_GLOBAL_ID_2;
- }
-}
-#include "kernel2.cl"
-
-void cpx_mtx_dot(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared)
-{
- for (int i = 0; i < rowsA; i++)
- {
- for (int j = 0; j < colsB; j++)
- {
- GPU_GLOBAL_ID_0 = i;
- GPU_GLOBAL_ID_1 = j;
- kernel_dot(ptrR, ptrA, ptrB, rowsA, colsB, shared);
- }
- }
-}
-
-void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB)
-{
- for (int i = 0; i < rowsR; i++)
- {
- for (int j = 0; j < colsR; j++)
- {
- GPU_GLOBAL_ID_0 = i;
- GPU_GLOBAL_ID_1 = j;
- kernel_knk(ptrR, ptrA, ptrB, rowsR, colsR, rowsA, colsA, rowsB, colsB);
- }
- }
-}
-
-#ifdef GPU_ENABLED
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#define CL_TARGET_OPENCL_VERSION 120
-#include <CL/cl.h>
-#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: " #x "() failed.\n"); exit(1); }
-#include "kernel2.cl.c"
-cl_platform_id cpx_mtx_platform_id;
-cl_device_id cpx_mtx_device_id;
-cl_context cpx_mtx_context;
-cl_command_queue cpx_mtx_command_queue;
-unsigned char* cpx_mtx_cache = NULL;
-size_t cpx_mtx_cache_len = 0;
-#endif
-
-uint8_t cpx_mtx_begin()
-{
- #ifdef GPU_ENABLED
- cl_uint count;
- cl_int err;
-
- err = clGetPlatformIDs(1, &cpx_mtx_platform_id, &count);
- if (err != CL_SUCCESS || count == 0)
- {
- if (err == 0)
- fprintf(stderr, "GPU error: No supported platforms found.\n");
- else
- fprintf(stderr, "GPU error: clGetPlatformIDs() failed.\n");
- return 0;
- }
-
- err = clGetDeviceIDs(cpx_mtx_platform_id, CL_DEVICE_TYPE_GPU, 1, &cpx_mtx_device_id, &count);
- if (err != CL_SUCCESS || count == 0)
- {
- if (count == 0)
- fprintf(stderr, "GPU error: No supported GPUs found.\n");
- else
- fprintf(stderr, "GPU error: clGetDeviceIDs() failed.\n");
- return 0;
- }
-
- cpx_mtx_context = clCreateContext(NULL, 1, &cpx_mtx_device_id, NULL, NULL, &err);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU error: clCreateContext() failed.\n");
- return 0;
- }
-
- cpx_mtx_command_queue = clCreateCommandQueue(cpx_mtx_context, cpx_mtx_device_id, 0, &err);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU error: clCreateCommandQueue() failed.\n");
- err = clReleaseContext(cpx_mtx_context);
- if (err != CL_SUCCESS)
- fprintf(stderr, "GPU error: clReleaseContext() failed.\n");
- return 0;
- }
- #endif
- return 1;
-}
-
-void cpx_mtx_clean()
-{
- #ifdef GPU_ENABLED
- cl_int err;
- err = clReleaseCommandQueue(cpx_mtx_command_queue);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU error: clReleaseCommandQueue() failed.\n");
- }
- err = clReleaseContext(cpx_mtx_context);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU error: clReleaseContext() failed.\n");
- }
- free(cpx_mtx_cache);
- #endif
-}
-
-#ifdef GPU_ENABLED
-void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared)
-{
- int colsA = shared;
- int rowsB = shared;
- int rowsR = rowsA;
- int colsR = colsB;
-
- //Create buffers
- size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
- size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);
- size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float);
- cl_int err;
- cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err);
- gpuerr(clCreateBuffer);
- cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err);
- gpuerr(clCreateBuffer);
- cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err);
- gpuerr(clCreateBuffer);
-
- //Populate buffers
- err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
- err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
-
- //Load and compile program
- cl_program program;
- if (cpx_mtx_cache == NULL)
- {
- program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel2_cl}, NULL, &err);
- gpuerr(clCreateProgramWithSource);
- err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
- size_t log_size;
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- char* log = malloc(log_size);
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
- printf("%s", log);
- free(log);
- exit(1);
- }
- err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL);
- gpuerr(clGetProgramInfo);
- cpx_mtx_cache = malloc(cpx_mtx_cache_len);
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL);
- gpuerr(clGetProgramInfo);
- }
- else
- {
- program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err);
- gpuerr(clCreateProgramWithBinary);
- }
-
- //Setup kernel
- cl_kernel kernel = clCreateKernel(program, "kernel_dot", &err); gpuerr(clCreateKernel);
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 4, sizeof(int), &colsB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 5, sizeof(int), &shared); gpuerr(clSetKernelArg);
-
- //Run the program
- size_t work_size[] = {rowsR, colsR};
- err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
- gpuerr(clEnqueueNDRangeKernel);
-
- //Wait for completion
- err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush);
- err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish);
-
- //Read results
- err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
- gpuerr(clEnqueueReadBuffer);
-
- //Clean up
- err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
- err = clReleaseProgram(program); gpuerr(clReleaseProgram);
- err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
-}
-
-void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB)
-{
- //Create buffers
- size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float);
- size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float);
- size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float);
- cl_int err;
- cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer);
- cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(clCreateBuffer);
- cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer);
-
- //Populate buffers
- err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
- err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
-
- //Load and compile program
- cl_program program;
- if (cpx_mtx_cache == NULL)
- {
- program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel2_cl}, NULL, &err);
- gpuerr(clCreateProgramWithSource);
- err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
- size_t log_size;
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- char* log = malloc(log_size);
- clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
- printf("%s", log);
- free(log);
- exit(1);
- }
- err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL);
- gpuerr(clGetProgramInfo);
- cpx_mtx_cache = malloc(cpx_mtx_cache_len);
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL);
- gpuerr(clGetProgramInfo);
- }
- else
- {
- program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err);
- gpuerr(clCreateProgramWithBinary);
- }
-
- //Setup kernel
- cl_kernel kernel = clCreateKernel(program, "kernel_knk", &err); gpuerr(clCreateKernel);
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 3, sizeof(int), &rowsR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 4, sizeof(int), &colsR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 5, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 6, sizeof(int), &colsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 7, sizeof(int), &rowsB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg);
-
- //Run the program
- size_t work_size[] = {rowsR, colsR};
- err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
- gpuerr(clEnqueueNDRangeKernel);
-
- //Wait for completion
- err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush);
- err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish);
-
- //Read results
- err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL);
- gpuerr(clEnqueueReadBuffer);
-
- //Clean up
- err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
- err = clReleaseProgram(program); gpuerr(clReleaseProgram);
- err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
-}
-#endif
-
-#endif
float Identity[] =
{
- 1, 0, 0, 0,
- 0, 1, 0, 0,
-
- 0, 0, 1, 0,
- 0, 0, 0, 1
+ 1, 0, 0, 0,
+ 0, 0, 1, 0,
};
float PauliX[] =
{
- 0, 0, 1, 0,
- 0, 0, 0, 1,
-
- 1, 0, 0, 0,
- 0, 1, 0, 0
+ 0, 0, 1, 0,
+ 1, 0, 0, 0,
};
float PauliY[] =
{
0, 0, 0, 1,
- 0, 0, -1, 0,
-
0,-1, 0, 0,
- 1, 0, 0, 0,
};
float PauliZ[] =
{
- 1, 0, 0, 0,
- 0, 1, 0, 0,
-
+ 1, 0, 0, 0,
0, 0, -1, 0,
- 0, 0, 0,-1
};
// 1/sqrt(2)
float Hadamard[] =
{
R, 0, R, 0,
- 0, R, 0, R,
-
R, 0, -R, 0,
- 0, R, 0,-R,
};
float PhaseS[] =
{
1, 0, 0, 0,
- 0, 1, 0, 0,
-
0, 0, 0,-1,
- 0, 0, 1, 0
};
// 1/sqrt(2) + 1/sqrt(2)i
float PhaseT[] =
{
1, 0, 0, 0,
- 0, 1, 0, 0,
-
0, 0, R,-R,
- 0, 0, R, R
};
float ControlledNOT[] =
{
1, 0, 0, 0, 0, 0, 0, 0,
- 0, 1, 0, 0, 0, 0, 0, 0,
-
0, 0, 1, 0, 0, 0, 0, 0,
- 0, 0, 0, 1, 0, 0, 0, 0,
-
0, 0, 0, 0, 0, 0, 1, 0,
- 0, 0, 0, 0, 0, 0, 0, 1,
-
0, 0, 0, 0, 1, 0, 0, 0,
- 0, 0, 0, 0, 0, 1, 0, 0,
};
\ No newline at end of file
+++ /dev/null
-
-float Identity[] =
-{
- 1, 0, 0, 0,
- 0, 0, 1, 0,
-};
-
-float PauliX[] =
-{
- 0, 0, 1, 0,
- 1, 0, 0, 0,
-};
-
-float PauliY[] =
-{
- 0, 0, 0, 1,
- 0,-1, 0, 0,
-};
-
-float PauliZ[] =
-{
- 1, 0, 0, 0,
- 0, 0, -1, 0,
-};
-
-// 1/sqrt(2)
-#define R 0.7071067811865475
-float Hadamard[] =
-{
- R, 0, R, 0,
- R, 0, -R, 0,
-};
-
-float PhaseS[] =
-{
- 1, 0, 0, 0,
- 0, 0, 0,-1,
-};
-
-// 1/sqrt(2) + 1/sqrt(2)i
-float PhaseT[] =
-{
- 1, 0, 0, 0,
- 0, 0, R,-R,
-};
-
-float ControlledNOT[] =
-{
- 1, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 1, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 1, 0,
- 0, 0, 0, 0, 1, 0, 0, 0,
-};
\ No newline at end of file
+++ /dev/null
-#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: " #x "() failed.\n"); exit(1); }
-#include "kernel.cl.c"
-cl_platform_id GPU_platform_id;
-cl_device_id GPU_device_id;
-cl_context GPU_context;
-cl_command_queue GPU_command_queue;
-unsigned char* GPU_cache = NULL;
-size_t GPU_cache_len = 0;
-
-uint8_t GPU_init()
-{
- cl_uint count;
- cl_int err;
-
- err = clGetPlatformIDs(1, &GPU_platform_id, &count);
- if (err != CL_SUCCESS || count == 0)
- {
- if (err == 0)
- fprintf(stderr, "GPU error: No supported platforms found.\n");
- else
- fprintf(stderr, "GPU error: clGetPlatformIDs() failed.\n");
- return 0;
- }
-
- err = clGetDeviceIDs(GPU_platform_id, CL_DEVICE_TYPE_GPU, 1, &GPU_device_id, &count);
- if (err != CL_SUCCESS || count == 0)
- {
- if (count == 0)
- fprintf(stderr, "GPU error: No supported GPUs found.\n");
- else
- fprintf(stderr, "GPU error: clGetDeviceIDs() failed.\n");
- return 0;
- }
-
- GPU_context = clCreateContext(NULL, 1, &GPU_device_id, NULL, NULL, &err);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU error: clCreateContext() failed.\n");
- return 0;
- }
-
- GPU_command_queue = clCreateCommandQueue(GPU_context, GPU_device_id, 0, &err);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU error: clCreateCommandQueue() failed.\n");
- err = clReleaseContext(GPU_context);
- if (err != CL_SUCCESS)
- fprintf(stderr, "GPU error: clReleaseContext() failed.\n");
- return 0;
- }
-
- return 1;
-}
-
-void GPU_clean()
-{
- cl_int err;
- err = clReleaseCommandQueue(GPU_command_queue);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU error: clReleaseCommandQueue() failed.\n");
- }
- err = clReleaseContext(GPU_context);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU error: clReleaseContext() failed.\n");
- }
- free(GPU_cache);
-}
-
-void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared)
-{
- //Create buffers
- size_t sizeA = rowsA * shared;
- size_t sizeB = shared * colsB;
- size_t sizeR = rowsA * colsB;
- cl_int err;
- cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeA, NULL, &err);
- gpuerr(clCreateBuffer);
- cl_mem memB = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeB, NULL, &err);
- gpuerr(clCreateBuffer);
- cl_mem memR = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(float) * sizeR, NULL, &err);
- gpuerr(clCreateBuffer);
-
- //Populate buffers
- err = clEnqueueWriteBuffer(GPU_command_queue, memA, CL_TRUE, 0, sizeof(float) * sizeA, ptrA, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
- err = clEnqueueWriteBuffer(GPU_command_queue, memB, CL_TRUE, 0, sizeof(float) * sizeB, ptrB, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
-
- //Load and compile program
- cl_program program;
- if (GPU_cache == NULL)
- {
- program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_kernel_cl}, NULL, &err);
- gpuerr(clCreateProgramWithSource);
- err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
- size_t log_size;
- clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- char* log = malloc(log_size);
- clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
- printf("%s", log);
- free(log);
- exit(1);
- }
- err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &GPU_cache_len, NULL);
- gpuerr(clGetProgramInfo);
- GPU_cache = malloc(GPU_cache_len);
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &GPU_cache, NULL);
- gpuerr(clGetProgramInfo);
- }
- else
- {
- program = clCreateProgramWithBinary(GPU_context, 1, &GPU_device_id, &GPU_cache_len, (const unsigned char**)&GPU_cache, NULL, &err);
- gpuerr(clCreateProgramWithBinary);
- }
-
- //Setup kernel
- cl_kernel kernel = clCreateKernel(program, "gpu_mmul", &err); gpuerr(clCreateKernel);
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 4, sizeof(int), &colsB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 5, sizeof(int), &shared); gpuerr(clSetKernelArg);
-
- //Run the program
- size_t work_size[] = {rowsA, colsB};
- err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
- gpuerr(clEnqueueNDRangeKernel);
-
- //Wait for completion
- err = clFlush(GPU_command_queue); gpuerr(clFlush);
- err = clFinish(GPU_command_queue); gpuerr(clFinish);
-
- //Read results
- err = clEnqueueReadBuffer(GPU_command_queue, memR, CL_TRUE, 0, sizeof(float) * sizeR, ptrR, 0, NULL, NULL);
- gpuerr(clEnqueueReadBuffer);
-
- //Clean up
- err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
- err = clReleaseProgram(program); gpuerr(clReleaseProgram);
- err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
-}
-
-void GPU_knk(float* ptrR, int rowsR, int colsR, float* ptrA, int rowsA, int colsA, float* ptrB, int rowsB, int colsB)
-{
- //Create buffers
- size_t sizeA = (rowsA * 2) * (colsA * 2);
- size_t sizeB = (rowsB * 2) * (colsB * 2);
- size_t sizeR = (rowsR * 2) * (colsR * 2);
- cl_int err;
- cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeA, NULL, &err);
- gpuerr(clCreateBuffer);
- cl_mem memB = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeB, NULL, &err);
- gpuerr(clCreateBuffer);
- cl_mem memR = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(float) * sizeR, NULL, &err);
- gpuerr(clCreateBuffer);
-
- //Populate buffers
- err = clEnqueueWriteBuffer(GPU_command_queue, memA, CL_TRUE, 0, sizeof(float) * sizeA, ptrA, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
- err = clEnqueueWriteBuffer(GPU_command_queue, memB, CL_TRUE, 0, sizeof(float) * sizeB, ptrB, 0, NULL, NULL);
- gpuerr(clEnqueueWriteBuffer);
-
- //Load and compile program
- cl_program program;
- if (GPU_cache == NULL)
- {
- program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_kernel_cl}, NULL, &err);
- gpuerr(clCreateProgramWithSource);
- err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL);
- if (err != CL_SUCCESS)
- {
- fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n");
- size_t log_size;
- clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
- char* log = malloc(log_size);
- clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
- printf("%s", log);
- free(log);
- exit(1);
- }
- err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &GPU_cache_len, NULL); gpuerr(clGetProgramInfo);
- GPU_cache = malloc(GPU_cache_len);
- clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &GPU_cache, NULL); gpuerr(clGetProgramInfo);
- }
- else
- {
- program = clCreateProgramWithBinary(GPU_context, 1, &GPU_device_id, &GPU_cache_len, (const unsigned char**)&GPU_cache, NULL, &err);
- gpuerr(clCreateProgramWithBinary);
- }
-
- //Setup kernel
- cl_kernel kernel = clCreateKernel(program, "gpu_knk", &err); gpuerr(clCreateKernel);
- err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 3, sizeof(int), &rowsR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 4, sizeof(int), &colsR); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 5, sizeof(int), &rowsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 6, sizeof(int), &colsA); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 7, sizeof(int), &rowsB); gpuerr(clSetKernelArg);
- err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg);
-
- //Run the program
- size_t work_size[] = {rowsR, colsR};
- err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL);
- gpuerr(clEnqueueNDRangeKernel);
-
- //Wait for completion
- err = clFlush(GPU_command_queue); gpuerr(clFlush);
- err = clFinish(GPU_command_queue); gpuerr(clFinish);
-
- //Read results
- err = clEnqueueReadBuffer(GPU_command_queue, memR, CL_TRUE, 0, sizeof(float) * sizeR, ptrR, 0, NULL, NULL);
- gpuerr(clEnqueueReadBuffer);
-
- //Clean up
- err = clReleaseKernel(kernel); gpuerr(clReleaseKernel);
- err = clReleaseProgram(program); gpuerr(clReleaseProgram);
- err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject);
- err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject);
-}
\ No newline at end of file
+++ /dev/null
-//This is for testing GPU functions on the CPU
-#define __kernel
-#define __global
-int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2;
-int get_global_id(int id)
-{
- switch (id)
- {
- case 0: return GPU_GLOBAL_ID_0;
- case 1: return GPU_GLOBAL_ID_1;
- case 2: return GPU_GLOBAL_ID_2;
- }
-}
-#include "gpu_mmul.cl"
-#include "gpu_knk.cl"
-
-void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared)
-{
- for (int i = 0; i < rowsA; i++)
- {
- for (int j = 0; j < colsB; j++)
- {
- GPU_GLOBAL_ID_0 = i;
- GPU_GLOBAL_ID_1 = j;
- gpu_mmul(ptrR, ptrA, ptrB, rowsA, colsB, shared);
- }
- }
-}
-
-void GPU_knk(float* ptrR, int rowsR, int colsR, float* ptrA, int rowsA, int colsA, float* ptrB, int rowsB, int colsB)
-{
- for (int i = 0; i < rowsR; i++)
- {
- for (int j = 0; j < colsR; j++)
- {
- GPU_GLOBAL_ID_0 = i;
- GPU_GLOBAL_ID_1 = j;
- gpu_knk(ptrR, rowsR, colsR, ptrA, rowsA, colsA, ptrB, rowsB, colsB);
- }
- }
-}
-
-uint8_t GPU_init()
-{
- return 1;
-}
-
-void GPU_clean() {}
+++ /dev/null
-__kernel void gpu_mmul
-(
- __global float* ptrR,
- __global float* ptrA,
- __global float* ptrB,
- const int rowsA,
- const int colsB,
- const int shared
-)
-{
- const int colsA = shared;
- const int rowsB = shared;
- const int rowsR = rowsA;
- const int colsR = colsB;
- const int rowR = get_global_id(0);
- const int colR = get_global_id(1);
- int posA, posB;
- float sum = 0;
-
- const int posR = colR + rowR * colsR;
-
- for (int i = 0; i < shared; i++)
- {
- int posA = i + rowR * colsA;
- int posB = colR + i * colsB;
- sum += ptrA[posA] * ptrB[posB];
- }
- ptrR[rowR * colsR + colR] = sum;
-}
-
-__kernel void gpu_knk
-(
- __global float* ptrR,
- __global float* ptrA,
- __global float* ptrB,
- const int rowsR,
- const int colsR,
- const int rowsA,
- const int colsA,
- const int rowsB,
- const int colsB
-)
-{
- int rowR = get_global_id(0);
- int colR = get_global_id(1);
-
- int rowA = rowR / rowsB;
- int colA = colR / colsB;
- int rowB = rowR % rowsB;
- int colB = colR % colsB;
-
- float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)];
- float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)];
- float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)];
- float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)];
-
- float first = r1 * r2; //real
- float outer = r1 * i2; //imaginary
- float inner = i1 * r2; //imaginary
- float last = -(i1 * i2); //real
- r1 = first + last;
- i1 = outer + inner;
-
- ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1;
- ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1;
- ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1;
- ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1;
-}
-
--- /dev/null
+__kernel void kernel_dot
+(
+ __global float* ptrR,
+ __global float* ptrA,
+ __global float* ptrB,
+ const int rowsA,
+ const int colsB,
+ const int shared
+)
+{
+ const int colsA = shared;
+ const int rowsB = shared;
+ const int rowsR = rowsA;
+ const int colsR = colsB;
+ const int rowR = get_global_id(0);
+ const int colR = get_global_id(1);
+ int posA, posB;
+ float rR = 0;
+ float iR = 0;
+
+ const int posR = rowR * (colsR * 2) + (colR * 2);
+
+ for (int i = 0; i < shared; i++)
+ {
+ int posA = rowR * (colsA * 2) + (i * 2);
+ int posB = i * (colsB * 2) + (colR * 2);
+
+ float rA = ptrA[posA];
+ float iA = ptrA[posA + 1];
+ float rB = ptrB[posB];
+ float iB = ptrB[posB + 1];
+
+ //(rA + iA)(rB + iB)
+ float first = rA * rB;
+ float outer = rA * iB;
+ float inner = iA * rB;
+ float lasts = iA * iB;
+
+ rR += first + lasts;
+ iR += outer + inner;
+ }
+ ptrR[rowR * (colsR * 2) + (colR * 2)] = rR;
+ ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR;
+}
+
+__kernel void kernel_knk
+(
+ __global float* ptrR,
+ __global float* ptrA,
+ __global float* ptrB,
+ const int rowsR,
+ const int colsR,
+ const int rowsA,
+ const int colsA,
+ const int rowsB,
+ const int colsB
+)
+{
+ int rowR = get_global_id(0);
+ int colR = get_global_id(1);
+
+ int rowA = rowR / rowsB;
+ int colA = colR / colsB;
+ int rowB = rowR % rowsB;
+ int colB = colR % colsB;
+
+ int posA = rowA * (colsA * 2) + (colA * 2);
+ int posB = rowB * (colsB * 2) + (colB * 2);
+
+ float rA = ptrA[posA];
+ float iA = ptrA[posA + 1];
+ float rB = ptrB[posB];
+ float iB = ptrB[posB + 1];
+
+ //(rA + iA)(rB + iB)
+ float first = rA * rB;
+ float outer = rA * iB;
+ float inner = iA * rB;
+ float lasts = iA * iB;
+ ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts;
+ ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner;
+}
+
+++ /dev/null
-__kernel void kernel_dot
-(
- __global float* ptrR,
- __global float* ptrA,
- __global float* ptrB,
- const int rowsA,
- const int colsB,
- const int shared
-)
-{
- const int colsA = shared;
- const int rowsB = shared;
- const int rowsR = rowsA;
- const int colsR = colsB;
- const int rowR = get_global_id(0);
- const int colR = get_global_id(1);
- int posA, posB;
- float rR = 0;
- float iR = 0;
-
- const int posR = rowR * (colsR * 2) + (colR * 2);
-
- for (int i = 0; i < shared; i++)
- {
- int posA = rowR * (colsA * 2) + (i * 2);
- int posB = i * (colsB * 2) + (colR * 2);
-
- float rA = ptrA[posA];
- float iA = ptrA[posA + 1];
- float rB = ptrB[posB];
- float iB = ptrB[posB + 1];
-
- //(rA + iA)(rB + iB)
- float first = rA * rB;
- float outer = rA * iB;
- float inner = iA * rB;
- float lasts = iA * iB;
-
- rR += first + lasts;
- iR += outer + inner;
- }
- ptrR[rowR * (colsR * 2) + (colR * 2)] = rR;
- ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR;
-}
-
-__kernel void kernel_knk
-(
- __global float* ptrR,
- __global float* ptrA,
- __global float* ptrB,
- const int rowsR,
- const int colsR,
- const int rowsA,
- const int colsA,
- const int rowsB,
- const int colsB
-)
-{
- int rowR = get_global_id(0);
- int colR = get_global_id(1);
-
- int rowA = rowR / rowsB;
- int colA = colR / colsB;
- int rowB = rowR % rowsB;
- int colB = colR % colsB;
-
- int posA = rowA * (colsA * 2) + (colA * 2);
- int posB = rowB * (colsB * 2) + (colB * 2);
-
- float rA = ptrA[posA];
- float iA = ptrA[posA + 1];
- float rB = ptrB[posB];
- float iB = ptrB[posB + 1];
-
- //(rA + iA)(rB + iB)
- float first = rA * rB;
- float outer = rA * iB;
- float inner = iA * rB;
- float lasts = iA * iB;
- ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts;
- ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner;
-}
-
+++ /dev/null
-unsigned char src_kernel2_cl[] = {
- 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, 0x6f, 0x69,
- 0x64, 0x20, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x5f, 0x64, 0x6f, 0x74,
- 0x0a, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f,
- 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70,
- 0x74, 0x72, 0x52, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67,
- 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a,
- 0x20, 0x70, 0x74, 0x72, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f,
- 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61,
- 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72,
- 0x6f, 0x77, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
- 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73,
- 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
- 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x0a,
- 0x29, 0x0a, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
- 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20,
- 0x3d, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20,
- 0x72, 0x6f, 0x77, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x73, 0x68, 0x61, 0x72,
- 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73,
- 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52, 0x20,
- 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x3b, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63,
- 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42,
- 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
- 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67,
- 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64,
- 0x28, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
- 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20,
- 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c,
- 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x2c, 0x20, 0x70, 0x6f,
- 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61,
- 0x74, 0x20, 0x72, 0x52, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x52, 0x20, 0x3d,
- 0x20, 0x30, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
- 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x52, 0x20,
- 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f,
- 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28,
- 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x0a,
- 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74,
- 0x20, 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x20, 0x69, 0x20, 0x3c, 0x20,
- 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x3b, 0x20, 0x69, 0x2b, 0x2b, 0x29,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x20,
- 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f,
- 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28,
- 0x69, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x42,
- 0x20, 0x3d, 0x20, 0x69, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73,
- 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f,
- 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20,
- 0x72, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f,
- 0x73, 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20,
- 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x2b, 0x20,
- 0x31, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70,
- 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74,
- 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x42, 0x5b, 0x70,
- 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x0a, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41,
- 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20,
- 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20,
- 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74,
- 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20,
- 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e, 0x65,
- 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f,
- 0x61, 0x74, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69,
- 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x52, 0x20, 0x2b, 0x3d, 0x20, 0x66,
- 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73,
- 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x52,
- 0x20, 0x2b, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20,
- 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f,
- 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20,
- 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52,
- 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x72, 0x52, 0x3b,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f,
- 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20,
- 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52,
- 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d,
- 0x20, 0x69, 0x52, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x5f, 0x5f, 0x6b, 0x65,
- 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x6b, 0x65,
- 0x72, 0x6e, 0x65, 0x6c, 0x5f, 0x6b, 0x6e, 0x6b, 0x0a, 0x28, 0x0a, 0x20,
- 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20,
- 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x52, 0x2c,
- 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61,
- 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72,
- 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f,
- 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70,
- 0x74, 0x72, 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
- 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52,
- 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
- 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x2c, 0x0a, 0x20,
- 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74,
- 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f,
- 0x6c, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
- 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42,
- 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
- 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x0a, 0x29, 0x0a,
- 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f,
- 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f,
- 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x0a, 0x20,
- 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20,
- 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c,
- 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
- 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x41, 0x20, 0x3d, 0x20,
- 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2f, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42,
- 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f,
- 0x6c, 0x41, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2f, 0x20,
- 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69,
- 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f,
- 0x77, 0x52, 0x20, 0x25, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a,
- 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x42,
- 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x25, 0x20, 0x63, 0x6f,
- 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e,
- 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77,
- 0x41, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a,
- 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x41, 0x20,
- 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e,
- 0x74, 0x20, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77,
- 0x42, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a,
- 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x42, 0x20,
- 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66,
- 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74,
- 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20,
- 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d,
- 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x2b,
- 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f,
- 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x42,
- 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x70,
- 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31,
- 0x5d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72,
- 0x41, 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b,
- 0x20, 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f,
- 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x3d, 0x20, 0x72,
- 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
- 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20,
- 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20,
- 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e,
- 0x65, 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42,
- 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20,
- 0x6c, 0x61, 0x73, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a,
- 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72,
- 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f,
- 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28,
- 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d,
- 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73,
- 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52,
- 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c,
- 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63,
- 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31,
- 0x5d, 0x20, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20,
- 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x7d, 0x00
-};
-unsigned int src_kernel2_cl_len = 1965;