From: miha-q <> Date: Mon, 4 Mar 2024 15:29:50 +0000 (-0500) Subject: Mon Mar 4 10:29:50 AM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=85822b73099c0515f69f864f8949890463766142;p=QAnsel.git Mon Mar 4 10:29:50 AM EST 2024 --- diff --git a/Makefile b/Makefile index c39e5ad..99b4eae 100644 --- a/Makefile +++ b/Makefile @@ -1,17 +1,10 @@ all: - #gpu mmul - mv src/kernel2.cl src/.kernel2.cl - bash -c 'echo -ne "$$(cat src/.kernel2.cl)\x00" > src/kernel2.cl' - xxd -i src/kernel2.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/kernel2.cl.c - mv src/.kernel2.cl src/kernel2.cl - - #gpu mmul - #mv src/gpu/kernel.cl src/gpu/.kernel.cl - #bash -c 'echo -ne "$$(cat src/gpu/.kernel.cl)\x00" > src/gpu/kernel.cl' - #xxd -i src/gpu/kernel.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/gpu/kernel.cl.c - #mv src/gpu/.kernel.cl src/gpu/kernel.cl + mv src/kernel.cl src/.kernel.cl + bash -c 'echo -ne "$$(cat src/.kernel.cl)\x00" > src/kernel.cl' + xxd -i src/kernel.cl | sed -e 's/gpu_gpu_/gpu_/g' > src/kernel.cl.c + mv src/.kernel.cl src/kernel.cl gcc src/QAnsel.c -g -o bin/QAnsel -lm -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread - rm -f src/gpu/*.cl.c + rm -f src/*.cl.c diff --git a/src/QAnsel.c b/src/QAnsel.c index eda7af4..86786e9 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -2,8 +2,8 @@ #include #include #include -#include "complex2.c" -#include "gates2.c" +#include "complex.c" +#include "gates.c" #include "display.c" #include "chacha20.c" #define QUBITS_MAX 14 diff --git a/src/complex.c b/src/complex.c index 54548e3..2b2ddd9 100644 --- a/src/complex.c +++ b/src/complex.c @@ -6,7 +6,7 @@ #include #include #include "cores.c" - +#define GPU_ENABLED typedef struct { float real, imaginary; @@ -15,13 +15,13 @@ typedef struct typedef struct { float *ptr; - size_t rows, cols; + int rows, cols; } cpx_mtx_t; uint8_t* cpx_str(cpx_t* n) { uint8_t* r; - size_t z; + int z; float rl = n->real; float ig = n->imaginary >= 0 ? n->imaginary : -(n->imaginary); @@ -50,18 +50,6 @@ cpx_t cpx_new(float r, float i) return n; } -void cpx_add(cpx_t* r, cpx_t* a, cpx_t* b) -{ - r->real = a->real + b->real; - r->imaginary = a->imaginary + b->imaginary; -} - -void cpx_sub(cpx_t* r, cpx_t* a, cpx_t* b) -{ - r->real = a->real - b->real; - r->imaginary = a->imaginary - b->imaginary; -} - void cpx_mul(cpx_t* r, cpx_t* a, cpx_t* b) { //FOIL @@ -73,158 +61,6 @@ void cpx_mul(cpx_t* r, cpx_t* a, cpx_t* b) r->imaginary = outer + inner; } -//non-complex matrix multiply -// shared = colsA = rowsB -void cpx_ncpx_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) -{ - size_t colsA = shared; - size_t rowsB = shared; - size_t rowsR = rowsA; - size_t colsR = colsB; - for (size_t rowR = 0; rowR < rowsR; rowR++) - { - for (size_t colR = 0; colR < colsR; colR++) - { - size_t posR = colR + rowR * colsR; - size_t rowA = rowR; - size_t colB = colR; - ptrR[posR] = 0; - for (size_t i = 0; i < shared; i++) - { - size_t posA = i + rowA * colsA; - size_t posB = colB + i * colsB; - ptrR[posR] += ptrA[posA] * ptrB[posB]; - } - } - } -} - -typedef struct -{ - size_t ID; - size_t Threads; - size_t Last; - size_t Loops; - size_t Continue; - size_t BlockSize; - float* ptrR; - float* ptrA; - float* ptrB; - size_t rowsA; - size_t colsB; - size_t shared; -} cpx_mul_shared; - -void* cpx_ncpx_mmul_mtc(void *context) -{ - cpx_mul_shared* data = (cpx_mul_shared*)context; - float* ptrR = data->ptrR; - float* ptrA = data->ptrA; - float* ptrB = data->ptrB; - size_t rowsA = data->rowsA; - size_t colsB = data->colsB; - size_t shared = data->shared; - - size_t colsA = data->shared; - size_t rowsB = data->shared; - size_t rowsR = data->rowsA; - size_t colsR = data->colsB; - - for (size_t rowR = 0; rowR < rowsR; rowR++) - { - size_t a = data->ID * data->BlockSize; - size_t b = (data->ID + 1) * data->BlockSize; - if (data->ID == data->Last) b += data->Continue; - - //printf("%i;%i\n", a, b); - - for (size_t colR = a; colR < b; colR++) - { - size_t posR = colR + rowR * colsR; - size_t rowA = rowR; - size_t colB = colR; - ptrR[posR] = 0; - for (size_t i = 0; i < data->shared; i++) - { - size_t posA = i + rowA * colsA; - size_t posB = colB + i * colsB; - data->ptrR[posR] += data->ptrA[posA] * data->ptrB[posB]; - } - } - } -} - -void cpx_ncpx_mmul_mt(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) -{ - cpx_mul_shared share; - share.Threads = get_core_count(); - share.ptrR = ptrR; - share.ptrA = ptrA; - share.ptrB = ptrB; - share.rowsA = rowsA; - share.colsB = colsB; - share.shared = shared; - if (colsB <= share.Threads) - { - share.Threads = colsB; - } - share.BlockSize = (size_t)floor(((float)colsB) / ((float)share.Threads)); - share.Loops = (size_t)floor(((float)colsB) / ((float)share.BlockSize)); - share.Last = share.Loops - 1; - share.Continue = (size_t)(((float)colsB) - ((float)share.Loops) * ((float)share.BlockSize)); - - pthread_t threads[share.Loops]; - cpx_mul_shared contexts[share.Loops]; - for (size_t i = 0; i < share.Loops; i++) - { - pthread_t tid; - threads[i] = tid; - memcpy(contexts + i, &share, sizeof(cpx_mul_shared)); - contexts[i].ID = i; - } - - for (size_t i = 0; i < share.Loops; i++) - { - if (pthread_create(threads + i, NULL, &cpx_ncpx_mmul_mtc, contexts + i)) - { - fprintf(stderr, "QAnsel: Thread error. (1)\n"); - exit(1); - } - } - - for (uint32_t i = 0; i < share.Loops; i++) - { - pthread_t tid; - tid = threads[i]; - if (pthread_join(tid, NULL)) - { - fprintf(stderr, "QAnsel: Thread error. (2)\n"); - } - } -} - -//non-complex kronecker product -void cpx_ncpx_mknk(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) -{ - size_t rowsR = rowsA * rowsB; - size_t colsR = colsA * colsB; - for (size_t rowR = 0; rowR < rowsR; rowR++) - { - for (size_t colR = 0; colR < colsR; colR++) - { - size_t rowA = rowR / rowsB; - size_t colA = colR / colsB; - size_t rowB = rowR % rowsB; - size_t colB = colR % colsB; - printf(">%i,%i|%i<\n", colR, rowR, colR + rowR * colsR); - ptrR[colR + rowR * colsR] = - ptrA[colA + rowA * colsA] - * ptrB[colB + rowB * colsB] - ; - } - } -} - float cpx_magsqr(cpx_t* n) { return (n->real * n->real) + (n->imaginary * n->imaginary); @@ -235,81 +71,33 @@ float cpx_mag(cpx_t* n) return sqrt((n->real * n->real) + (n->imaginary * n->imaginary)); } -void cpx_mtx_mul(cpx_mtx_t* r, cpx_mtx_t* a, cpx_mtx_t* b) -{ - r->rows = a->rows; - r->cols = b->cols; - cpx_ncpx_mmul(r->ptr, a->ptr, b->ptr, a->rows * 2, b->cols * 2, a->cols * 2); -} - -void cpx_mtx_set(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n) +void cpx_mtx_set(cpx_mtx_t* m, int row, int col, cpx_t* n) { - row *= 2; - col *= 2; - size_t cols = m->cols * 2; - m->ptr[col + row * cols] = n->real; - m->ptr[(col + 1) + row * cols] = -(n->imaginary); - m->ptr[col + (row + 1) * cols] = n->imaginary; - m->ptr[(col + 1) + (row + 1) * cols] = n->real; + m->ptr[row * ((m->cols) * 2) + (col * 2)] = n->real; + m->ptr[row * ((m->cols) * 2) + (col * 2) + 1] = n->imaginary; } -void cpx_mtx_set2(cpx_mtx_t* m, size_t row, size_t col, float real, float imaginary) +void cpx_mtx_set2(cpx_mtx_t* m, int row, int col, float real, float imaginary) { - row *= 2; - col *= 2; - size_t cols = m->cols * 2; - m->ptr[col + row * cols] = real; - m->ptr[(col + 1) + row * cols] = -imaginary; - m->ptr[col + (row + 1) * cols] = imaginary; - m->ptr[(col + 1) + (row + 1) * cols] = real; + m->ptr[row * ((m->cols) * 2) + (col * 2)] = real; + m->ptr[row * ((m->cols) * 2) + (col * 2) + 1] = imaginary; } -void cpx_mtx_get(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n) +void cpx_mtx_get(cpx_mtx_t* m, int row, int col, cpx_t* n) { - row *= 2; - col *= 2; - size_t cols = m->cols * 2; - - //printf("qqq\n"); - n->real = m->ptr[(col + 1) + (row + 1) * cols]; - //printf("ppp\n"); - n->imaginary = m->ptr[col + (row + 1) * cols]; - //printf("ggg\n"); + n->real = m->ptr[row * ((m->cols) * 2) + (col * 2)]; + n->imaginary = m->ptr[row * ((m->cols) * 2) + (col * 2) + 1]; } -float cpx_mtx_get_real(cpx_mtx_t* m, size_t row, size_t col) -{ - row *= 2; - col *= 2; - size_t cols = m->cols * 2; - return m->ptr[(col + 1) + (row + 1) * cols]; -} - -float cpx_mtx_get_imaginary(cpx_mtx_t* m, size_t row, size_t col) -{ - row *= 2; - col *= 2; - size_t cols = m->cols * 2; - return m->ptr[col + (row + 1) * cols]; -} - -void cpx_mtx_init(cpx_mtx_t* m, size_t rows, size_t cols) +void cpx_mtx_init(cpx_mtx_t* m, int rows, int cols) { + int z = rows * (cols * 2) * sizeof(float); + m->ptr = malloc(z); m->rows = rows; m->cols = cols; - size_t z = (rows * 2) * (cols * 2); - m->ptr = malloc(z * sizeof(float)); - for (size_t i = 0; i < z; i++) m->ptr[i] = 0; + memset(m->ptr, 0, z); } -void cpx_mtx_expand_row(cpx_mtx_t* m) -{ - m->rows += 1; - size_t z = (m->rows * 2) * (m->cols * 2); - m->ptr = realloc(m->ptr, z * sizeof(float)); -} - - void cpx_mtx_free(cpx_mtx_t* m) { if (m->ptr != NULL) free(m->ptr); @@ -317,220 +105,307 @@ void cpx_mtx_free(cpx_mtx_t* m) m->cols = 0; } -typedef struct -{ - size_t ID; - size_t Threads; - size_t Last; - size_t Loops; - size_t Continue; - size_t BlockSize; - float* ptrR; - size_t rowsR; - size_t colsR; - float* ptrA; - size_t rowsA; - size_t colsA; - float* ptrB; - size_t rowsB; - size_t colsB; -} cpx_knk_shared; - -void* cpx_ncpx_knk_mtc(void *context) +void cpx_mtx_print(cpx_mtx_t* m) { - cpx_knk_shared* data = (cpx_knk_shared*)context; - float* ptrR = data->ptrR; - size_t rowsR = data->rowsR; - size_t colsR = data->colsR; - float* ptrA = data->ptrA; - size_t rowsA = data->rowsA; - size_t colsA = data->colsA; - float* ptrB = data->ptrB; - size_t rowsB = data->rowsB; - size_t colsB = data->colsB; - - for (size_t rowR = 0; rowR < rowsR; rowR++) + for (int r = 0; r < m->rows; r++) { - size_t a = data->ID * data->BlockSize; - size_t b = (data->ID + 1) * data->BlockSize; - if (data->ID == data->Last) b += data->Continue; - for (size_t colR = a; colR < b; colR++) + if (r > 0) printf("\n"); + for (int c = 0; c < m->cols; c++) { - size_t rowA = rowR / rowsB; - size_t colA = colR / colsB; - size_t rowB = rowR % rowsB; - size_t colB = colR % colsB; - - float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; - float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; - float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; - float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; - - float first = r1 * r2; //real - float outer = r1 * i2; //imaginary - float inner = i1 * r2; //imaginary - float last = -(i1 * i2); //real - r1 = first + last; - i1 = outer + inner; - - ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; - ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; - ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; - ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; - + cpx_t n; + cpx_mtx_get(m, r, c, &n); + uint8_t* s = cpx_str(&n); + if (c > 0) printf(", "); + printf("%s", s); + free(s); } } } -void cpx_ncpx_knk_mt -( - float* ptrR, - size_t rowsR, - size_t colsR, - float* ptrA, - size_t rowsA, - size_t colsA, - float* ptrB, - size_t rowsB, - size_t colsB -) +//This is for testing GPU functions on the CPU +#define __kernel +#define __global +int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2; +int get_global_id(int id) { - cpx_knk_shared share; - share.Threads = get_core_count(); - share.ptrR = ptrR; - share.rowsR = rowsR; - share.colsR = colsR; - share.ptrA = ptrA; - share.rowsA = rowsA; - share.colsA = colsA; - share.ptrB = ptrB; - share.rowsB = rowsB; - share.colsB = colsB; - - if (colsR <= share.Threads) + switch (id) { - share.Threads = colsR; - } - share.BlockSize = (size_t)floor(((float)colsR) / ((float)share.Threads)); - share.Loops = (size_t)floor(((float)colsR) / ((float)share.BlockSize)); - share.Last = share.Loops - 1; - share.Continue = (size_t)(((float)colsR) - ((float)share.Loops) * ((float)share.BlockSize)); - - pthread_t threads[share.Loops]; - cpx_knk_shared contexts[share.Loops]; - for (size_t i = 0; i < share.Loops; i++) - { - pthread_t tid; - threads[i] = tid; - memcpy(contexts + i, &share, sizeof(cpx_knk_shared)); - contexts[i].ID = i; + case 0: return GPU_GLOBAL_ID_0; + case 1: return GPU_GLOBAL_ID_1; + case 2: return GPU_GLOBAL_ID_2; } +} +#include "kernel.cl" - for (size_t i = 0; i < share.Loops; i++) +void cpx_mtx_dot(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) +{ + for (int i = 0; i < rowsA; i++) { - if (pthread_create(threads + i, NULL, &cpx_ncpx_knk_mtc, contexts + i)) + for (int j = 0; j < colsB; j++) { - fprintf(stderr, "QAnsel: Thread error. (1)\n"); - exit(1); + GPU_GLOBAL_ID_0 = i; + GPU_GLOBAL_ID_1 = j; + kernel_dot(ptrR, ptrA, ptrB, rowsA, colsB, shared); } } +} - for (uint32_t i = 0; i < share.Loops; i++) +void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB) +{ + for (int i = 0; i < rowsR; i++) { - pthread_t tid; - tid = threads[i]; - if (pthread_join(tid, NULL)) + for (int j = 0; j < colsR; j++) { - fprintf(stderr, "QAnsel: Thread error. (2)\n"); + GPU_GLOBAL_ID_0 = i; + GPU_GLOBAL_ID_1 = j; + kernel_knk(ptrR, ptrA, ptrB, rowsR, colsR, rowsA, colsA, rowsB, colsB); } } } -void cpx_ncpx_knk -( - float* ptrR, - size_t rowsR, - size_t colsR, - float* ptrA, - size_t rowsA, - size_t colsA, - float* ptrB, - size_t rowsB, - size_t colsB -) +#ifdef GPU_ENABLED +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#define CL_TARGET_OPENCL_VERSION 300 +#include +#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: " #x "() failed.\n"); exit(1); } +#include "kernel.cl.c" +cl_platform_id cpx_mtx_platform_id; +cl_device_id cpx_mtx_device_id; +cl_context cpx_mtx_context; +cl_command_queue cpx_mtx_command_queue; +unsigned char* cpx_mtx_cache = NULL; +size_t cpx_mtx_cache_len = 0; +#endif + +uint8_t cpx_mtx_begin() { - for (size_t rowR = 0; rowR < rowsR; rowR++) - { - for (size_t colR = 0; colR < colsR; colR++) - { - size_t rowA = rowR / rowsB; - size_t colA = colR / colsB; - size_t rowB = rowR % rowsB; - size_t colB = colR % colsB; - - float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; - float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; - float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; - float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; - - float first = r1 * r2; //real - float outer = r1 * i2; //imaginary - float inner = i1 * r2; //imaginary - float last = -(i1 * i2); //real - r1 = first + last; - i1 = outer + inner; - - ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; - ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; - ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; - ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; - - } - } + #ifdef GPU_ENABLED + cl_uint count; + cl_int err; + + err = clGetPlatformIDs(1, &cpx_mtx_platform_id, &count); + if (err != CL_SUCCESS || count == 0) + { + if (err == 0) + fprintf(stderr, "GPU error: No supported platforms found.\n"); + else + fprintf(stderr, "GPU error: clGetPlatformIDs() failed.\n"); + return 0; + } + + err = clGetDeviceIDs(cpx_mtx_platform_id, CL_DEVICE_TYPE_GPU, 1, &cpx_mtx_device_id, &count); + if (err != CL_SUCCESS || count == 0) + { + if (count == 0) + fprintf(stderr, "GPU error: No supported GPUs found.\n"); + else + fprintf(stderr, "GPU error: clGetDeviceIDs() failed.\n"); + return 0; + } + + cpx_mtx_context = clCreateContext(NULL, 1, &cpx_mtx_device_id, NULL, NULL, &err); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU error: clCreateContext() failed.\n"); + return 0; + } + + cpx_mtx_command_queue = clCreateCommandQueue(cpx_mtx_context, cpx_mtx_device_id, 0, &err); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU error: clCreateCommandQueue() failed.\n"); + err = clReleaseContext(cpx_mtx_context); + if (err != CL_SUCCESS) + fprintf(stderr, "GPU error: clReleaseContext() failed.\n"); + return 0; + } + #endif + return 1; } -void cpx_mtx_knk(cpx_mtx_t* r, cpx_mtx_t* a, cpx_mtx_t* b) +void cpx_mtx_clean() { - size_t rowsA = a->rows; - size_t colsA = a->cols; - size_t rowsB = b->rows; - size_t colsB = b->cols; - size_t rowsR = rowsA * rowsB; - size_t colsR = colsA * colsB; - for (size_t rowR = 0; rowR < rowsR; rowR++) - { - for (size_t colR = 0; colR < colsR; colR++) - { - size_t rowA = rowR / rowsB; - size_t colA = colR / colsB; - size_t rowB = rowR % rowsB; - size_t colB = colR % colsB; - cpx_t n1, n2; - - cpx_mtx_get(a, rowA, colA, &n1); - cpx_mtx_get(b, rowB, colB, &n2); - cpx_mul(&n1, &n1, &n2); - - cpx_mtx_set(r, rowR, colR, &n1); - } - } + #ifdef GPU_ENABLED + cl_int err; + err = clReleaseCommandQueue(cpx_mtx_command_queue); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU error: clReleaseCommandQueue() failed.\n"); + } + err = clReleaseContext(cpx_mtx_context); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU error: clReleaseContext() failed.\n"); + } + free(cpx_mtx_cache); + #endif } -void cpx_mtx_print(cpx_mtx_t* m) +#ifdef GPU_ENABLED +void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared) { - for (size_t r = 0; r < m->rows; r++) - { - if (r > 0) printf("\n"); - for (size_t c = 0; c < m->cols; c++) - { - cpx_t n; - cpx_mtx_get(m, r, c, &n); - uint8_t* s = cpx_str(&n); - if (c > 0) printf(", "); - printf("%s", s); - free(s); - } - } + int colsA = shared; + int rowsB = shared; + int rowsR = rowsA; + int colsR = colsB; + + //Create buffers + size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); + size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); + size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float); + cl_int err; + cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); + gpuerr(clCreateBuffer); + cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); + gpuerr(clCreateBuffer); + cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); + gpuerr(clCreateBuffer); + + //Populate buffers + err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); + gpuerr(clEnqueueWriteBuffer); + err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); + gpuerr(clEnqueueWriteBuffer); + + //Load and compile program + cl_program program; + if (cpx_mtx_cache == NULL) + { + program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel_cl}, NULL, &err); + gpuerr(clCreateProgramWithSource); + err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); + size_t log_size; + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* log = malloc(log_size); + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("%s", log); + free(log); + exit(1); + } + err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); + gpuerr(clGetProgramInfo); + cpx_mtx_cache = malloc(cpx_mtx_cache_len); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); + gpuerr(clGetProgramInfo); + } + else + { + program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); + gpuerr(clCreateProgramWithBinary); + } + + //Setup kernel + cl_kernel kernel = clCreateKernel(program, "kernel_dot", &err); gpuerr(clCreateKernel); + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 4, sizeof(int), &colsB); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 5, sizeof(int), &shared); gpuerr(clSetKernelArg); + + //Run the program + size_t work_size[] = {rowsR, colsR}; + err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); + gpuerr(clEnqueueNDRangeKernel); + + //Wait for completion + err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); + err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); + + //Read results + err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); + gpuerr(clEnqueueReadBuffer); + + //Clean up + err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); + err = clReleaseProgram(program); gpuerr(clReleaseProgram); + err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); + err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); + err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); +} + +void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB) +{ + //Create buffers + size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); + size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); + size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float); + cl_int err; + cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer); + cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(clCreateBuffer); + cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer); + + //Populate buffers + err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); + gpuerr(clEnqueueWriteBuffer); + err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); + gpuerr(clEnqueueWriteBuffer); + + //Load and compile program + cl_program program; + if (cpx_mtx_cache == NULL) + { + program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel_cl}, NULL, &err); + gpuerr(clCreateProgramWithSource); + err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); + size_t log_size; + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* log = malloc(log_size); + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("%s", log); + free(log); + exit(1); + } + err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); + gpuerr(clGetProgramInfo); + cpx_mtx_cache = malloc(cpx_mtx_cache_len); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); + gpuerr(clGetProgramInfo); + } + else + { + program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); + gpuerr(clCreateProgramWithBinary); + } + + //Setup kernel + cl_kernel kernel = clCreateKernel(program, "kernel_knk", &err); gpuerr(clCreateKernel); + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 3, sizeof(int), &rowsR); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 4, sizeof(int), &colsR); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 5, sizeof(int), &rowsA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 6, sizeof(int), &colsA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 7, sizeof(int), &rowsB); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg); + + //Run the program + size_t work_size[] = {rowsR, colsR}; + err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); + gpuerr(clEnqueueNDRangeKernel); + + //Wait for completion + err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); + err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); + + //Read results + err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); + gpuerr(clEnqueueReadBuffer); + + //Clean up + err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); + err = clReleaseProgram(program); gpuerr(clReleaseProgram); + err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); + err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); + err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); } +#endif #endif diff --git a/src/complex.c.old b/src/complex.c.old new file mode 100644 index 0000000..54548e3 --- /dev/null +++ b/src/complex.c.old @@ -0,0 +1,536 @@ +#ifndef __cpx__ +#define __cpx__ +#include +#include +#include +#include +#include +#include "cores.c" + +typedef struct +{ + float real, imaginary; +} cpx_t; + +typedef struct +{ + float *ptr; + size_t rows, cols; +} cpx_mtx_t; + +uint8_t* cpx_str(cpx_t* n) +{ + uint8_t* r; + size_t z; + + float rl = n->real; + float ig = n->imaginary >= 0 ? n->imaginary : -(n->imaginary); + if (ig == 0) + { + z = snprintf(NULL, 0, "%f", rl); + r = malloc(z + 1); + sprintf(r, "%f", rl); + + } + else + { + uint8_t op = n->imaginary >= 0 ? '+' : '-'; + z = snprintf(NULL, 0, "%f %c %fi", rl, op, ig); + r = malloc(z + 1); + sprintf(r, "%f %c %fi", rl, op, ig); + } + return r; +} + +cpx_t cpx_new(float r, float i) +{ + cpx_t n; + n.real = r; + n.imaginary = i; + return n; +} + +void cpx_add(cpx_t* r, cpx_t* a, cpx_t* b) +{ + r->real = a->real + b->real; + r->imaginary = a->imaginary + b->imaginary; +} + +void cpx_sub(cpx_t* r, cpx_t* a, cpx_t* b) +{ + r->real = a->real - b->real; + r->imaginary = a->imaginary - b->imaginary; +} + +void cpx_mul(cpx_t* r, cpx_t* a, cpx_t* b) +{ + //FOIL + float first = a->real * b->real; //real + float outer = a->real * b->imaginary; //imaginary + float inner = a->imaginary * b->real; //imaginary + float last = -(a->imaginary * b->imaginary); //real + r->real = first + last; + r->imaginary = outer + inner; +} + +//non-complex matrix multiply +// shared = colsA = rowsB +void cpx_ncpx_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) +{ + size_t colsA = shared; + size_t rowsB = shared; + size_t rowsR = rowsA; + size_t colsR = colsB; + for (size_t rowR = 0; rowR < rowsR; rowR++) + { + for (size_t colR = 0; colR < colsR; colR++) + { + size_t posR = colR + rowR * colsR; + size_t rowA = rowR; + size_t colB = colR; + ptrR[posR] = 0; + for (size_t i = 0; i < shared; i++) + { + size_t posA = i + rowA * colsA; + size_t posB = colB + i * colsB; + ptrR[posR] += ptrA[posA] * ptrB[posB]; + } + } + } +} + +typedef struct +{ + size_t ID; + size_t Threads; + size_t Last; + size_t Loops; + size_t Continue; + size_t BlockSize; + float* ptrR; + float* ptrA; + float* ptrB; + size_t rowsA; + size_t colsB; + size_t shared; +} cpx_mul_shared; + +void* cpx_ncpx_mmul_mtc(void *context) +{ + cpx_mul_shared* data = (cpx_mul_shared*)context; + float* ptrR = data->ptrR; + float* ptrA = data->ptrA; + float* ptrB = data->ptrB; + size_t rowsA = data->rowsA; + size_t colsB = data->colsB; + size_t shared = data->shared; + + size_t colsA = data->shared; + size_t rowsB = data->shared; + size_t rowsR = data->rowsA; + size_t colsR = data->colsB; + + for (size_t rowR = 0; rowR < rowsR; rowR++) + { + size_t a = data->ID * data->BlockSize; + size_t b = (data->ID + 1) * data->BlockSize; + if (data->ID == data->Last) b += data->Continue; + + //printf("%i;%i\n", a, b); + + for (size_t colR = a; colR < b; colR++) + { + size_t posR = colR + rowR * colsR; + size_t rowA = rowR; + size_t colB = colR; + ptrR[posR] = 0; + for (size_t i = 0; i < data->shared; i++) + { + size_t posA = i + rowA * colsA; + size_t posB = colB + i * colsB; + data->ptrR[posR] += data->ptrA[posA] * data->ptrB[posB]; + } + } + } +} + +void cpx_ncpx_mmul_mt(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) +{ + cpx_mul_shared share; + share.Threads = get_core_count(); + share.ptrR = ptrR; + share.ptrA = ptrA; + share.ptrB = ptrB; + share.rowsA = rowsA; + share.colsB = colsB; + share.shared = shared; + if (colsB <= share.Threads) + { + share.Threads = colsB; + } + share.BlockSize = (size_t)floor(((float)colsB) / ((float)share.Threads)); + share.Loops = (size_t)floor(((float)colsB) / ((float)share.BlockSize)); + share.Last = share.Loops - 1; + share.Continue = (size_t)(((float)colsB) - ((float)share.Loops) * ((float)share.BlockSize)); + + pthread_t threads[share.Loops]; + cpx_mul_shared contexts[share.Loops]; + for (size_t i = 0; i < share.Loops; i++) + { + pthread_t tid; + threads[i] = tid; + memcpy(contexts + i, &share, sizeof(cpx_mul_shared)); + contexts[i].ID = i; + } + + for (size_t i = 0; i < share.Loops; i++) + { + if (pthread_create(threads + i, NULL, &cpx_ncpx_mmul_mtc, contexts + i)) + { + fprintf(stderr, "QAnsel: Thread error. (1)\n"); + exit(1); + } + } + + for (uint32_t i = 0; i < share.Loops; i++) + { + pthread_t tid; + tid = threads[i]; + if (pthread_join(tid, NULL)) + { + fprintf(stderr, "QAnsel: Thread error. (2)\n"); + } + } +} + +//non-complex kronecker product +void cpx_ncpx_mknk(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) +{ + size_t rowsR = rowsA * rowsB; + size_t colsR = colsA * colsB; + for (size_t rowR = 0; rowR < rowsR; rowR++) + { + for (size_t colR = 0; colR < colsR; colR++) + { + size_t rowA = rowR / rowsB; + size_t colA = colR / colsB; + size_t rowB = rowR % rowsB; + size_t colB = colR % colsB; + printf(">%i,%i|%i<\n", colR, rowR, colR + rowR * colsR); + ptrR[colR + rowR * colsR] = + ptrA[colA + rowA * colsA] + * ptrB[colB + rowB * colsB] + ; + } + } +} + +float cpx_magsqr(cpx_t* n) +{ + return (n->real * n->real) + (n->imaginary * n->imaginary); +} + +float cpx_mag(cpx_t* n) +{ + return sqrt((n->real * n->real) + (n->imaginary * n->imaginary)); +} + +void cpx_mtx_mul(cpx_mtx_t* r, cpx_mtx_t* a, cpx_mtx_t* b) +{ + r->rows = a->rows; + r->cols = b->cols; + cpx_ncpx_mmul(r->ptr, a->ptr, b->ptr, a->rows * 2, b->cols * 2, a->cols * 2); +} + +void cpx_mtx_set(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n) +{ + row *= 2; + col *= 2; + size_t cols = m->cols * 2; + m->ptr[col + row * cols] = n->real; + m->ptr[(col + 1) + row * cols] = -(n->imaginary); + m->ptr[col + (row + 1) * cols] = n->imaginary; + m->ptr[(col + 1) + (row + 1) * cols] = n->real; +} + +void cpx_mtx_set2(cpx_mtx_t* m, size_t row, size_t col, float real, float imaginary) +{ + row *= 2; + col *= 2; + size_t cols = m->cols * 2; + m->ptr[col + row * cols] = real; + m->ptr[(col + 1) + row * cols] = -imaginary; + m->ptr[col + (row + 1) * cols] = imaginary; + m->ptr[(col + 1) + (row + 1) * cols] = real; +} + +void cpx_mtx_get(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n) +{ + row *= 2; + col *= 2; + size_t cols = m->cols * 2; + + //printf("qqq\n"); + n->real = m->ptr[(col + 1) + (row + 1) * cols]; + //printf("ppp\n"); + n->imaginary = m->ptr[col + (row + 1) * cols]; + //printf("ggg\n"); +} + +float cpx_mtx_get_real(cpx_mtx_t* m, size_t row, size_t col) +{ + row *= 2; + col *= 2; + size_t cols = m->cols * 2; + return m->ptr[(col + 1) + (row + 1) * cols]; +} + +float cpx_mtx_get_imaginary(cpx_mtx_t* m, size_t row, size_t col) +{ + row *= 2; + col *= 2; + size_t cols = m->cols * 2; + return m->ptr[col + (row + 1) * cols]; +} + +void cpx_mtx_init(cpx_mtx_t* m, size_t rows, size_t cols) +{ + m->rows = rows; + m->cols = cols; + size_t z = (rows * 2) * (cols * 2); + m->ptr = malloc(z * sizeof(float)); + for (size_t i = 0; i < z; i++) m->ptr[i] = 0; +} + +void cpx_mtx_expand_row(cpx_mtx_t* m) +{ + m->rows += 1; + size_t z = (m->rows * 2) * (m->cols * 2); + m->ptr = realloc(m->ptr, z * sizeof(float)); +} + + +void cpx_mtx_free(cpx_mtx_t* m) +{ + if (m->ptr != NULL) free(m->ptr); + m->rows = 0; + m->cols = 0; +} + +typedef struct +{ + size_t ID; + size_t Threads; + size_t Last; + size_t Loops; + size_t Continue; + size_t BlockSize; + float* ptrR; + size_t rowsR; + size_t colsR; + float* ptrA; + size_t rowsA; + size_t colsA; + float* ptrB; + size_t rowsB; + size_t colsB; +} cpx_knk_shared; + +void* cpx_ncpx_knk_mtc(void *context) +{ + cpx_knk_shared* data = (cpx_knk_shared*)context; + float* ptrR = data->ptrR; + size_t rowsR = data->rowsR; + size_t colsR = data->colsR; + float* ptrA = data->ptrA; + size_t rowsA = data->rowsA; + size_t colsA = data->colsA; + float* ptrB = data->ptrB; + size_t rowsB = data->rowsB; + size_t colsB = data->colsB; + + for (size_t rowR = 0; rowR < rowsR; rowR++) + { + size_t a = data->ID * data->BlockSize; + size_t b = (data->ID + 1) * data->BlockSize; + if (data->ID == data->Last) b += data->Continue; + for (size_t colR = a; colR < b; colR++) + { + size_t rowA = rowR / rowsB; + size_t colA = colR / colsB; + size_t rowB = rowR % rowsB; + size_t colB = colR % colsB; + + float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; + float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; + float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; + float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; + + float first = r1 * r2; //real + float outer = r1 * i2; //imaginary + float inner = i1 * r2; //imaginary + float last = -(i1 * i2); //real + r1 = first + last; + i1 = outer + inner; + + ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; + ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; + ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; + ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; + + } + } +} + +void cpx_ncpx_knk_mt +( + float* ptrR, + size_t rowsR, + size_t colsR, + float* ptrA, + size_t rowsA, + size_t colsA, + float* ptrB, + size_t rowsB, + size_t colsB +) +{ + cpx_knk_shared share; + share.Threads = get_core_count(); + share.ptrR = ptrR; + share.rowsR = rowsR; + share.colsR = colsR; + share.ptrA = ptrA; + share.rowsA = rowsA; + share.colsA = colsA; + share.ptrB = ptrB; + share.rowsB = rowsB; + share.colsB = colsB; + + if (colsR <= share.Threads) + { + share.Threads = colsR; + } + share.BlockSize = (size_t)floor(((float)colsR) / ((float)share.Threads)); + share.Loops = (size_t)floor(((float)colsR) / ((float)share.BlockSize)); + share.Last = share.Loops - 1; + share.Continue = (size_t)(((float)colsR) - ((float)share.Loops) * ((float)share.BlockSize)); + + pthread_t threads[share.Loops]; + cpx_knk_shared contexts[share.Loops]; + for (size_t i = 0; i < share.Loops; i++) + { + pthread_t tid; + threads[i] = tid; + memcpy(contexts + i, &share, sizeof(cpx_knk_shared)); + contexts[i].ID = i; + } + + for (size_t i = 0; i < share.Loops; i++) + { + if (pthread_create(threads + i, NULL, &cpx_ncpx_knk_mtc, contexts + i)) + { + fprintf(stderr, "QAnsel: Thread error. (1)\n"); + exit(1); + } + } + + for (uint32_t i = 0; i < share.Loops; i++) + { + pthread_t tid; + tid = threads[i]; + if (pthread_join(tid, NULL)) + { + fprintf(stderr, "QAnsel: Thread error. (2)\n"); + } + } +} + +void cpx_ncpx_knk +( + float* ptrR, + size_t rowsR, + size_t colsR, + float* ptrA, + size_t rowsA, + size_t colsA, + float* ptrB, + size_t rowsB, + size_t colsB +) +{ + for (size_t rowR = 0; rowR < rowsR; rowR++) + { + for (size_t colR = 0; colR < colsR; colR++) + { + size_t rowA = rowR / rowsB; + size_t colA = colR / colsB; + size_t rowB = rowR % rowsB; + size_t colB = colR % colsB; + + float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; + float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; + float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; + float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; + + float first = r1 * r2; //real + float outer = r1 * i2; //imaginary + float inner = i1 * r2; //imaginary + float last = -(i1 * i2); //real + r1 = first + last; + i1 = outer + inner; + + ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; + ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; + ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; + ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; + + } + } +} + +void cpx_mtx_knk(cpx_mtx_t* r, cpx_mtx_t* a, cpx_mtx_t* b) +{ + size_t rowsA = a->rows; + size_t colsA = a->cols; + size_t rowsB = b->rows; + size_t colsB = b->cols; + size_t rowsR = rowsA * rowsB; + size_t colsR = colsA * colsB; + for (size_t rowR = 0; rowR < rowsR; rowR++) + { + for (size_t colR = 0; colR < colsR; colR++) + { + size_t rowA = rowR / rowsB; + size_t colA = colR / colsB; + size_t rowB = rowR % rowsB; + size_t colB = colR % colsB; + cpx_t n1, n2; + + cpx_mtx_get(a, rowA, colA, &n1); + cpx_mtx_get(b, rowB, colB, &n2); + cpx_mul(&n1, &n1, &n2); + + cpx_mtx_set(r, rowR, colR, &n1); + } + } +} + +void cpx_mtx_print(cpx_mtx_t* m) +{ + for (size_t r = 0; r < m->rows; r++) + { + if (r > 0) printf("\n"); + for (size_t c = 0; c < m->cols; c++) + { + cpx_t n; + cpx_mtx_get(m, r, c, &n); + uint8_t* s = cpx_str(&n); + if (c > 0) printf(", "); + printf("%s", s); + free(s); + } + } +} + +#endif diff --git a/src/complex2.c b/src/complex2.c deleted file mode 100644 index ae3677c..0000000 --- a/src/complex2.c +++ /dev/null @@ -1,411 +0,0 @@ -#ifndef __cpx__ -#define __cpx__ -#include -#include -#include -#include -#include -#include "cores.c" -#define GPU_ENABLED -typedef struct -{ - float real, imaginary; -} cpx_t; - -typedef struct -{ - float *ptr; - int rows, cols; -} cpx_mtx_t; - -uint8_t* cpx_str(cpx_t* n) -{ - uint8_t* r; - int z; - - float rl = n->real; - float ig = n->imaginary >= 0 ? n->imaginary : -(n->imaginary); - if (ig == 0) - { - z = snprintf(NULL, 0, "%f", rl); - r = malloc(z + 1); - sprintf(r, "%f", rl); - - } - else - { - uint8_t op = n->imaginary >= 0 ? '+' : '-'; - z = snprintf(NULL, 0, "%f %c %fi", rl, op, ig); - r = malloc(z + 1); - sprintf(r, "%f %c %fi", rl, op, ig); - } - return r; -} - -cpx_t cpx_new(float r, float i) -{ - cpx_t n; - n.real = r; - n.imaginary = i; - return n; -} - -void cpx_mul(cpx_t* r, cpx_t* a, cpx_t* b) -{ - //FOIL - float first = a->real * b->real; //real - float outer = a->real * b->imaginary; //imaginary - float inner = a->imaginary * b->real; //imaginary - float last = -(a->imaginary * b->imaginary); //real - r->real = first + last; - r->imaginary = outer + inner; -} - -float cpx_magsqr(cpx_t* n) -{ - return (n->real * n->real) + (n->imaginary * n->imaginary); -} - -float cpx_mag(cpx_t* n) -{ - return sqrt((n->real * n->real) + (n->imaginary * n->imaginary)); -} - -void cpx_mtx_set(cpx_mtx_t* m, int row, int col, cpx_t* n) -{ - m->ptr[row * ((m->cols) * 2) + (col * 2)] = n->real; - m->ptr[row * ((m->cols) * 2) + (col * 2) + 1] = n->imaginary; -} - -void cpx_mtx_set2(cpx_mtx_t* m, int row, int col, float real, float imaginary) -{ - m->ptr[row * ((m->cols) * 2) + (col * 2)] = real; - m->ptr[row * ((m->cols) * 2) + (col * 2) + 1] = imaginary; -} - -void cpx_mtx_get(cpx_mtx_t* m, int row, int col, cpx_t* n) -{ - n->real = m->ptr[row * ((m->cols) * 2) + (col * 2)]; - n->imaginary = m->ptr[row * ((m->cols) * 2) + (col * 2) + 1]; -} - -void cpx_mtx_init(cpx_mtx_t* m, int rows, int cols) -{ - int z = rows * (cols * 2) * sizeof(float); - m->ptr = malloc(z); - m->rows = rows; - m->cols = cols; - memset(m->ptr, 0, z); -} - -void cpx_mtx_free(cpx_mtx_t* m) -{ - if (m->ptr != NULL) free(m->ptr); - m->rows = 0; - m->cols = 0; -} - -void cpx_mtx_print(cpx_mtx_t* m) -{ - for (int r = 0; r < m->rows; r++) - { - if (r > 0) printf("\n"); - for (int c = 0; c < m->cols; c++) - { - cpx_t n; - cpx_mtx_get(m, r, c, &n); - uint8_t* s = cpx_str(&n); - if (c > 0) printf(", "); - printf("%s", s); - free(s); - } - } -} - -//This is for testing GPU functions on the CPU -#define __kernel -#define __global -int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2; -int get_global_id(int id) -{ - switch (id) - { - case 0: return GPU_GLOBAL_ID_0; - case 1: return GPU_GLOBAL_ID_1; - case 2: return GPU_GLOBAL_ID_2; - } -} -#include "kernel2.cl" - -void cpx_mtx_dot(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) -{ - for (int i = 0; i < rowsA; i++) - { - for (int j = 0; j < colsB; j++) - { - GPU_GLOBAL_ID_0 = i; - GPU_GLOBAL_ID_1 = j; - kernel_dot(ptrR, ptrA, ptrB, rowsA, colsB, shared); - } - } -} - -void cpx_mtx_knk(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB) -{ - for (int i = 0; i < rowsR; i++) - { - for (int j = 0; j < colsR; j++) - { - GPU_GLOBAL_ID_0 = i; - GPU_GLOBAL_ID_1 = j; - kernel_knk(ptrR, ptrA, ptrB, rowsR, colsR, rowsA, colsA, rowsB, colsB); - } - } -} - -#ifdef GPU_ENABLED -#define CL_USE_DEPRECATED_OPENCL_1_2_APIS -#define CL_TARGET_OPENCL_VERSION 120 -#include -#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: " #x "() failed.\n"); exit(1); } -#include "kernel2.cl.c" -cl_platform_id cpx_mtx_platform_id; -cl_device_id cpx_mtx_device_id; -cl_context cpx_mtx_context; -cl_command_queue cpx_mtx_command_queue; -unsigned char* cpx_mtx_cache = NULL; -size_t cpx_mtx_cache_len = 0; -#endif - -uint8_t cpx_mtx_begin() -{ - #ifdef GPU_ENABLED - cl_uint count; - cl_int err; - - err = clGetPlatformIDs(1, &cpx_mtx_platform_id, &count); - if (err != CL_SUCCESS || count == 0) - { - if (err == 0) - fprintf(stderr, "GPU error: No supported platforms found.\n"); - else - fprintf(stderr, "GPU error: clGetPlatformIDs() failed.\n"); - return 0; - } - - err = clGetDeviceIDs(cpx_mtx_platform_id, CL_DEVICE_TYPE_GPU, 1, &cpx_mtx_device_id, &count); - if (err != CL_SUCCESS || count == 0) - { - if (count == 0) - fprintf(stderr, "GPU error: No supported GPUs found.\n"); - else - fprintf(stderr, "GPU error: clGetDeviceIDs() failed.\n"); - return 0; - } - - cpx_mtx_context = clCreateContext(NULL, 1, &cpx_mtx_device_id, NULL, NULL, &err); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU error: clCreateContext() failed.\n"); - return 0; - } - - cpx_mtx_command_queue = clCreateCommandQueue(cpx_mtx_context, cpx_mtx_device_id, 0, &err); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU error: clCreateCommandQueue() failed.\n"); - err = clReleaseContext(cpx_mtx_context); - if (err != CL_SUCCESS) - fprintf(stderr, "GPU error: clReleaseContext() failed.\n"); - return 0; - } - #endif - return 1; -} - -void cpx_mtx_clean() -{ - #ifdef GPU_ENABLED - cl_int err; - err = clReleaseCommandQueue(cpx_mtx_command_queue); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU error: clReleaseCommandQueue() failed.\n"); - } - err = clReleaseContext(cpx_mtx_context); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU error: clReleaseContext() failed.\n"); - } - free(cpx_mtx_cache); - #endif -} - -#ifdef GPU_ENABLED -void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared) -{ - int colsA = shared; - int rowsB = shared; - int rowsR = rowsA; - int colsR = colsB; - - //Create buffers - size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); - size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); - size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float); - cl_int err; - cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); - gpuerr(clCreateBuffer); - cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); - gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); - gpuerr(clCreateBuffer); - - //Populate buffers - err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - - //Load and compile program - cl_program program; - if (cpx_mtx_cache == NULL) - { - program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel2_cl}, NULL, &err); - gpuerr(clCreateProgramWithSource); - err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); - size_t log_size; - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char* log = malloc(log_size); - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - printf("%s", log); - free(log); - exit(1); - } - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); - gpuerr(clGetProgramInfo); - cpx_mtx_cache = malloc(cpx_mtx_cache_len); - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); - gpuerr(clGetProgramInfo); - } - else - { - program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); - } - - //Setup kernel - cl_kernel kernel = clCreateKernel(program, "kernel_dot", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(int), &colsB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(int), &shared); gpuerr(clSetKernelArg); - - //Run the program - size_t work_size[] = {rowsR, colsR}; - err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); - gpuerr(clEnqueueNDRangeKernel); - - //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); - err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); - - //Read results - err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); - - //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); -} - -void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsR, int colsR, int rowsA, int colsA, int rowsB, int colsB) -{ - //Create buffers - size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); - size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); - size_t sizeB = ((colsB * 2) * rowsB) * sizeof(float); - cl_int err; - cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer); - cl_mem memB = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeB, NULL, &err); gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_WRITE_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer); - - //Populate buffers - err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memB, CL_TRUE, 0, sizeB, ptrB, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - - //Load and compile program - cl_program program; - if (cpx_mtx_cache == NULL) - { - program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){src_kernel2_cl}, NULL, &err); - gpuerr(clCreateProgramWithSource); - err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); - size_t log_size; - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char* log = malloc(log_size); - clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - printf("%s", log); - free(log); - exit(1); - } - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); - gpuerr(clGetProgramInfo); - cpx_mtx_cache = malloc(cpx_mtx_cache_len); - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); - gpuerr(clGetProgramInfo); - } - else - { - program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); - } - - //Setup kernel - cl_kernel kernel = clCreateKernel(program, "kernel_knk", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &rowsR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(int), &colsR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 6, sizeof(int), &colsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 7, sizeof(int), &rowsB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg); - - //Run the program - size_t work_size[] = {rowsR, colsR}; - err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); - gpuerr(clEnqueueNDRangeKernel); - - //Wait for completion - err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); - err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); - - //Read results - err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); - - //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); -} -#endif - -#endif diff --git a/src/gates.c b/src/gates.c index 73bb74d..63bbf90 100644 --- a/src/gates.c +++ b/src/gates.c @@ -1,38 +1,26 @@ float Identity[] = { - 1, 0, 0, 0, - 0, 1, 0, 0, - - 0, 0, 1, 0, - 0, 0, 0, 1 + 1, 0, 0, 0, + 0, 0, 1, 0, }; float PauliX[] = { - 0, 0, 1, 0, - 0, 0, 0, 1, - - 1, 0, 0, 0, - 0, 1, 0, 0 + 0, 0, 1, 0, + 1, 0, 0, 0, }; float PauliY[] = { 0, 0, 0, 1, - 0, 0, -1, 0, - 0,-1, 0, 0, - 1, 0, 0, 0, }; float PauliZ[] = { - 1, 0, 0, 0, - 0, 1, 0, 0, - + 1, 0, 0, 0, 0, 0, -1, 0, - 0, 0, 0,-1 }; // 1/sqrt(2) @@ -40,42 +28,26 @@ float PauliZ[] = float Hadamard[] = { R, 0, R, 0, - 0, R, 0, R, - R, 0, -R, 0, - 0, R, 0,-R, }; float PhaseS[] = { 1, 0, 0, 0, - 0, 1, 0, 0, - 0, 0, 0,-1, - 0, 0, 1, 0 }; // 1/sqrt(2) + 1/sqrt(2)i float PhaseT[] = { 1, 0, 0, 0, - 0, 1, 0, 0, - 0, 0, R,-R, - 0, 0, R, R }; float ControlledNOT[] = { 1, 0, 0, 0, 0, 0, 0, 0, - 0, 1, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 0, 0, 0, 1, - 0, 0, 0, 0, 1, 0, 0, 0, - 0, 0, 0, 0, 0, 1, 0, 0, }; \ No newline at end of file diff --git a/src/gates2.c b/src/gates2.c deleted file mode 100644 index 63bbf90..0000000 --- a/src/gates2.c +++ /dev/null @@ -1,53 +0,0 @@ - -float Identity[] = -{ - 1, 0, 0, 0, - 0, 0, 1, 0, -}; - -float PauliX[] = -{ - 0, 0, 1, 0, - 1, 0, 0, 0, -}; - -float PauliY[] = -{ - 0, 0, 0, 1, - 0,-1, 0, 0, -}; - -float PauliZ[] = -{ - 1, 0, 0, 0, - 0, 0, -1, 0, -}; - -// 1/sqrt(2) -#define R 0.7071067811865475 -float Hadamard[] = -{ - R, 0, R, 0, - R, 0, -R, 0, -}; - -float PhaseS[] = -{ - 1, 0, 0, 0, - 0, 0, 0,-1, -}; - -// 1/sqrt(2) + 1/sqrt(2)i -float PhaseT[] = -{ - 1, 0, 0, 0, - 0, 0, R,-R, -}; - -float ControlledNOT[] = -{ - 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, - 0, 0, 0, 0, 1, 0, 0, 0, -}; \ No newline at end of file diff --git a/src/gpu/gpu.c b/src/gpu/gpu.c deleted file mode 100644 index a9e34ab..0000000 --- a/src/gpu/gpu.c +++ /dev/null @@ -1,230 +0,0 @@ -#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: " #x "() failed.\n"); exit(1); } -#include "kernel.cl.c" -cl_platform_id GPU_platform_id; -cl_device_id GPU_device_id; -cl_context GPU_context; -cl_command_queue GPU_command_queue; -unsigned char* GPU_cache = NULL; -size_t GPU_cache_len = 0; - -uint8_t GPU_init() -{ - cl_uint count; - cl_int err; - - err = clGetPlatformIDs(1, &GPU_platform_id, &count); - if (err != CL_SUCCESS || count == 0) - { - if (err == 0) - fprintf(stderr, "GPU error: No supported platforms found.\n"); - else - fprintf(stderr, "GPU error: clGetPlatformIDs() failed.\n"); - return 0; - } - - err = clGetDeviceIDs(GPU_platform_id, CL_DEVICE_TYPE_GPU, 1, &GPU_device_id, &count); - if (err != CL_SUCCESS || count == 0) - { - if (count == 0) - fprintf(stderr, "GPU error: No supported GPUs found.\n"); - else - fprintf(stderr, "GPU error: clGetDeviceIDs() failed.\n"); - return 0; - } - - GPU_context = clCreateContext(NULL, 1, &GPU_device_id, NULL, NULL, &err); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU error: clCreateContext() failed.\n"); - return 0; - } - - GPU_command_queue = clCreateCommandQueue(GPU_context, GPU_device_id, 0, &err); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU error: clCreateCommandQueue() failed.\n"); - err = clReleaseContext(GPU_context); - if (err != CL_SUCCESS) - fprintf(stderr, "GPU error: clReleaseContext() failed.\n"); - return 0; - } - - return 1; -} - -void GPU_clean() -{ - cl_int err; - err = clReleaseCommandQueue(GPU_command_queue); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU error: clReleaseCommandQueue() failed.\n"); - } - err = clReleaseContext(GPU_context); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU error: clReleaseContext() failed.\n"); - } - free(GPU_cache); -} - -void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsB, int shared) -{ - //Create buffers - size_t sizeA = rowsA * shared; - size_t sizeB = shared * colsB; - size_t sizeR = rowsA * colsB; - cl_int err; - cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeA, NULL, &err); - gpuerr(clCreateBuffer); - cl_mem memB = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeB, NULL, &err); - gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(float) * sizeR, NULL, &err); - gpuerr(clCreateBuffer); - - //Populate buffers - err = clEnqueueWriteBuffer(GPU_command_queue, memA, CL_TRUE, 0, sizeof(float) * sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - err = clEnqueueWriteBuffer(GPU_command_queue, memB, CL_TRUE, 0, sizeof(float) * sizeB, ptrB, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - - //Load and compile program - cl_program program; - if (GPU_cache == NULL) - { - program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_kernel_cl}, NULL, &err); - gpuerr(clCreateProgramWithSource); - err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); - size_t log_size; - clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char* log = malloc(log_size); - clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - printf("%s", log); - free(log); - exit(1); - } - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &GPU_cache_len, NULL); - gpuerr(clGetProgramInfo); - GPU_cache = malloc(GPU_cache_len); - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &GPU_cache, NULL); - gpuerr(clGetProgramInfo); - } - else - { - program = clCreateProgramWithBinary(GPU_context, 1, &GPU_device_id, &GPU_cache_len, (const unsigned char**)&GPU_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); - } - - //Setup kernel - cl_kernel kernel = clCreateKernel(program, "gpu_mmul", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(int), &colsB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(int), &shared); gpuerr(clSetKernelArg); - - //Run the program - size_t work_size[] = {rowsA, colsB}; - err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); - gpuerr(clEnqueueNDRangeKernel); - - //Wait for completion - err = clFlush(GPU_command_queue); gpuerr(clFlush); - err = clFinish(GPU_command_queue); gpuerr(clFinish); - - //Read results - err = clEnqueueReadBuffer(GPU_command_queue, memR, CL_TRUE, 0, sizeof(float) * sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); - - //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); -} - -void GPU_knk(float* ptrR, int rowsR, int colsR, float* ptrA, int rowsA, int colsA, float* ptrB, int rowsB, int colsB) -{ - //Create buffers - size_t sizeA = (rowsA * 2) * (colsA * 2); - size_t sizeB = (rowsB * 2) * (colsB * 2); - size_t sizeR = (rowsR * 2) * (colsR * 2); - cl_int err; - cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeA, NULL, &err); - gpuerr(clCreateBuffer); - cl_mem memB = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeB, NULL, &err); - gpuerr(clCreateBuffer); - cl_mem memR = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(float) * sizeR, NULL, &err); - gpuerr(clCreateBuffer); - - //Populate buffers - err = clEnqueueWriteBuffer(GPU_command_queue, memA, CL_TRUE, 0, sizeof(float) * sizeA, ptrA, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - err = clEnqueueWriteBuffer(GPU_command_queue, memB, CL_TRUE, 0, sizeof(float) * sizeB, ptrB, 0, NULL, NULL); - gpuerr(clEnqueueWriteBuffer); - - //Load and compile program - cl_program program; - if (GPU_cache == NULL) - { - program = clCreateProgramWithSource(GPU_context, 1, (const char*[]){src_gpu_kernel_cl}, NULL, &err); - gpuerr(clCreateProgramWithSource); - err = clBuildProgram(program, 1, &GPU_device_id, NULL, NULL, NULL); - if (err != CL_SUCCESS) - { - fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); - size_t log_size; - clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - char* log = malloc(log_size); - clGetProgramBuildInfo(program, GPU_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); - printf("%s", log); - free(log); - exit(1); - } - err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &GPU_cache_len, NULL); gpuerr(clGetProgramInfo); - GPU_cache = malloc(GPU_cache_len); - clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &GPU_cache, NULL); gpuerr(clGetProgramInfo); - } - else - { - program = clCreateProgramWithBinary(GPU_context, 1, &GPU_device_id, &GPU_cache_len, (const unsigned char**)&GPU_cache, NULL, &err); - gpuerr(clCreateProgramWithBinary); - } - - //Setup kernel - cl_kernel kernel = clCreateKernel(program, "gpu_knk", &err); gpuerr(clCreateKernel); - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&memB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 3, sizeof(int), &rowsR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 4, sizeof(int), &colsR); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 5, sizeof(int), &rowsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 6, sizeof(int), &colsA); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 7, sizeof(int), &rowsB); gpuerr(clSetKernelArg); - err = clSetKernelArg(kernel, 8, sizeof(int), &colsB); gpuerr(clSetKernelArg); - - //Run the program - size_t work_size[] = {rowsR, colsR}; - err = clEnqueueNDRangeKernel(GPU_command_queue, kernel, 2, NULL, work_size, NULL, 0, NULL, NULL); - gpuerr(clEnqueueNDRangeKernel); - - //Wait for completion - err = clFlush(GPU_command_queue); gpuerr(clFlush); - err = clFinish(GPU_command_queue); gpuerr(clFinish); - - //Read results - err = clEnqueueReadBuffer(GPU_command_queue, memR, CL_TRUE, 0, sizeof(float) * sizeR, ptrR, 0, NULL, NULL); - gpuerr(clEnqueueReadBuffer); - - //Clean up - err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); - err = clReleaseProgram(program); gpuerr(clReleaseProgram); - err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memB); gpuerr(clReleaseMemObject); - err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); -} \ No newline at end of file diff --git a/src/gpu/gpu_test.c b/src/gpu/gpu_test.c deleted file mode 100644 index be67be8..0000000 --- a/src/gpu/gpu_test.c +++ /dev/null @@ -1,48 +0,0 @@ -//This is for testing GPU functions on the CPU -#define __kernel -#define __global -int GPU_GLOBAL_ID_0, GPU_GLOBAL_ID_1, GPU_GLOBAL_ID_2; -int get_global_id(int id) -{ - switch (id) - { - case 0: return GPU_GLOBAL_ID_0; - case 1: return GPU_GLOBAL_ID_1; - case 2: return GPU_GLOBAL_ID_2; - } -} -#include "gpu_mmul.cl" -#include "gpu_knk.cl" - -void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) -{ - for (int i = 0; i < rowsA; i++) - { - for (int j = 0; j < colsB; j++) - { - GPU_GLOBAL_ID_0 = i; - GPU_GLOBAL_ID_1 = j; - gpu_mmul(ptrR, ptrA, ptrB, rowsA, colsB, shared); - } - } -} - -void GPU_knk(float* ptrR, int rowsR, int colsR, float* ptrA, int rowsA, int colsA, float* ptrB, int rowsB, int colsB) -{ - for (int i = 0; i < rowsR; i++) - { - for (int j = 0; j < colsR; j++) - { - GPU_GLOBAL_ID_0 = i; - GPU_GLOBAL_ID_1 = j; - gpu_knk(ptrR, rowsR, colsR, ptrA, rowsA, colsA, ptrB, rowsB, colsB); - } - } -} - -uint8_t GPU_init() -{ - return 1; -} - -void GPU_clean() {} diff --git a/src/gpu/kernel.cl b/src/gpu/kernel.cl deleted file mode 100644 index 90a9768..0000000 --- a/src/gpu/kernel.cl +++ /dev/null @@ -1,69 +0,0 @@ -__kernel void gpu_mmul -( - __global float* ptrR, - __global float* ptrA, - __global float* ptrB, - const int rowsA, - const int colsB, - const int shared -) -{ - const int colsA = shared; - const int rowsB = shared; - const int rowsR = rowsA; - const int colsR = colsB; - const int rowR = get_global_id(0); - const int colR = get_global_id(1); - int posA, posB; - float sum = 0; - - const int posR = colR + rowR * colsR; - - for (int i = 0; i < shared; i++) - { - int posA = i + rowR * colsA; - int posB = colR + i * colsB; - sum += ptrA[posA] * ptrB[posB]; - } - ptrR[rowR * colsR + colR] = sum; -} - -__kernel void gpu_knk -( - __global float* ptrR, - __global float* ptrA, - __global float* ptrB, - const int rowsR, - const int colsR, - const int rowsA, - const int colsA, - const int rowsB, - const int colsB -) -{ - int rowR = get_global_id(0); - int colR = get_global_id(1); - - int rowA = rowR / rowsB; - int colA = colR / colsB; - int rowB = rowR % rowsB; - int colB = colR % colsB; - - float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; - float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; - float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; - float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; - - float first = r1 * r2; //real - float outer = r1 * i2; //imaginary - float inner = i1 * r2; //imaginary - float last = -(i1 * i2); //real - r1 = first + last; - i1 = outer + inner; - - ptrR[(colR * 2) + (rowR * 2) * (colsR * 2)] = r1; - ptrR[((colR * 2) + 1) + (rowR * 2) * (colsR * 2)] = -i1; - ptrR[(colR * 2) + ((rowR * 2) + 1) * (colsR * 2)] = i1; - ptrR[((colR * 2) + 1) + ((rowR * 2) + 1) * (colsR * 2)] = r1; -} - diff --git a/src/kernel.cl b/src/kernel.cl new file mode 100644 index 0000000..c19c7a5 --- /dev/null +++ b/src/kernel.cl @@ -0,0 +1,83 @@ +__kernel void kernel_dot +( + __global float* ptrR, + __global float* ptrA, + __global float* ptrB, + const int rowsA, + const int colsB, + const int shared +) +{ + const int colsA = shared; + const int rowsB = shared; + const int rowsR = rowsA; + const int colsR = colsB; + const int rowR = get_global_id(0); + const int colR = get_global_id(1); + int posA, posB; + float rR = 0; + float iR = 0; + + const int posR = rowR * (colsR * 2) + (colR * 2); + + for (int i = 0; i < shared; i++) + { + int posA = rowR * (colsA * 2) + (i * 2); + int posB = i * (colsB * 2) + (colR * 2); + + float rA = ptrA[posA]; + float iA = ptrA[posA + 1]; + float rB = ptrB[posB]; + float iB = ptrB[posB + 1]; + + //(rA + iA)(rB + iB) + float first = rA * rB; + float outer = rA * iB; + float inner = iA * rB; + float lasts = iA * iB; + + rR += first + lasts; + iR += outer + inner; + } + ptrR[rowR * (colsR * 2) + (colR * 2)] = rR; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR; +} + +__kernel void kernel_knk +( + __global float* ptrR, + __global float* ptrA, + __global float* ptrB, + const int rowsR, + const int colsR, + const int rowsA, + const int colsA, + const int rowsB, + const int colsB +) +{ + int rowR = get_global_id(0); + int colR = get_global_id(1); + + int rowA = rowR / rowsB; + int colA = colR / colsB; + int rowB = rowR % rowsB; + int colB = colR % colsB; + + int posA = rowA * (colsA * 2) + (colA * 2); + int posB = rowB * (colsB * 2) + (colB * 2); + + float rA = ptrA[posA]; + float iA = ptrA[posA + 1]; + float rB = ptrB[posB]; + float iB = ptrB[posB + 1]; + + //(rA + iA)(rB + iB) + float first = rA * rB; + float outer = rA * iB; + float inner = iA * rB; + float lasts = iA * iB; + ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; + ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; +} + diff --git a/src/kernel2.cl b/src/kernel2.cl deleted file mode 100644 index c19c7a5..0000000 --- a/src/kernel2.cl +++ /dev/null @@ -1,83 +0,0 @@ -__kernel void kernel_dot -( - __global float* ptrR, - __global float* ptrA, - __global float* ptrB, - const int rowsA, - const int colsB, - const int shared -) -{ - const int colsA = shared; - const int rowsB = shared; - const int rowsR = rowsA; - const int colsR = colsB; - const int rowR = get_global_id(0); - const int colR = get_global_id(1); - int posA, posB; - float rR = 0; - float iR = 0; - - const int posR = rowR * (colsR * 2) + (colR * 2); - - for (int i = 0; i < shared; i++) - { - int posA = rowR * (colsA * 2) + (i * 2); - int posB = i * (colsB * 2) + (colR * 2); - - float rA = ptrA[posA]; - float iA = ptrA[posA + 1]; - float rB = ptrB[posB]; - float iB = ptrB[posB + 1]; - - //(rA + iA)(rB + iB) - float first = rA * rB; - float outer = rA * iB; - float inner = iA * rB; - float lasts = iA * iB; - - rR += first + lasts; - iR += outer + inner; - } - ptrR[rowR * (colsR * 2) + (colR * 2)] = rR; - ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = iR; -} - -__kernel void kernel_knk -( - __global float* ptrR, - __global float* ptrA, - __global float* ptrB, - const int rowsR, - const int colsR, - const int rowsA, - const int colsA, - const int rowsB, - const int colsB -) -{ - int rowR = get_global_id(0); - int colR = get_global_id(1); - - int rowA = rowR / rowsB; - int colA = colR / colsB; - int rowB = rowR % rowsB; - int colB = colR % colsB; - - int posA = rowA * (colsA * 2) + (colA * 2); - int posB = rowB * (colsB * 2) + (colB * 2); - - float rA = ptrA[posA]; - float iA = ptrA[posA + 1]; - float rB = ptrB[posB]; - float iB = ptrB[posB + 1]; - - //(rA + iA)(rB + iB) - float first = rA * rB; - float outer = rA * iB; - float inner = iA * rB; - float lasts = iA * iB; - ptrR[rowR * (colsR * 2) + (colR * 2)] = first + lasts; - ptrR[rowR * (colsR * 2) + (colR * 2) + 1] = outer + inner; -} - diff --git a/src/kernel2.cl.c b/src/kernel2.cl.c deleted file mode 100644 index 239eaa3..0000000 --- a/src/kernel2.cl.c +++ /dev/null @@ -1,167 +0,0 @@ -unsigned char src_kernel2_cl[] = { - 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x6b, 0x65, 0x72, 0x6e, 0x65, 0x6c, 0x5f, 0x64, 0x6f, 0x74, - 0x0a, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, - 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, - 0x74, 0x72, 0x52, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, - 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, - 0x20, 0x70, 0x74, 0x72, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, - 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, - 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, - 0x6f, 0x77, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, - 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x6e, 0x74, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x0a, - 0x29, 0x0a, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, - 0x3d, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, - 0x72, 0x6f, 0x77, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x73, 0x68, 0x61, 0x72, - 0x65, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52, 0x20, - 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, - 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, - 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, - 0x28, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, - 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, - 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x2c, 0x20, 0x70, 0x6f, - 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, - 0x74, 0x20, 0x72, 0x52, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x52, 0x20, 0x3d, - 0x20, 0x30, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x52, 0x20, - 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, - 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, - 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, 0x6e, 0x74, - 0x20, 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x20, 0x69, 0x20, 0x3c, 0x20, - 0x73, 0x68, 0x61, 0x72, 0x65, 0x64, 0x3b, 0x20, 0x69, 0x2b, 0x2b, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x20, - 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, - 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, - 0x69, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x42, - 0x20, 0x3d, 0x20, 0x69, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, - 0x42, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, - 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, - 0x72, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, - 0x73, 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20, - 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x2b, 0x20, - 0x31, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70, - 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, - 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x42, 0x5b, 0x70, - 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, - 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, - 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, - 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, - 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, - 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e, 0x65, - 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, - 0x61, 0x74, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, - 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x52, 0x20, 0x2b, 0x3d, 0x20, 0x66, - 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x52, - 0x20, 0x2b, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, - 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, - 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, - 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, - 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x72, 0x52, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x72, 0x6f, - 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, - 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x52, - 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, - 0x20, 0x69, 0x52, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x5f, 0x5f, 0x6b, 0x65, - 0x72, 0x6e, 0x65, 0x6c, 0x20, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x6b, 0x65, - 0x72, 0x6e, 0x65, 0x6c, 0x5f, 0x6b, 0x6e, 0x6b, 0x0a, 0x28, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, - 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, 0x52, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, - 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, 0x74, 0x72, - 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, - 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, - 0x74, 0x72, 0x42, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x52, - 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x2c, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, - 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, - 0x6c, 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, - 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x0a, 0x29, 0x0a, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, - 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, - 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x30, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, - 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, - 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x41, 0x20, 0x3d, 0x20, - 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2f, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, - 0x6c, 0x41, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2f, 0x20, - 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, - 0x77, 0x52, 0x20, 0x25, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x42, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x42, - 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x25, 0x20, 0x63, 0x6f, - 0x6c, 0x73, 0x42, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, - 0x74, 0x20, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, - 0x41, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, - 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x41, 0x20, - 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6e, - 0x74, 0x20, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, - 0x42, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x42, 0x20, 0x2a, - 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x42, 0x20, - 0x2a, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, - 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x5d, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d, - 0x20, 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x2b, - 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, - 0x61, 0x74, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, 0x42, - 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x70, - 0x74, 0x72, 0x42, 0x5b, 0x70, 0x6f, 0x73, 0x42, 0x20, 0x2b, 0x20, 0x31, - 0x5d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72, - 0x41, 0x20, 0x2b, 0x20, 0x69, 0x41, 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b, - 0x20, 0x69, 0x42, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, - 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x3d, 0x20, 0x72, - 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, - 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e, - 0x65, 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, - 0x6c, 0x61, 0x73, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, - 0x20, 0x69, 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, - 0x52, 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, - 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, - 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, - 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, - 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, - 0x5b, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, - 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x63, - 0x6f, 0x6c, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, - 0x5d, 0x20, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, - 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x7d, 0x00 -}; -unsigned int src_kernel2_cl_len = 1965;