From: miha-q <> Date: Sun, 3 Mar 2024 03:16:09 +0000 (-0500) Subject: Sat Mar 2 10:16:09 PM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=211c3710f0d57fc8ad703202ea451aedd5c098bc;p=QAnsel.git Sat Mar 2 10:16:09 PM EST 2024 --- diff --git a/src/QAnsel.c b/src/QAnsel.c index 1d084c9..d0748da 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -23,20 +23,20 @@ typedef struct { char n[128]; uint8_t q0, q1, q2; - double arg0, arg1, arg2; + float arg0, arg1, arg2; } QInstr; -double qansel_rand_s(float s) +float qansel_rand_s(float s) { uint32_t tmp; memcpy(&tmp, &s, sizeof(uint32_t)); srand(tmp); } -double qansel_rand_h() +float qansel_rand_h() { - return ((double)rand()) / ((double)RAND_MAX); + return ((float)rand()) / ((float)RAND_MAX); } -double qansel_rand_t() +float qansel_rand_t() { if (RANDOM_FILE) { @@ -45,7 +45,7 @@ double qansel_rand_t() { num = (num << 8) | fgetc(RANDOM_FILE); } - return ((double)num) / ((double)UINT32_MAX); + return ((float)num) / ((float)UINT32_MAX); } else { @@ -55,7 +55,7 @@ double qansel_rand_t() } -double qansel_rand() +float qansel_rand() { return HIDDEN_VARIABLE ? qansel_rand_h() : qansel_rand_t(); } @@ -149,7 +149,7 @@ void qansel_toffoli(cpx_mtx_t* stateVector, uint8_t qubitCount, uint8_t bitA, ui stateVector->cols = ret.cols; } -double* qansel_unitary(double theta, double phi, double lambda) +float* qansel_unitary(float theta, float phi, float lambda) { cpx_mtx_t m; cpx_t a, b, c, d; @@ -175,7 +175,7 @@ void qansel_instruction(cpx_mtx_t* stateVector, uint8_t qubitCount, QInstr* inst cpx_mtx_t gate; gate.rows = 2; gate.cols = 2; - double* gate_ptr; + float* gate_ptr; switch (instr->n[0]) { case 'h': gate_ptr = Hadamard; break; @@ -196,11 +196,11 @@ void qansel_instruction(cpx_mtx_t* stateVector, uint8_t qubitCount, QInstr* inst uint8_t qubit = qubitCount - (instr->q0) - 1; if (qubit == 0) { - memcpy(filter.ptr, gate_ptr, 16 * sizeof(double)); + memcpy(filter.ptr, gate_ptr, 16 * sizeof(float)); } else { - memcpy(filter.ptr, Identity, 16 * sizeof(double)); + memcpy(filter.ptr, Identity, 16 * sizeof(float)); } for (uint8_t i = 1; i < qubitCount; i++) @@ -216,7 +216,7 @@ void qansel_instruction(cpx_mtx_t* stateVector, uint8_t qubitCount, QInstr* inst tmp.rows = filter.rows * gate.rows; tmp.cols = filter.cols * gate.cols; - tmp.ptr = malloc((tmp.rows * 2) * (tmp.cols * 2) * sizeof(double)); + tmp.ptr = malloc((tmp.rows * 2) * (tmp.cols * 2) * sizeof(float)); cpx_ncpx_knk_mt ( tmp.ptr, tmp.rows, tmp.cols, @@ -265,7 +265,7 @@ uint8_t qansel_measure(cpx_mtx_t* stateVector, uint8_t qubitCount, uint8_t qubit { uint32_t qubitCountPow2 = (uint32_t)pow(2, qubitCount); cpx_t n; - double prob0 = 0; + float prob0 = 0; for (uint32_t i = 0; i < qubitCountPow2; i++) { uint8_t bit = (i >> qubit) & 1; @@ -273,9 +273,9 @@ uint8_t qansel_measure(cpx_mtx_t* stateVector, uint8_t qubitCount, uint8_t qubit if (bit == 0) prob0 += cpx_magsqr(&n); } - double r = qansel_rand(); + float r = qansel_rand(); uint8_t newBit = r < prob0 ? 0 : 1; - double probTot = 0; + float probTot = 0; for (uint32_t i = 0; i < qubitCountPow2; i++) { uint8_t bit = (i >> qubit) & 1; @@ -292,7 +292,7 @@ uint8_t qansel_measure(cpx_mtx_t* stateVector, uint8_t qubitCount, uint8_t qubit cpx_mtx_set(stateVector, 0, i, &n); } - double multiplier = sqrt(1 / probTot); + float multiplier = sqrt(1 / probTot); for (uint32_t i = 0; i < qubitCountPow2; i++) { uint8_t bit = (i >> qubit) & 1; @@ -400,7 +400,7 @@ void qansel_run(uint8_t qubitCount, uint8_t bitCount, QInstr* instr, uint8_t* re cpx_mtx_set(&tmp, 0, 0, &a); } } - double multiplier = 0; + float multiplier = 0; cpx_t n; cpx_mtx_get(&tmp, 0, 0, &n); multiplier += cpx_magsqr(&n); @@ -462,7 +462,7 @@ void qansel_run(uint8_t qubitCount, uint8_t bitCount, QInstr* instr, uint8_t* re } else if (strcmp(instr[i].n, "born") == 0) { - double prob = 0; + float prob = 0; for (uint32_t j = 0; j < qubitCountPow2; j++) { cpx_t n; @@ -538,7 +538,7 @@ void process(int argc, char** argv) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); - double seed = (double)((uint64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec); + float seed = (float)((uint64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec); qansel_rand_s(seed); char** lines = malloc(0); uint32_t* lineIDs = malloc(0); @@ -709,7 +709,7 @@ void process(int argc, char** argv) || memcmp("rz(", lines[i], 3) == 0 ) { - double angle; + float angle; char ty; if (sscanf(lines[i], "r%c(%f/%f) q[%i]%n", &ty, &a0, &a1, &q0, &matchedCount) == 4) { @@ -1349,13 +1349,13 @@ void process(int argc, char** argv) } if (fullSample == 255) { - printf(": %.1f%%\n", ((double)stats[i] / (double)shots) * (double)100); + printf(": %.1f%%\n", ((float)stats[i] / (float)shots) * (float)100); } } if (fullSample != 255) { - double prob = ((double)count / (double)shots) * (double)100; - printf("0: %.1f%%\n", ((double)100)-prob); + float prob = ((float)count / (float)shots) * (float)100; + printf("0: %.1f%%\n", ((float)100)-prob); printf("1: %.1f%%\n", prob); } } diff --git a/src/complex.c b/src/complex.c index c5346b7..54548e3 100644 --- a/src/complex.c +++ b/src/complex.c @@ -9,12 +9,12 @@ typedef struct { - double real, imaginary; + float real, imaginary; } cpx_t; typedef struct { - double *ptr; + float *ptr; size_t rows, cols; } cpx_mtx_t; @@ -23,8 +23,8 @@ uint8_t* cpx_str(cpx_t* n) uint8_t* r; size_t z; - double rl = n->real; - double ig = n->imaginary >= 0 ? n->imaginary : -(n->imaginary); + float rl = n->real; + float ig = n->imaginary >= 0 ? n->imaginary : -(n->imaginary); if (ig == 0) { z = snprintf(NULL, 0, "%f", rl); @@ -42,7 +42,7 @@ uint8_t* cpx_str(cpx_t* n) return r; } -cpx_t cpx_new(double r, double i) +cpx_t cpx_new(float r, float i) { cpx_t n; n.real = r; @@ -65,17 +65,17 @@ void cpx_sub(cpx_t* r, cpx_t* a, cpx_t* b) void cpx_mul(cpx_t* r, cpx_t* a, cpx_t* b) { //FOIL - double first = a->real * b->real; //real - double outer = a->real * b->imaginary; //imaginary - double inner = a->imaginary * b->real; //imaginary - double last = -(a->imaginary * b->imaginary); //real + float first = a->real * b->real; //real + float outer = a->real * b->imaginary; //imaginary + float inner = a->imaginary * b->real; //imaginary + float last = -(a->imaginary * b->imaginary); //real r->real = first + last; r->imaginary = outer + inner; } //non-complex matrix multiply // shared = colsA = rowsB -void cpx_ncpx_mmul(double* ptrR, double* ptrA, double* ptrB, size_t rowsA, size_t colsB, size_t shared) +void cpx_ncpx_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) { size_t colsA = shared; size_t rowsB = shared; @@ -107,9 +107,9 @@ typedef struct size_t Loops; size_t Continue; size_t BlockSize; - double* ptrR; - double* ptrA; - double* ptrB; + float* ptrR; + float* ptrA; + float* ptrB; size_t rowsA; size_t colsB; size_t shared; @@ -118,9 +118,9 @@ typedef struct void* cpx_ncpx_mmul_mtc(void *context) { cpx_mul_shared* data = (cpx_mul_shared*)context; - double* ptrR = data->ptrR; - double* ptrA = data->ptrA; - double* ptrB = data->ptrB; + float* ptrR = data->ptrR; + float* ptrA = data->ptrA; + float* ptrB = data->ptrB; size_t rowsA = data->rowsA; size_t colsB = data->colsB; size_t shared = data->shared; @@ -154,7 +154,7 @@ void* cpx_ncpx_mmul_mtc(void *context) } } -void cpx_ncpx_mmul_mt(double* ptrR, double* ptrA, double* ptrB, size_t rowsA, size_t colsB, size_t shared) +void cpx_ncpx_mmul_mt(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) { cpx_mul_shared share; share.Threads = get_core_count(); @@ -168,10 +168,10 @@ void cpx_ncpx_mmul_mt(double* ptrR, double* ptrA, double* ptrB, size_t rowsA, si { share.Threads = colsB; } - share.BlockSize = (size_t)floor(((double)colsB) / ((double)share.Threads)); - share.Loops = (size_t)floor(((double)colsB) / ((double)share.BlockSize)); + share.BlockSize = (size_t)floor(((float)colsB) / ((float)share.Threads)); + share.Loops = (size_t)floor(((float)colsB) / ((float)share.BlockSize)); share.Last = share.Loops - 1; - share.Continue = (size_t)(((double)colsB) - ((double)share.Loops) * ((double)share.BlockSize)); + share.Continue = (size_t)(((float)colsB) - ((float)share.Loops) * ((float)share.BlockSize)); pthread_t threads[share.Loops]; cpx_mul_shared contexts[share.Loops]; @@ -204,7 +204,7 @@ void cpx_ncpx_mmul_mt(double* ptrR, double* ptrA, double* ptrB, size_t rowsA, si } //non-complex kronecker product -void cpx_ncpx_mknk(double* ptrR, double* ptrA, double* ptrB, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) +void cpx_ncpx_mknk(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsA, size_t rowsB, size_t colsB) { size_t rowsR = rowsA * rowsB; size_t colsR = colsA * colsB; @@ -225,12 +225,12 @@ void cpx_ncpx_mknk(double* ptrR, double* ptrA, double* ptrB, size_t rowsA, size_ } } -double cpx_magsqr(cpx_t* n) +float cpx_magsqr(cpx_t* n) { return (n->real * n->real) + (n->imaginary * n->imaginary); } -double cpx_mag(cpx_t* n) +float cpx_mag(cpx_t* n) { return sqrt((n->real * n->real) + (n->imaginary * n->imaginary)); } @@ -253,7 +253,7 @@ void cpx_mtx_set(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n) m->ptr[(col + 1) + (row + 1) * cols] = n->real; } -void cpx_mtx_set2(cpx_mtx_t* m, size_t row, size_t col, double real, double imaginary) +void cpx_mtx_set2(cpx_mtx_t* m, size_t row, size_t col, float real, float imaginary) { row *= 2; col *= 2; @@ -277,7 +277,7 @@ void cpx_mtx_get(cpx_mtx_t* m, size_t row, size_t col, cpx_t* n) //printf("ggg\n"); } -double cpx_mtx_get_real(cpx_mtx_t* m, size_t row, size_t col) +float cpx_mtx_get_real(cpx_mtx_t* m, size_t row, size_t col) { row *= 2; col *= 2; @@ -285,7 +285,7 @@ double cpx_mtx_get_real(cpx_mtx_t* m, size_t row, size_t col) return m->ptr[(col + 1) + (row + 1) * cols]; } -double cpx_mtx_get_imaginary(cpx_mtx_t* m, size_t row, size_t col) +float cpx_mtx_get_imaginary(cpx_mtx_t* m, size_t row, size_t col) { row *= 2; col *= 2; @@ -298,7 +298,7 @@ void cpx_mtx_init(cpx_mtx_t* m, size_t rows, size_t cols) m->rows = rows; m->cols = cols; size_t z = (rows * 2) * (cols * 2); - m->ptr = malloc(z * sizeof(double)); + m->ptr = malloc(z * sizeof(float)); for (size_t i = 0; i < z; i++) m->ptr[i] = 0; } @@ -306,7 +306,7 @@ void cpx_mtx_expand_row(cpx_mtx_t* m) { m->rows += 1; size_t z = (m->rows * 2) * (m->cols * 2); - m->ptr = realloc(m->ptr, z * sizeof(double)); + m->ptr = realloc(m->ptr, z * sizeof(float)); } @@ -325,13 +325,13 @@ typedef struct size_t Loops; size_t Continue; size_t BlockSize; - double* ptrR; + float* ptrR; size_t rowsR; size_t colsR; - double* ptrA; + float* ptrA; size_t rowsA; size_t colsA; - double* ptrB; + float* ptrB; size_t rowsB; size_t colsB; } cpx_knk_shared; @@ -339,13 +339,13 @@ typedef struct void* cpx_ncpx_knk_mtc(void *context) { cpx_knk_shared* data = (cpx_knk_shared*)context; - double* ptrR = data->ptrR; + float* ptrR = data->ptrR; size_t rowsR = data->rowsR; size_t colsR = data->colsR; - double* ptrA = data->ptrA; + float* ptrA = data->ptrA; size_t rowsA = data->rowsA; size_t colsA = data->colsA; - double* ptrB = data->ptrB; + float* ptrB = data->ptrB; size_t rowsB = data->rowsB; size_t colsB = data->colsB; @@ -361,15 +361,15 @@ void* cpx_ncpx_knk_mtc(void *context) size_t rowB = rowR % rowsB; size_t colB = colR % colsB; - double r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; - double i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; - double r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; - double i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; + float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; + float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; + float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; + float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; - double first = r1 * r2; //real - double outer = r1 * i2; //imaginary - double inner = i1 * r2; //imaginary - double last = -(i1 * i2); //real + float first = r1 * r2; //real + float outer = r1 * i2; //imaginary + float inner = i1 * r2; //imaginary + float last = -(i1 * i2); //real r1 = first + last; i1 = outer + inner; @@ -384,13 +384,13 @@ void* cpx_ncpx_knk_mtc(void *context) void cpx_ncpx_knk_mt ( - double* ptrR, + float* ptrR, size_t rowsR, size_t colsR, - double* ptrA, + float* ptrA, size_t rowsA, size_t colsA, - double* ptrB, + float* ptrB, size_t rowsB, size_t colsB ) @@ -411,10 +411,10 @@ void cpx_ncpx_knk_mt { share.Threads = colsR; } - share.BlockSize = (size_t)floor(((double)colsR) / ((double)share.Threads)); - share.Loops = (size_t)floor(((double)colsR) / ((double)share.BlockSize)); + share.BlockSize = (size_t)floor(((float)colsR) / ((float)share.Threads)); + share.Loops = (size_t)floor(((float)colsR) / ((float)share.BlockSize)); share.Last = share.Loops - 1; - share.Continue = (size_t)(((double)colsR) - ((double)share.Loops) * ((double)share.BlockSize)); + share.Continue = (size_t)(((float)colsR) - ((float)share.Loops) * ((float)share.BlockSize)); pthread_t threads[share.Loops]; cpx_knk_shared contexts[share.Loops]; @@ -448,13 +448,13 @@ void cpx_ncpx_knk_mt void cpx_ncpx_knk ( - double* ptrR, + float* ptrR, size_t rowsR, size_t colsR, - double* ptrA, + float* ptrA, size_t rowsA, size_t colsA, - double* ptrB, + float* ptrB, size_t rowsB, size_t colsB ) @@ -468,15 +468,15 @@ void cpx_ncpx_knk size_t rowB = rowR % rowsB; size_t colB = colR % colsB; - double r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; - double i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; - double r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; - double i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; + float r1 = ptrA[((colA * 2) + 1) + ((rowA * 2) + 1) * (colsA * 2)]; + float i1 = ptrA[(colA * 2) + ((rowA * 2) + 1) * (colsA * 2)]; + float r2 = ptrB[((colB * 2) + 1) + ((rowB * 2) + 1) * (colsB * 2)]; + float i2 = ptrB[(colB * 2) + ((rowB * 2) + 1) * (colsB * 2)]; - double first = r1 * r2; //real - double outer = r1 * i2; //imaginary - double inner = i1 * r2; //imaginary - double last = -(i1 * i2); //real + float first = r1 * r2; //real + float outer = r1 * i2; //imaginary + float inner = i1 * r2; //imaginary + float last = -(i1 * i2); //real r1 = first + last; i1 = outer + inner; diff --git a/src/gates.c b/src/gates.c index bca311d..73bb74d 100644 --- a/src/gates.c +++ b/src/gates.c @@ -1,5 +1,5 @@ -double Identity[] = +float Identity[] = { 1, 0, 0, 0, 0, 1, 0, 0, @@ -8,7 +8,7 @@ double Identity[] = 0, 0, 0, 1 }; -double PauliX[] = +float PauliX[] = { 0, 0, 1, 0, 0, 0, 0, 1, @@ -17,7 +17,7 @@ double PauliX[] = 0, 1, 0, 0 }; -double PauliY[] = +float PauliY[] = { 0, 0, 0, 1, 0, 0, -1, 0, @@ -26,7 +26,7 @@ double PauliY[] = 1, 0, 0, 0, }; -double PauliZ[] = +float PauliZ[] = { 1, 0, 0, 0, 0, 1, 0, 0, @@ -37,7 +37,7 @@ double PauliZ[] = // 1/sqrt(2) #define R 0.7071067811865475 -double Hadamard[] = +float Hadamard[] = { R, 0, R, 0, 0, R, 0, R, @@ -46,7 +46,7 @@ double Hadamard[] = 0, R, 0,-R, }; -double PhaseS[] = +float PhaseS[] = { 1, 0, 0, 0, 0, 1, 0, 0, @@ -56,7 +56,7 @@ double PhaseS[] = }; // 1/sqrt(2) + 1/sqrt(2)i -double PhaseT[] = +float PhaseT[] = { 1, 0, 0, 0, 0, 1, 0, 0, @@ -65,7 +65,7 @@ double PhaseT[] = 0, 0, R, R }; -double ControlledNOT[] = +float ControlledNOT[] = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, diff --git a/src/gpu.c b/src/gpu.c index 4d31573..ab46aae 100644 --- a/src/gpu.c +++ b/src/gpu.c @@ -64,39 +64,39 @@ void GPU_clean() } } -void GPU_mmul(double* ptrR, double* ptrA, double* ptrB, size_t rowsA, size_t colsB, size_t shared) +void GPU_mmul(float* ptrR, float* ptrA, float* ptrB, size_t rowsA, size_t colsB, size_t shared) { //Create buffers size_t sizeA = rowsA * shared; size_t sizeB = shared * colsB; size_t sizeR = rowsA * colsB; cl_int err; - cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(double) * sizeA, NULL, &err); + cl_mem memA = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeA, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n"); exit(1); } - cl_mem memB = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(double) * sizeB, NULL, &err); + cl_mem memB = clCreateBuffer(GPU_context, CL_MEM_READ_ONLY, sizeof(float) * sizeB, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n"); exit(1); } - cl_mem memR = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(double) * sizeR, NULL, &err); + cl_mem memR = clCreateBuffer(GPU_context, CL_MEM_WRITE_ONLY, sizeof(float) * sizeR, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: clCreateBuffer() failed.\n"); exit(1); } //Populate buffers - err = clEnqueueWriteBuffer(GPU_command_queue, memA, CL_TRUE, 0, sizeof(double) * sizeA, ptrA, 0, NULL, NULL); + err = clEnqueueWriteBuffer(GPU_command_queue, memA, CL_TRUE, 0, sizeof(float) * sizeA, ptrA, 0, NULL, NULL); if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n"); exit(1); } - err = clEnqueueWriteBuffer(GPU_command_queue, memB, CL_TRUE, 0, sizeof(double) * sizeB, ptrB, 0, NULL, NULL); + err = clEnqueueWriteBuffer(GPU_command_queue, memB, CL_TRUE, 0, sizeof(float) * sizeB, ptrB, 0, NULL, NULL); if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n"); @@ -184,7 +184,7 @@ void GPU_mmul(double* ptrR, double* ptrA, double* ptrB, size_t rowsA, size_t col exit(1); } //Read results - err = clEnqueueReadBuffer(GPU_command_queue, memR, CL_TRUE, 0, sizeof(double) * sizeR, ptrR, 0, NULL, NULL); + err = clEnqueueReadBuffer(GPU_command_queue, memR, CL_TRUE, 0, sizeof(float) * sizeR, ptrR, 0, NULL, NULL); if (err != CL_SUCCESS) { fprintf(stderr, "GPU fatal error: clEnqueueWriteBuffer() failed.\n"); diff --git a/src/gpu_mmul.cl b/src/gpu_mmul.cl index 508a391..5f649a2 100644 --- a/src/gpu_mmul.cl +++ b/src/gpu_mmul.cl @@ -1,9 +1,8 @@ -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -__kernel void gpu_mmul(__global double* ptrR, __global double* ptrA, __global double* ptrB, const int N, const int W) +__kernel void gpu_mmul(__global float* ptrR, __global float* ptrA, __global float* ptrB, const int N, const int W) { int row = get_global_id(0); int col = get_global_id(1); - double sum = 0; + float sum = 0; for (int i = 0; i < N; i++) { sum += ptrA[row * W + i] * ptrB[i * W + col];