From: miha-q <> Date: Thu, 7 Mar 2024 22:08:34 +0000 (-0500) Subject: Thu Mar 7 05:08:34 PM EST 2024 X-Git-Url: http://www.foleosoft.com/?a=commitdiff_plain;h=e32637b7f026ccaab356f18c466ec18034233f5a;p=QAnsel.git Thu Mar 7 05:08:34 PM EST 2024 --- diff --git a/src/.kernel.tmp.1 b/src/.kernel.tmp.1 index 000b73e..e96fefc 100644 --- a/src/.kernel.tmp.1 +++ b/src/.kernel.tmp.1 @@ -270,3 +270,69 @@ __kernel void kernel_knk_2x2 } } } + +__kernel void kernel_knk_2x2x2 +( + __global float* ptrR, + __global float* ptrA, + const int rowsA, + const int colsA, + const float gate0, + const float gate1, + const float gate2, + const float gate3, + const float gate4, + const float gate5, + const float gate6, + const float gate7 +) +{ + const int rowsR = rowsA * 2; + const int colsR = colsA * 2; + const int rowR = get_global_id(0) * 2; //{gpu_only} + const int colR = get_global_id(1) * 2; //{gpu_only} + + const int rowA = rowR / 2; + const int colA = colR / 2; + const int posA = rowA * (colsA * 2) + (colA * 2); + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + + for (int i = 0; i < 4; i++) + { + float rB, iB; + switch (i) + { + case 0: rB = gate0; iB = gate1; break; + case 1: rB = gate2; iB = gate3; break; + case 2: rB = gate4; iB = gate5; break; + case 3: rB = gate6; iB = gate7; break; + } + + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + switch (i) + { + case 0: + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + break; + case 1: + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + break; + case 2: + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + break; + case 3: + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + break; + } + + } +} diff --git a/src/.kernel.tmp.2 b/src/.kernel.tmp.2 index 71d43a2..7d9ee52 100644 --- a/src/.kernel.tmp.2 +++ b/src/.kernel.tmp.2 @@ -269,4 +269,70 @@ __kernel void kernel_knk_2x2 } } +} + +__kernel void kernel_knk_2x2x2 +( + __global float* ptrR, + __global float* ptrA, + const int rowsA, + const int colsA, + const float gate0, + const float gate1, + const float gate2, + const float gate3, + const float gate4, + const float gate5, + const float gate6, + const float gate7 +) +{ + const int rowsR = rowsA * 2; + const int colsR = colsA * 2; + const int rowR = get_global_id(0) * 2; //{gpu_only} + const int colR = get_global_id(1) * 2; //{gpu_only} + + const int rowA = rowR / 2; + const int colA = colR / 2; + const int posA = rowA * (colsA * 2) + (colA * 2); + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + + for (int i = 0; i < 4; i++) + { + float rB, iB; + switch (i) + { + case 0: rB = gate0; iB = gate1; break; + case 1: rB = gate2; iB = gate3; break; + case 2: rB = gate4; iB = gate5; break; + case 3: rB = gate6; iB = gate7; break; + } + + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + switch (i) + { + case 0: + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + break; + case 1: + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + break; + case 2: + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + break; + case 3: + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + break; + } + + } } \ No newline at end of file diff --git a/src/QAnsel.c b/src/QAnsel.c index b2ad026..3c55cb8 100644 --- a/src/QAnsel.c +++ b/src/QAnsel.c @@ -222,7 +222,11 @@ void qansel_instruction(cpx_mtx_t* stateVector, unsigned char qubitCount, QInstr us1 = get_time(); cpx_mtx_knk_metal_2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); us2 = get_time(); - printf("\tMetal2: %lu\n", us2 - us1); + printf("\tMetal2x2: %lu\n", us2 - us1); + us1 = get_time(); + cpx_mtx_knk_metal_2x2x2(tmp.ptr, filter.ptr, gate.ptr, filter.rows, filter.cols, gate.rows, gate.cols); + us2 = get_time(); + printf("\tMetal2x2x2: %lu\n", us2 - us1); us1 = get_time(); for (int i = 0; i < filter.rows; i++) diff --git a/src/complex.c b/src/complex.c index ae87f09..9079092 100644 --- a/src/complex.c +++ b/src/complex.c @@ -651,6 +651,111 @@ void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); } + +//This only works if ptrA is NxM where both N and X are divisible by two, +// and ptrB is 2x2. If both are true, this is much more efficient than +// the standard knk_metal() function. +void cpx_mtx_knk_metal_2x2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB) +{ + int rowsR = rowsA * rowsB; + int colsR = colsA * colsB; + //Create buffers + size_t sizeR = ((colsR * 2) * rowsR) * sizeof(float); + size_t sizeA = ((colsA * 2) * rowsA) * sizeof(float); + cl_int err; + cl_mem memA = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeA, NULL, &err); gpuerr(clCreateBuffer); + cl_mem memR = clCreateBuffer(cpx_mtx_context, CL_MEM_READ_ONLY, sizeR, NULL, &err); gpuerr(clCreateBuffer); + + //Populate buffers + err = clEnqueueWriteBuffer(cpx_mtx_command_queue, memA, CL_TRUE, 0, sizeA, ptrA, 0, NULL, NULL); + gpuerr(clEnqueueWriteBuffer); + + //Load and compile program + cl_program program; + if (cpx_mtx_cache == NULL) + { + program = clCreateProgramWithSource(cpx_mtx_context, 1, (const char*[]){kernel_gpu}, NULL, &err); + gpuerr(clCreateProgramWithSource); + err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); + size_t log_size; + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* log = malloc(log_size); + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("%s", log); + free(log); + exit(1); + } + err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &cpx_mtx_cache_len, NULL); + gpuerr(clGetProgramInfo); + cpx_mtx_cache = malloc(cpx_mtx_cache_len); + clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &cpx_mtx_cache, NULL); + gpuerr(clGetProgramInfo); + } + else + { + program = clCreateProgramWithBinary(cpx_mtx_context, 1, &cpx_mtx_device_id, &cpx_mtx_cache_len, (const unsigned char**)&cpx_mtx_cache, NULL, &err); + gpuerr(clCreateProgramWithBinary); + err = clBuildProgram(program, 1, &cpx_mtx_device_id, NULL, NULL, NULL); + if (err != CL_SUCCESS) + { + fprintf(stderr, "GPU fatal error: clBuildProgram() failed.\n"); + size_t log_size; + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); + char* log = malloc(log_size); + clGetProgramBuildInfo(program, cpx_mtx_device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); + printf("%s", log); + free(log); + exit(1); + } + } + + //Setup kernel + + float gate0 = ptrB[0]; + float gate1 = ptrB[1]; + float gate2 = ptrB[2]; + float gate3 = ptrB[3]; + float gate4 = ptrB[4]; + float gate5 = ptrB[5]; + float gate6 = ptrB[6]; + float gate7 = ptrB[7]; + cl_kernel kernel = clCreateKernel(program, "kernel_knk_2x2x2", &err); gpuerr(clCreateKernel); + err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&memR); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&memA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 2, sizeof(int), &rowsA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 3, sizeof(int), &colsA); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 4, sizeof(float), &gate0); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 5, sizeof(float), &gate1); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 6, sizeof(float), &gate2); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 7, sizeof(float), &gate3); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 8, sizeof(float), &gate4); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel, 9, sizeof(float), &gate5); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel,10, sizeof(float), &gate6); gpuerr(clSetKernelArg); + err = clSetKernelArg(kernel,11, sizeof(float), &gate7); gpuerr(clSetKernelArg); + + //Run the program + err = clEnqueueNDRangeKernel(cpx_mtx_command_queue, kernel, 1, NULL, (size_t[]){rowsR / 2, colsR / 2}, NULL, 0, NULL, NULL); + gpuerr(clEnqueueNDRangeKernel); + + //Wait for completion + err = clFlush(cpx_mtx_command_queue); gpuerr(clFlush); + err = clFinish(cpx_mtx_command_queue); gpuerr(clFinish); + + //Read results + err = clEnqueueReadBuffer(cpx_mtx_command_queue, memR, CL_TRUE, 0, sizeR, ptrR, 0, NULL, NULL); + gpuerr(clEnqueueReadBuffer); + + //Clean up + err = clReleaseKernel(kernel); gpuerr(clReleaseKernel); + err = clReleaseProgram(program); gpuerr(clReleaseProgram); + err = clReleaseMemObject(memA); gpuerr(clReleaseMemObject); + err = clReleaseMemObject(memR); gpuerr(clReleaseMemObject); +} + + /*-----------------------------------------------------------------------------------*/ diff --git a/src/kernel.cl b/src/kernel.cl index fda97b4..a22c6c3 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -283,4 +283,74 @@ __kernel void kernel_knk_2x2 } } +} + +__kernel void kernel_knk_2x2x2 +( + __global float* ptrR, + __global float* ptrA, + const int rowsA, + const int colsA, + const float gate0, + const float gate1, + const float gate2, + const float gate3, + const float gate4, + const float gate5, + const float gate6, + const float gate7 + , const int get_global_id_0 //{cpu_only} + , const int get_global_id_1 //{cpu_only} +) +{ + const int rowsR = rowsA * 2; + const int colsR = colsA * 2; + const int rowR = get_global_id(0) * 2; //{gpu_only} + const int colR = get_global_id(1) * 2; //{gpu_only} + const int rowR = get_global_id_0 * 2; //{cpu_only} + const int colR = get_global_id_1 * 2; //{cpu_only} + + const int rowA = rowR / 2; + const int colA = colR / 2; + const int posA = rowA * (colsA * 2) + (colA * 2); + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + + for (int i = 0; i < 4; i++) + { + float rB, iB; + switch (i) + { + case 0: rB = gate0; iB = gate1; break; + case 1: rB = gate2; iB = gate3; break; + case 2: rB = gate4; iB = gate5; break; + case 3: rB = gate6; iB = gate7; break; + } + + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + switch (i) + { + case 0: + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + break; + case 1: + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + break; + case 2: + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + break; + case 3: + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + break; + } + + } } \ No newline at end of file diff --git a/src/kernel_cpu.cl b/src/kernel_cpu.cl index 5b15285..e63920e 100644 --- a/src/kernel_cpu.cl +++ b/src/kernel_cpu.cl @@ -277,3 +277,71 @@ void kernel_knk_2x2 } } } + +void kernel_knk_2x2x2 +( + float* ptrR, + float* ptrA, + const int rowsA, + const int colsA, + const float gate0, + const float gate1, + const float gate2, + const float gate3, + const float gate4, + const float gate5, + const float gate6, + const float gate7 + , const int get_global_id_0 //{cpu_only} + , const int get_global_id_1 //{cpu_only} +) +{ + const int rowsR = rowsA * 2; + const int colsR = colsA * 2; + const int rowR = get_global_id_0 * 2; //{cpu_only} + const int colR = get_global_id_1 * 2; //{cpu_only} + + const int rowA = rowR / 2; + const int colA = colR / 2; + const int posA = rowA * (colsA * 2) + (colA * 2); + const float rA = ptrA[posA]; + const float iA = ptrA[posA + 1]; + + for (int i = 0; i < 4; i++) + { + float rB, iB; + switch (i) + { + case 0: rB = gate0; iB = gate1; break; + case 1: rB = gate2; iB = gate3; break; + case 2: rB = gate4; iB = gate5; break; + case 3: rB = gate6; iB = gate7; break; + } + + //(rA + iA)(rB + iB) + const float first = rA * rB; + const float outer = rA * iB; + const float inner = iA * rB; + const float lasts = iA * iB; + switch (i) + { + case 0: + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + break; + case 1: + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 0) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + break; + case 2: + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 0) * 2) + 1] = outer + inner; + break; + case 3: + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2)] = first + lasts; + ptrR[(rowR + 1) * (colsR * 2) + ((colR + 1) * 2) + 1] = outer + inner; + break; + } + + } +} diff --git a/src/kernel_gpu.cl b/src/kernel_gpu.cl index 79fe91f..40ddd6b 100644 --- a/src/kernel_gpu.cl +++ b/src/kernel_gpu.cl @@ -669,6 +669,176 @@ unsigned char kernel_gpu[] = { 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x7d, 0x00 + 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x5f, 0x5f, 0x6b, 0x65, 0x72, 0x6e, + 0x65, 0x6c, 0x20, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x6b, 0x65, 0x72, 0x6e, + 0x65, 0x6c, 0x5f, 0x6b, 0x6e, 0x6b, 0x5f, 0x32, 0x78, 0x32, 0x78, 0x32, + 0x0a, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, 0x6c, 0x6f, + 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, 0x20, 0x70, + 0x74, 0x72, 0x52, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5f, 0x5f, 0x67, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x2a, + 0x20, 0x70, 0x74, 0x72, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, + 0x73, 0x41, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x2c, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, + 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x30, 0x2c, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, + 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x31, 0x2c, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, + 0x61, 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x32, 0x2c, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, + 0x74, 0x20, 0x67, 0x61, 0x74, 0x65, 0x33, 0x2c, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, + 0x20, 0x67, 0x61, 0x74, 0x65, 0x34, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, + 0x67, 0x61, 0x74, 0x65, 0x35, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, + 0x61, 0x74, 0x65, 0x36, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x67, 0x61, + 0x74, 0x65, 0x37, 0x0a, 0x29, 0x0a, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, + 0x77, 0x73, 0x52, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x41, 0x20, + 0x2a, 0x20, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x52, + 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x3d, 0x20, 0x67, + 0x65, 0x74, 0x5f, 0x67, 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, + 0x28, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, + 0x67, 0x70, 0x75, 0x5f, 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, + 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x3d, 0x20, 0x67, 0x65, 0x74, 0x5f, 0x67, + 0x6c, 0x6f, 0x62, 0x61, 0x6c, 0x5f, 0x69, 0x64, 0x28, 0x31, 0x29, 0x20, + 0x2a, 0x20, 0x32, 0x3b, 0x20, 0x2f, 0x2f, 0x7b, 0x67, 0x70, 0x75, 0x5f, + 0x6f, 0x6e, 0x6c, 0x79, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x72, 0x6f, 0x77, + 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2f, 0x20, 0x32, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, + 0x69, 0x6e, 0x74, 0x20, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x3d, 0x20, 0x63, + 0x6f, 0x6c, 0x52, 0x20, 0x2f, 0x20, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x6e, 0x74, 0x20, 0x70, + 0x6f, 0x73, 0x41, 0x20, 0x3d, 0x20, 0x72, 0x6f, 0x77, 0x41, 0x20, 0x2a, + 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x41, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x20, 0x2b, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x41, 0x20, 0x2a, 0x20, 0x32, + 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x41, 0x20, 0x3d, 0x20, + 0x70, 0x74, 0x72, 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x5d, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, + 0x6f, 0x61, 0x74, 0x20, 0x69, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x74, 0x72, + 0x41, 0x5b, 0x70, 0x6f, 0x73, 0x41, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x3b, + 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x69, + 0x6e, 0x74, 0x20, 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x20, 0x69, 0x20, + 0x3c, 0x20, 0x34, 0x3b, 0x20, 0x69, 0x2b, 0x2b, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x72, 0x42, 0x2c, 0x20, 0x69, 0x42, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x77, + 0x69, 0x74, 0x63, 0x68, 0x20, 0x28, 0x69, 0x29, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, + 0x30, 0x3a, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, + 0x30, 0x3b, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, + 0x31, 0x3b, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, + 0x73, 0x65, 0x20, 0x31, 0x3a, 0x20, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x67, + 0x61, 0x74, 0x65, 0x32, 0x3b, 0x20, 0x69, 0x42, 0x20, 0x3d, 0x20, 0x67, + 0x61, 0x74, 0x65, 0x33, 0x3b, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x32, 0x3a, 0x20, 0x72, 0x42, 0x20, + 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, 0x34, 0x3b, 0x20, 0x69, 0x42, 0x20, + 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, 0x35, 0x3b, 0x20, 0x62, 0x72, 0x65, + 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x33, 0x3a, 0x20, + 0x72, 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, 0x36, 0x3b, 0x20, + 0x69, 0x42, 0x20, 0x3d, 0x20, 0x67, 0x61, 0x74, 0x65, 0x37, 0x3b, 0x20, + 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x2f, 0x2f, 0x28, 0x72, 0x41, 0x20, 0x2b, 0x20, 0x69, 0x41, + 0x29, 0x28, 0x72, 0x42, 0x20, 0x2b, 0x20, 0x69, 0x42, 0x29, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, + 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, + 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, 0x3b, 0x0a, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, + 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6f, 0x75, 0x74, 0x65, + 0x72, 0x20, 0x3d, 0x20, 0x72, 0x41, 0x20, 0x2a, 0x20, 0x69, 0x42, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, + 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x6e, + 0x65, 0x72, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x72, 0x42, + 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, + 0x6e, 0x73, 0x74, 0x20, 0x66, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x6c, 0x61, + 0x73, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x69, 0x41, 0x20, 0x2a, 0x20, 0x69, + 0x42, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, + 0x77, 0x69, 0x74, 0x63, 0x68, 0x20, 0x28, 0x69, 0x29, 0x0a, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x73, 0x65, + 0x20, 0x30, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, + 0x5b, 0x28, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x30, 0x29, 0x20, + 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, + 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2b, + 0x20, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, + 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, + 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, + 0x28, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x30, 0x29, 0x20, 0x2a, + 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2b, 0x20, + 0x30, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, + 0x20, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, + 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x31, 0x3a, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x28, 0x72, 0x6f, 0x77, 0x52, 0x20, + 0x2b, 0x20, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, + 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, + 0x6f, 0x6c, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x32, + 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, + 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x70, 0x74, 0x72, 0x52, 0x5b, 0x28, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, + 0x20, 0x30, 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, + 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, + 0x6c, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, + 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, + 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, + 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x32, + 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x28, + 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, + 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, + 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2b, 0x20, 0x30, + 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, 0x20, 0x3d, 0x20, 0x66, 0x69, + 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x73, 0x3b, + 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, 0x72, 0x52, 0x5b, 0x28, 0x72, + 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x28, + 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, + 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, 0x20, 0x2b, 0x20, 0x30, 0x29, + 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, + 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x69, 0x6e, 0x6e, + 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, + 0x61, 0x73, 0x65, 0x20, 0x33, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, + 0x74, 0x72, 0x52, 0x5b, 0x28, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, + 0x31, 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, + 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, + 0x52, 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x5d, + 0x20, 0x3d, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x2b, 0x20, 0x6c, + 0x61, 0x73, 0x74, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x74, + 0x72, 0x52, 0x5b, 0x28, 0x72, 0x6f, 0x77, 0x52, 0x20, 0x2b, 0x20, 0x31, + 0x29, 0x20, 0x2a, 0x20, 0x28, 0x63, 0x6f, 0x6c, 0x73, 0x52, 0x20, 0x2a, + 0x20, 0x32, 0x29, 0x20, 0x2b, 0x20, 0x28, 0x28, 0x63, 0x6f, 0x6c, 0x52, + 0x20, 0x2b, 0x20, 0x31, 0x29, 0x20, 0x2a, 0x20, 0x32, 0x29, 0x20, 0x2b, + 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x6f, 0x75, 0x74, 0x65, 0x72, 0x20, + 0x2b, 0x20, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, + 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x00 }; -unsigned int kernel_gpu_len = 8045; +unsigned int kernel_gpu_len = 10091;