# Builds bin/QAnsel from src/main.c with __SDL2__ and __OPENCL__ defined.
# Recipe steps:
#   1. Drop lines tagged {gpu_only} (case-insensitive) from src/kernel.cl and strip
#      "__global " / "__kernel " to produce src/kernel_cpu.cl.
#      NOTE(review): the sed substitutions have no `g` flag, so only the first
#      occurrence on each line is removed - confirm this is sufficient.
#   2. Drop lines tagged {cpu_only}, append a NUL byte, and convert the result with
#      `xxd -i` into a C array renamed to kernel_gpu in src/kernel_gpu.cl.
#   3. Compile and link with SDL2, OpenCL and pthreads.
#   4. Delete the generated kernel files.
all:
+ cat src/kernel.cl | grep -vi '{gpu_only}' | sed -e 's/__global //' -e 's/__kernel //' > src/kernel_cpu.cl
+ cat src/kernel.cl | grep -vi '{cpu_only}' > src/.kernel.tmp.1
+ bash -c 'echo -ne "$$(cat src/.kernel.tmp.1)\x00" > src/.kernel.tmp.2'
+ xxd -i src/.kernel.tmp.2 | sed -e 's/src__kernel_tmp_2/kernel_gpu/' > src/kernel_gpu.cl
+ gcc src/main.c -g -o bin/QAnsel -lm -D__SDL2__ -D__OPENCL__ -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread
+ rm -f src/.kernel* src/kernel_*
# Builds bin/QAnsel without defining __SDL2__ or __OPENCL__; the kernel
# preprocessing and cleanup steps are the same as in the other build targets.
# NOTE(review): the gcc line still passes -I/usr/include/SDL2, -lSDL2 and -lOpenCL,
# so linking requires those libraries even though neither feature macro is set -
# confirm whether they should be dropped for this target.
+simple:
+ cat src/kernel.cl | grep -vi '{gpu_only}' | sed -e 's/__global //' -e 's/__kernel //' > src/kernel_cpu.cl
+ cat src/kernel.cl | grep -vi '{cpu_only}' > src/.kernel.tmp.1
+ bash -c 'echo -ne "$$(cat src/.kernel.tmp.1)\x00" > src/.kernel.tmp.2'
+ xxd -i src/.kernel.tmp.2 | sed -e 's/src__kernel_tmp_2/kernel_gpu/' > src/kernel_gpu.cl
+ gcc src/main.c -g -o bin/QAnsel -lm -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread
+ rm -f src/.kernel* src/kernel_*
# Builds bin/QAnsel with __SDL2__ and __OPENCL__ defined; the commands match the
# `all` recipe.
# NOTE(review): another `metal:` rule with a recipe appears further down; make
# keeps only one recipe per target, so this looks like an accidental duplicate -
# confirm which one should remain.
+metal:
cat src/kernel.cl | grep -vi '{gpu_only}' | sed -e 's/__global //' -e 's/__kernel //' > src/kernel_cpu.cl
cat src/kernel.cl | grep -vi '{cpu_only}' > src/.kernel.tmp.1
bash -c 'echo -ne "$$(cat src/.kernel.tmp.1)\x00" > src/.kernel.tmp.2'
xxd -i src/.kernel.tmp.2 | sed -e 's/src__kernel_tmp_2/kernel_gpu/' > src/kernel_gpu.cl
+ gcc src/main.c -g -o bin/QAnsel -lm -D__SDL2__ -D__OPENCL__ -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread
+ rm -f src/.kernel* src/kernel_*
- gcc src/main.c -g -o bin/QAnsel -lm -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread
# Builds bin/QAnsel with __SDL2__ and __OPENCL__ defined: generates the CPU and
# GPU kernel sources from src/kernel.cl, compiles src/main.c, then removes the
# generated kernel files.
# NOTE(review): a `metal:` rule with a recipe is also defined earlier in this
# file; make keeps only one recipe per target - confirm the duplicate is intended.
+metal:
+ cat src/kernel.cl | grep -vi '{gpu_only}' | sed -e 's/__global //' -e 's/__kernel //' > src/kernel_cpu.cl
+ cat src/kernel.cl | grep -vi '{cpu_only}' > src/.kernel.tmp.1
+ bash -c 'echo -ne "$$(cat src/.kernel.tmp.1)\x00" > src/.kernel.tmp.2'
+ xxd -i src/.kernel.tmp.2 | sed -e 's/src__kernel_tmp_2/kernel_gpu/' > src/kernel_gpu.cl
+ gcc src/main.c -g -o bin/QAnsel -lm -D__SDL2__ -D__OPENCL__ -I/usr/include/SDL2 -D_REENTRANT -lSDL2 -lOpenCL -pthread
rm -f src/.kernel* src/kernel_*
+
# Stages src/, examples/ and the Makefile, then commits using the output of
# `date` as the commit message.
commit:
git add src/ examples/ Makefile
git commit -m "`date`"
/*-----------------------------------------------------------------------------------*/
/*METAL*/
/*-----------------------------------------------------------------------------------*/
-
-#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
-#define CL_TARGET_OPENCL_VERSION 300
-#include <CL/cl.h>
-#define gpuerr(x) if (err != CL_SUCCESS) { fprintf(stderr, "GPU Error on line %i: %s.\n", __LINE__, clGetErrorString(x)); exit(1); }
-cl_platform_id cpx_mtx_platform_id;
-cl_device_id cpx_mtx_device_id;
-cl_context cpx_mtx_context;
-cl_command_queue cpx_mtx_command_queue;
-unsigned char* cpx_mtx_cache = NULL;
-size_t cpx_mtx_cache_len = 0;
-
-#include <CL/cl.h>
-#include <stdio.h>
-
+#ifdef __OPENCL__
//Returns a string describing the status code `err`; any code without a matching
// case falls through to the default and yields "Unknown error code".
const char* clGetErrorString(cl_int err)
{
switch (err)
default: return "Unknown error code";
}
}
-
+#endif
//Presumably sets up the OpenCL state used by the *_metal functions (most of the
// body is not visible here). When __OPENCL__ is not defined it always returns 0.
// NOTE(review): the OpenCL build has both a `return 0` and a `return 1` path;
// presumably 1 means the GPU is usable - confirm against the callers.
unsigned char cpx_mtx_begin(unsigned char verbose)
{
+ #ifdef __OPENCL__
cl_uint count;
cl_int err;
return 0;
}
return 1;
+ #else
+ return 0;
+ #endif
}
//Tears down the OpenCL state: releases cpx_mtx_command_queue and frees
// cpx_mtx_cache. When __OPENCL__ is not defined the function body is empty.
void cpx_mtx_clean()
{
+ #ifdef __OPENCL__
cl_int err;
err = clReleaseCommandQueue(cpx_mtx_command_queue);
if (err != CL_SUCCESS)
fprintf(stderr, "QAnsel (GPU): clReleaseContext() failed.\n");
}
free(cpx_mtx_cache);
+ #endif
}
//OpenCL-backed operation on matrices A (rowsA x colsA) and B (rowsB x colsB)
// writing into ptrR; the result is sized rowsA x colsB, which is consistent with
// a matrix product (presumably that is what "dot" means here - confirm).
// The device buffers memA, memB and memR are released before returning.
void cpx_mtx_dot_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
+ #ifdef __OPENCL__
int rowsR = rowsA;
int colsR = colsB;
err = clReleaseMemObject(memA); gpuerr(err);
err = clReleaseMemObject(memB); gpuerr(err);
err = clReleaseMemObject(memR); gpuerr(err);
+ #else
+ cpx_mtx_dot_metal(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB); //NOTE(review): this calls the function itself, so a build without __OPENCL__ recurses without end; presumably a CPU implementation was meant - confirm and fix
+ #endif
}
-/*typedef struct
-{
- float* ptr;
- cl_mem* buff;
- size_t* buff_size;
- size_t offset;
- size_t count;
-} cpx_copy_context;
-
-void* cpx_copy_run(void *context)
-{
- cpx_copy_context* ccc = (cpx_copy_context*)context;
- cl_int err = clEnqueueWriteBuffer(cpx_mtx_command_queue, *(ccc->buff), CL_FALSE, ccc->offset, ccc->count, ccc->ptr, 0, NULL, NULL);
- gpuerr(err);
-}
-
-void cpx_copy(float* ptr, cl_mem* buff, size_t* buff_size)
-{
- int delimeter = (int)(*buff_size);
- int cores = get_core_count();
- int threadCount = cores;
- if (threadCount > delimeter) threadCount = delimeter;
- int delimetersPerThread = delimeter / threadCount;
- int leftOvers = delimeter % threadCount;
-
- cpx_copy_context ctxs[threadCount];
- pthread_t threads[threadCount];
-
- for (int i = 0; i < threadCount; i++)
- {
- ctxs[i].ptr = ptr;
- ctxs[i].buff = buff;
- ctxs[i].buff_size = buff_size;
- ctxs[i].offset = i * delimetersPerThread;
- ctxs[i].count = delimetersPerThread + ((i == threadCount - 1) ? leftOvers : 0);
- if (pthread_create(&(threads[i]), NULL, &cpx_copy_run, (void*)&(ctxs[i])))
- {
- fprintf(stderr, "QAnsel: Thread error. (3)\n");
- exit(1);
- }
- }
- for (unsigned int i = 0; i < threadCount; i++)
- {
- if (pthread_join(threads[i], NULL))
- {
- fprintf(stderr, "QAnsel: Thread error. (4)\n");
- }
- }
- clFlush(cpx_mtx_command_queue);
-}*/
-
//OpenCL-backed operation on matrices A (rowsA x colsA) and B (rowsB x colsB)
// writing into ptrR; the result is sized (rowsA*rowsB) x (colsA*colsB), which is
// consistent with a Kronecker product (presumably what "knk" stands for - confirm).
// The device buffers memA, memB and memR are released before returning.
void cpx_mtx_knk_metal(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
+ #ifdef __OPENCL__
int rowsR = rowsA * rowsB;
int colsR = colsA * colsB;
//Create buffers
err = clReleaseMemObject(memA); gpuerr(err);
err = clReleaseMemObject(memB); gpuerr(err);
err = clReleaseMemObject(memR); gpuerr(err);
+ #else
+ cpx_mtx_knk_metal(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB); //NOTE(review): this calls the function itself, so a build without __OPENCL__ recurses without end; presumably a CPU implementation was meant - confirm and fix
+ #endif
}
//This only works if ptrA is NxM where both N and M are divisible by two,
// the standard knk_metal() function.
//Variant of cpx_mtx_knk_metal with the same signature; the result is sized
// (rowsA*rowsB) x (colsA*colsB). In the visible code the OpenCL program and the
// memA and memR buffers are released before returning.
void cpx_mtx_knk_metal_2x2(float* ptrR, float* ptrA, float* ptrB, int rowsA, int colsA, int rowsB, int colsB)
{
+ #ifdef __OPENCL__
int rowsR = rowsA * rowsB;
int colsR = colsA * colsB;
//Create buffers
err = clReleaseProgram(program); gpuerr(err);
err = clReleaseMemObject(memA); gpuerr(err);
err = clReleaseMemObject(memR); gpuerr(err);
+ #else
+ cpx_mtx_knk_metal_2x2(ptrR, ptrA, ptrB, rowsA, colsA, rowsB, colsB); //NOTE(review): this calls the function itself, so a build without __OPENCL__ recurses without end; presumably a CPU implementation was meant - confirm and fix
+ #endif
}