1차원 데이터 열에 대한 커널의 실행은
cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
                              cl_kernel kernel,
                              cl_uint work_dim,
                              const size_t *global_work_offset,
                              const size_t *global_work_size,
                              const size_t *local_work_size,
                              cl_uint num_events_in_wait_list,
                              const cl_event *event_wait_list,
                              cl_event *event);
switch (param_name) {
case CL_DEVICE_MAX_COMPUTE_UNITS :
cl_uint* ret = (cl_uint*) alloca(sizeof(cl_uint) * paramSize);
error = clGetDeviceInfo( id, param_name, paramSize, ret, NULL );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device info for param\n");
return;
}
switch (param_name) {
case CL_DEVICE_VENDOR_ID: printf("\tVENDOR ID: 0x%x\n", *ret); break;
case CL_DEVICE_MAX_COMPUTE_UNITS: printf("\tMaximum number of parallel compute units: %d\n", *ret); break;
case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:
}
}break;
Number of detected OpenCL devices: 3
Running on CPU ........
=> Kernel name: copy2Dfloat4 with arity: 2
=> About to create command queue and enqueue this kernel...
Unable to enqueue task to command-queue: No such file or directory
/**
 * Print the symbolic name of a clEnqueueNDRangeKernel status code to stdout.
 *
 * Known failure codes print exactly their macro name followed by a newline.
 * CL_SUCCESS prints nothing. Any other value is reported together with its
 * numeric code instead of being silently ignored.
 *
 * @param error  status value returned by clEnqueueNDRangeKernel
 */
void Check_clEnqueueNDRangeKernel_Error(cl_int error)
{
    const char *name = NULL;

    switch (error) {
    case CL_SUCCESS:
        /* Nothing to report. */
        return;
    case CL_INVALID_PROGRAM_EXECUTABLE:    name = "CL_INVALID_PROGRAM_EXECUTABLE";    break;
    case CL_INVALID_COMMAND_QUEUE:         name = "CL_INVALID_COMMAND_QUEUE";         break;
    case CL_INVALID_KERNEL:                name = "CL_INVALID_KERNEL";                break;
    case CL_INVALID_CONTEXT:               name = "CL_INVALID_CONTEXT";               break;
    case CL_INVALID_KERNEL_ARGS:           name = "CL_INVALID_KERNEL_ARGS";           break;
    case CL_INVALID_WORK_DIMENSION:        name = "CL_INVALID_WORK_DIMENSION";        break;
    case CL_INVALID_WORK_GROUP_SIZE:       name = "CL_INVALID_WORK_GROUP_SIZE";       break;
    case CL_INVALID_WORK_ITEM_SIZE:        name = "CL_INVALID_WORK_ITEM_SIZE";        break;
    case CL_INVALID_GLOBAL_OFFSET:         name = "CL_INVALID_GLOBAL_OFFSET";         break;
    case CL_OUT_OF_RESOURCES:              name = "CL_OUT_OF_RESOURCES";              break;
    case CL_MEM_OBJECT_ALLOCATION_FAILURE: name = "CL_MEM_OBJECT_ALLOCATION_FAILURE"; break;
    case CL_INVALID_EVENT_WAIT_LIST:       name = "CL_INVALID_EVENT_WAIT_LIST";       break;
    case CL_OUT_OF_HOST_MEMORY:            name = "CL_OUT_OF_HOST_MEMORY";            break;
    default:
        /* Codes not listed above used to produce no output at all. */
        printf("Unknown clEnqueueNDRangeKernel error code: %d\n", (int) error);
        return;
    }

    printf("%s\n", name);
}
Number of detected OpenCL devices: 3
Running on CPU ........
=> Kernel name: copy2Dfloat4 with arity: 2
=> About to create command queue and enqueue this kernel...
Unable to enqueue task to command-queue: No such file or directory
CL_INVALID_WORK_GROUP_SIZE
Program ended with exit code: 1
/* 2-D NDRange: 1024 x 1024 work-items in total, grouped 64 x 2. */
/* NOTE(review): the 64 x 2 local size is presumably what triggers
 * CL_INVALID_WORK_GROUP_SIZE on devices whose per-dimension limit is 1 in
 * dimension 1 -- confirm against CL_DEVICE_MAX_WORK_ITEM_SIZES. */
size_t globalThreads[2] = { 1024, 1024 };
size_t localThreads[2]  = { 64, 2 };
cl_event evt;

error = clEnqueueNDRangeKernel( cQ,            // command queue
                                kernels[j],    // kernel
                                2,             // work_dim
                                NULL,          // global work_offset (pointer argument: none)
                                globalThreads, // global work size
                                localThreads,  // local work_size
                                0,             // number of events in the wait list
                                NULL,          // event wait list
                                &evt);         // event signalled on completion

/* Check the status first: evt is only written when the enqueue succeeded,
 * so waiting on it after a failure would read an uninitialised handle. */
if (error != CL_SUCCESS) {
    perror("Unable to enqueue task to command-queue");
    Check_clEnqueueNDRangeKernel_Error(error);
    exit(1);
}

/* Block until the kernel has finished executing. */
clWaitForEvents(1, &evt);
void displayDeviceDetails(cl_device_id id,
cl_device_info param_name,
const char* paramNameAsStr) {
cl_int error = 0;
size_t paramSize = 0;
error = clGetDeviceInfo( id, param_name, 0, NULL, ¶mSize );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device info for param\n");
return;
}
/* the cl_device_info are preprocessor directives defined in cl.h */
switch (param_name) {
case CL_DEVICE_TYPE: {
cl_device_type* devType = (cl_device_type*) alloca(sizeof(cl_device_type) * paramSize);
error = clGetDeviceInfo( id, param_name, paramSize, devType, NULL );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device info for param\n");
return;
}
switch (*devType) {
case CL_DEVICE_TYPE_CPU : printf("CPU detected\n");break;
case CL_DEVICE_TYPE_GPU : printf("GPU detected\n");break;
case CL_DEVICE_TYPE_ACCELERATOR : printf("Accelerator detected\n");break;
case CL_DEVICE_TYPE_DEFAULT : printf("default detected\n");break;
}
}break;
case CL_DEVICE_VENDOR_ID :
case CL_DEVICE_MAX_COMPUTE_UNITS :
case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS : {
cl_uint* ret = (cl_uint*) alloca(sizeof(cl_uint) * paramSize);
error = clGetDeviceInfo( id, param_name, paramSize, ret, NULL );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device info for param\n");
return;
}
switch (param_name) {
case CL_DEVICE_VENDOR_ID: printf("\tVENDOR ID: 0x%x\n", *ret); break;
case CL_DEVICE_MAX_COMPUTE_UNITS: printf("\tMaximum number of parallel compute units: %d\n", *ret); break;
case CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: printf("\tMaximum dimensions for global/local work-item IDs: %d\n", *ret); break;
}
}break;
case CL_DEVICE_MAX_WORK_ITEM_SIZES : {
cl_uint maxWIDimensions;
size_t* ret = (size_t*) alloca(sizeof(size_t) * paramSize);
error = clGetDeviceInfo( id, param_name, paramSize, ret, NULL );
error = clGetDeviceInfo( id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &maxWIDimensions, NULL );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device info for param\n");
return;
}
printf("\tMaximum number of work-items in each dimension: ( ");
for(cl_int i =0; i < maxWIDimensions; ++i ) {
printf("%d ", ret[i]);
}
printf(" )\n");
}break;
case CL_DEVICE_MAX_WORK_GROUP_SIZE : {
size_t* ret = (size_t*) alloca(sizeof(size_t) * paramSize);
error = clGetDeviceInfo( id, param_name, paramSize, ret, NULL );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device info for param\n");
return;
}
printf("\tMaximum number of work-items in a work-group: %d\n", *ret);
}break;
case CL_DEVICE_NAME :
case CL_DEVICE_VENDOR : {
char data[48];
error = clGetDeviceInfo( id, param_name, paramSize, data, NULL );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device name/vendor info for param\n");
return;
}
switch (param_name) {
case CL_DEVICE_NAME : printf("\tDevice name is %s\n", data);break;
case CL_DEVICE_VENDOR : printf("\tDevice vendor is %s\n", data);break;
}
} break;
case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE: {
cl_uint* size = (cl_uint*) alloca(sizeof(cl_uint) * paramSize);
error = clGetDeviceInfo( id, param_name, paramSize, size, NULL );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device name/vendor info for param\n");
return;
}
printf("\tDevice global cacheline size: %d bytes\n", (*size)); break;
} break;
case CL_DEVICE_GLOBAL_MEM_SIZE:
case CL_DEVICE_MAX_MEM_ALLOC_SIZE: {
cl_ulong* size = (cl_ulong*) alloca(sizeof(cl_ulong) * paramSize);
error = clGetDeviceInfo( id, param_name, paramSize, size, NULL );
if (error != CL_SUCCESS ) {
perror("Unable to obtain device name/vendor info for param\n");
return;
}
switch (param_name) {
case CL_DEVICE_GLOBAL_MEM_SIZE: printf("\tDevice global mem: %ld mega-bytes\n", (*size)>>20); break;
case CL_DEVICE_MAX_MEM_ALLOC_SIZE: printf("\tDevice max memory allocation: %ld mega-bytes\n", (*size)>>20); break;
}
} break;
} //end of switch
}
/**
 * Enumerate the devices of a platform and print a fixed set of attributes
 * for each one via displayDeviceDetails().
 *
 * Terminates the process with exit(1) when the device list cannot be
 * obtained; the OpenCL status code is written to stderr first.
 *
 * @param platform_id  platform whose devices are listed
 * @param dev_type     device-type filter passed to clGetDeviceIDs
 */
void displayDeviceInfo(cl_platform_id platform_id,
                       cl_device_type dev_type) {
    /* Attributes reported per device, in display order. */
    static const struct {
        cl_device_info param;
        const char    *name;
    } queries[] = {
        { CL_DEVICE_TYPE,                      "CL_DEVICE_TYPE" },
        { CL_DEVICE_NAME,                      "CL_DEVICE_NAME" },
        { CL_DEVICE_VENDOR,                    "CL_DEVICE_VENDOR" },
        { CL_DEVICE_VENDOR_ID,                 "CL_DEVICE_VENDOR_ID" },
        { CL_DEVICE_MAX_MEM_ALLOC_SIZE,        "CL_DEVICE_MAX_MEM_ALLOC_SIZE" },
        { CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE" },
        { CL_DEVICE_GLOBAL_MEM_SIZE,           "CL_DEVICE_GLOBAL_MEM_SIZE" },
        { CL_DEVICE_MAX_COMPUTE_UNITS,         "CL_DEVICE_MAX_COMPUTE_UNITS" },
        { CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,  "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS" },
        { CL_DEVICE_MAX_WORK_ITEM_SIZES,       "CL_DEVICE_MAX_WORK_ITEM_SIZES" },
        { CL_DEVICE_MAX_WORK_GROUP_SIZE,       "CL_DEVICE_MAX_WORK_GROUP_SIZE" },
    };
    const size_t numOfQueries = sizeof queries / sizeof queries[0];

    cl_int error = CL_SUCCESS;
    cl_uint numOfDevices = 0;

    /* Determine how many devices are connected to your platform */
    error = clGetDeviceIDs(platform_id, dev_type, 0, NULL, &numOfDevices);
    if (error != CL_SUCCESS ) {
        /* OpenCL does not set errno, so report the status code itself. */
        fprintf(stderr, "Unable to obtain any OpenCL compliant device info: OpenCL error %d\n",
                (int) error);
        exit(1);
    }

    cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices);

    /* Load the information about your devices into the variable 'devices' */
    error = clGetDeviceIDs(platform_id, dev_type, numOfDevices, devices, NULL);
    if (error != CL_SUCCESS ) {
        fprintf(stderr, "Unable to obtain any OpenCL compliant device info: OpenCL error %d\n",
                (int) error);
        exit(1);
    }

    printf("Number of detected OpenCL devices: %u\n", (unsigned int) numOfDevices);

    /* We attempt to retrieve some information about the devices. */
    for (cl_uint i = 0; i < numOfDevices; ++i ) {
        for (size_t q = 0; q < numOfQueries; ++q ) {
            displayDeviceDetails( devices[i], queries[q].param, queries[q].name );
        }
    }
}
Number of detected OpenCL devices: 3
Running on CPU ........
Number of detected OpenCL devices: 3
CPU detected
Device name is Intel(R) Core(TM) i7-3615QM CPU @ 2.30GHz
Device vendor is Intel
VENDOR ID: 0xffffffff
Device max memory allocation: 4096 mega-bytes
Device global cacheline size: 6291456 bytes
Device global mem: 16384 mega-bytes
Maximum number of parallel compute units: 8
Maximum dimensions for global/local work-item IDs: 3
Maximum number of work-items in each dimension: ( 1024 1 1 )
Maximum number of work-items in a work-group: 1024
GPU detected
Device name is GeForce GT 650M
Device vendor is NVIDIA
VENDOR ID: 0x1022700
Device max memory allocation: 256 mega-bytes
Device global cacheline size: 0 bytes
Device global mem: 1024 mega-bytes
Maximum number of parallel compute units: 2
Maximum dimensions for global/local work-item IDs: 3
Maximum number of work-items in each dimension: ( 1024 1024 64 )
Maximum number of work-items in a work-group: 1024
GPU detected
Device name is HD Graphics 4000
Device vendor is Intel
VENDOR ID: 0x1024400
Device max memory allocation: 256 mega-bytes
Device global cacheline size: 0 bytes
Device global mem: 1024 mega-bytes
Maximum number of parallel compute units: 16
Maximum dimensions for global/local work-item IDs: 3
Maximum number of work-items in each dimension: ( 512 512 512 )
Maximum number of work-items in a work-group: 512
numOfKernels : 1
=> Kernel name: copy2Dfloat4 with arity: 2
=> About to create command queue and enqueue this kernel…
globalThreads[0]=1024;
globalThreads[1]=1024;
size_t localThreads[2];
localThreads[0] = 64;
localThreads[1] = 2;
cl_event evt;
error = clEnqueueNDRangeKernel(cQ,
kernels[j],
2,
0,
globalThreads,
localThreads,
0,
cl_float* h_out = (float*) malloc( sizeof(cl_float4) * DATA_SIZE); // output from device
for( int i = 0; i < DATA_SIZE; ++i) {
h_in[i] = (float)i;
}
}
cl_float* h_out = (float*) malloc( sizeof(cl_float) * DATA_SIZE); // output from device
for( int i = 0; i < DATA_SIZE; ++i) {
h_in[i] = (cl_float)i;
}
// Build a OpenCL program and do not run it.
for(cl_uint i = 0; i < numOfPlatforms; i++ ) {
cl_mem memInObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float4) * (DATA_SIZE), h_in, &error);
if(error != CL_SUCCESS) {
perror("Can't create an input buffer object");
exit(1);
}
for(int i = 0; i < numOfDevices; ++i, ++offset ) {
// 커널의 갯수 만큼 반복한다.
for(cl_uint j = 0; j < numOfKernels; j++) {
sizeof(cl_float4) * (DATA_SIZE), NULL, &error);
if(error != CL_SUCCESS) {
perror("Can't create an output buffer object");
exit(1);
}
error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &memInObj);
if (error != CL_SUCCESS) {
perror("Unable to set buffer object in kernel");
exit(1);
}
error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &memOutObj);
if (error != CL_SUCCESS) {
perror("Unable to set buffer object in kernel");
exit(1);
}
/* Enqueue the kernel to the command queue */
size_t globalThreads[3]= { 1024 , 2 , 1 };
size_t localThreads[3] = { 128 , 1 , 1 } ;
cl_event evt ;//= new cl_event[1];
error = clEnqueueNDRangeKernel(cQ,
kernels[j],
2,
0,
globalThreads,
localThreads,
0,
NULL, &evt);
clWaitForEvents(1, (const cl_event*)&evt);
error = clEnqueueReadBuffer(cQ, memOutObj,
CL_TRUE, // blocking read
0, // write from the last offset
(DATA_SIZE)*sizeof(cl_float4), // how much to copy
h_out, 0, NULL, NULL);
if ( valuesOK(h_in, h_out, DATA_SIZE) ) {
printf("Check passed!\n");
} else printf("Check failed!\n");
/* Release the resources */
clReleaseCommandQueue(cQ);
clReleaseMemObject(memOutObj);
Checking data of size: 16384
to:0.000000, from:0.000000
to:1.000000, from:1.000000
to:2.000000, from:2.000000
to:3.000000, from:3.000000
to:1.000000, from:0.000000
#define DATA_TYPE float4
/*
The following macros are convenience 'functions'
for striding across a 2-D array of coordinates (x,y)
by a factor which happens to be the width of the block
i.e. WIDTH
*/
#define A(x,y) A[(x)* WIDTH + (y)]
#define C(x,y) C[(x)* WIDTH + (y)]
/*
 * Copy one DATA_TYPE element from A to C.
 * Each work-item handles the element addressed by its two global IDs,
 * using the A(x,y) / C(x,y) striding macros.
 */
__kernel void copy2Dfloat4(__global DATA_TYPE *A, __global DATA_TYPE *C)
{
    const int row = get_global_id(0);   /* global ID in dimension 0 */
    const int col = get_global_id(1);   /* global ID in dimension 1 */

    /* A single assignment moves a whole DATA_TYPE element. */
    C(row, col) = A(row, col);
}
size_t globalThreads[3]= { 256 , 64 , 1 };
// globalThreads[0]=1024;
// globalThreads[1]=1024;
size_t localThreads[3] = { 128 , 1 , 1 } ;
// localThreads[0] = 128;
// localThreads[1] = 128;
#define C(x,y) C[(x)* WIDTH + (y)]
int y = get_global_id(1);
#define C(x,y) C[(x)* WIDTH + (y)]
/*
 * Element-wise copy from A to C; the work-item's global IDs in dimensions
 * 0 and 1 select the element through the A(x,y) / C(x,y) macros.
 */
__kernel void copy2Dfloat4(__global DATA_TYPE *A, __global DATA_TYPE *C)
{
    const int gx = get_global_id(0);
    const int gy = get_global_id(1);

    /* One DATA_TYPE element is loaded and stored per work-item. */
    C(gx, gy) = A(gx, gy);
}
int y = get_global_id(0);
from:1020.000000, to:1020.000000
from:1021.000000, to:1021.000000
from:1022.000000, to:1022.000000
from:1023.000000, to:1023.000000
Check passed!
Program ended with exit code: 0
kernels[j],
2,
0,
globalThreads,
NULL,
0,
NULL, &evt);
Checking data of size: 16384
from:4096.000000, to:0.000000
from:4097.000000, to:0.000000
from:4098.000000, to:0.000000
from:4099.000000, to:0.000000
Check failed!
Program ended with exit code: 0
Checking data of size: 16384
from:2048.000000, to:0.000000
from:2049.000000, to:0.000000
from:2050.000000, to:0.000000
from:2051.000000, to:0.000000
Check failed!
Checking data of size: 16384
from:1024.000000, to:0.000000
from:1025.000000, to:0.000000
from:1026.000000, to:0.000000
from:1027.000000, to:0.000000
from:1024.000000, to:0.000000
from:1025.000000, to:0.000000
from:1026.000000, to:0.000000
from:1027.000000, to:0.000000
from:0.000000, to:0.000000
from:0.000000, to:0.000000
Check failed!
printf(" from:%f, to:%f\n", h_in[1024+0] ,h_out[1024+0]);
printf(" from:%f, to:%f\n", h_in[1024+1] ,h_out[1024+1]);
printf(" from:%f, to:%f\n", h_in[1024+2] ,h_out[1024+2]);
printf(" from:%f, to:%f\n", h_in[1024+3] ,h_out[1024+3]);
printf(" from:%f, to:%f\n", h_in[1024+4] ,h_out[1024+4]);
from:1025.000000, to:0.000000
from:1026.000000, to:0.000000
from:1027.000000, to:0.000000
from:0.000000, to:0.000000
…
h_in[i] = (float)(i);
}
from:1024.000000, to:0.000000
from:1025.000000, to:0.000000
from:1026.000000, to:0.000000
from:1027.000000, to:0.000000
from:1028.000000, to:0.000000
from:1029.000000, to:0.000000
Check failed!
Program ended with exit code: 0
numOfKernels : 1
=> Kernel name: copy2Dfloat4 with arity: 2
=> About to create command queue and enqueue this kernel...
[0000] Before start check data
from:1024.000000, to:1024.000000
from:1025.000000, to:1025.000000
from:1026.000000, to:1026.000000
from:1027.000000, to:1027.000000
from:1028.000000, to:1028.000000
=> Task has been enqueued successfully!
Checking data of size: 16384
Check passed!
..
from:12289.000000, to:0.000000
from:12290.000000, to:0.000000
from:12291.000000, to:0.000000
from:12292.000000, to:0.000000
from:12293.000000, to:0.000000
Check failed!
'Computer Vision' 카테고리의 다른 글
증강 현실 테스트 (0) | 2014.10.04 |
---|---|
OpenCL::Query OpenCL kernel (4) (0) | 2014.05.02 |
OpenCL Test Program (3) (0) | 2014.03.31 |
OpenCL Test Program (2) (0) | 2014.03.18 |
OpenCL test program (1) (0) | 2014.03.15 |