Tuesday, January 28, 2014

OpenCL HelloWorld Example

This sample code is taken from https://developer.apple.com/library/mac/samplecode/OpenCL_Hello_World_Example/Introduction/Intro.html

and enhanced with
i) use all supported cpu and gpu devices
ii) print gpu/cpu vendor and device name and supported OpenCL version
iii) compute execution time -> gpu_time
iv) create a FAT binary for arm and x86

My testing results are
"Time taken by NVIDIA GeForce GT 650M is 0.883 ms"
"Time taken by Intel HD Graphics 4000 is 1.302 ms"
"Time taken by Intel Intel(R) Core(TM) i7-3820QM CPU @ 2.70GHz is 10.188 ms"
"Time taken by Intel Intel(R) Core(TM) i7-2635QM CPU @ 2.00GHz is 12.802 ms"
"Time taken by Apple ARM CPU Compute Device is 1.637 ms" (DATA_SIZE 102400 only)

This source code can be compiled using command line
clang -arch x86_64 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk -framework OpenCL hello.c -o hello
or for MacOSX10.8 Mountain Lion
clang -arch x86_64 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk -framework OpenCL hello.c -o hello
or for iOS (OpenCL iOS header files are here)
clang -arch armv7 -arch arm64 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS7.0.sdk -F/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS7.0.sdk/System/Library/PrivateFrameworks -I./include -framework OpenCL hello.c -o hello.arm

Then create a FAT binary and codesign
xcrun lipo hello.arm -arch x86_64 hello -create -output opencl_hello
CODESIGN_ALLOCATE="/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/codesign_allocate" ldid -S opencl_hello

hello.c    Select all
//////////////////////////////////////////////////////////////////////////////// #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <OpenCL/opencl.h> //////////////////////////////////////////////////////////////////////////////// // Use a static data size for simplicity // #if defined(__APPLE__) && defined(__MACH__) /* Apple OSX and iOS (Darwin). ------------------------------ */ #include <TargetConditionals.h> #if TARGET_IPHONE_SIMULATOR == 1 /* iOS in Xcode simulator */ #elif TARGET_OS_IPHONE == 1 #define DATA_SIZE (102400) #elif TARGET_OS_MAC == 1 #define DATA_SIZE (1024000) #endif #endif //////////////////////////////////////////////////////////////////////////////// // Simple compute kernel which computes the square of an input array // const char *KernelSource = "\n" \ "__kernel void square( \n" \ " __global float* input, \n" \ " __global float* output, \n" \ " const unsigned int count) \n" \ "{ \n" \ " int i = get_global_id(0); \n" \ " if(i < count) \n" \ " output[i] = input[i] * input[i]; \n" \ "} \n" \ "\n"; //////////////////////////////////////////////////////////////////////////////// int testdevice(cl_device_id device_id) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // unsigned int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; if (device_id == NULL) { printf("Failed to create a device group"); return EXIT_FAILURE; } // Get some information about the returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; size_t returned_size = 0; err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err |= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size); if (err != CL_SUCCESS) return EXIT_FAILURE; printf("Connecting to %s %s...\n", vendor_name, device_name); // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // Create Queue with Profiling enabled commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; cl_event myEvent = NULL; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &myEvent); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Ensure kernel execution is finished clWaitForEvents(1 , &myEvent); // compute gpu_time // cl_ulong endTime, startTime; clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START,sizeof(startTime), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,sizeof(endTime), &endTime, NULL); double gpu_time = endTime-startTime; // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); printf("\nTime taken by %s %s is %0.3f ms\n\n", vendor_name, device_name, gpu_time/1000000.0); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; } int main(int argc, char* const argv[]) { cl_uint num_devices, i; clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); cl_device_id* devices = calloc(sizeof(cl_device_id), num_devices); clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); char buf[128]; for (i = 0; i < num_devices; i++) { clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 128, buf, NULL); fprintf(stdout, "Vendor \"%s\" ", buf); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 128, buf, NULL); fprintf(stdout, "Device \"%s\" supports ", buf); clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, 128, buf, NULL); fprintf(stdout, "\"%s\"\n\n", buf); testdevice(devices[i]); } free(devices); }

The complete Project folder with Makefile (with other demo programs) is here


1 comment:

Tom Bond said...

In the OpenCL example do you have instructions on how to download and run on the iPhone? We are looking at OpenCL for iPad and iPhone but looks like it has no support by Apple and we are looking to see if we can get enough information to see if it's worth looking at further but with very little documentation don't know if that will be possible.