OpenCL学习笔记

astrotycoon 2019-04-28

展开全文

OpenCL作为一门开源的异构并行计算语言，设计之初就是使用一种模型来模糊各种硬件差异。作为软件开发人员，我们关注的就是它的编程模型。OpenCL程序的流程大致如下：

Platform
Running time
资源回收
总结
示例代码

下面我们通过一个具体的示例程序来说明这些步骤。

使用 OpenCL API 编程与一般 C/C++ 引入第三方库编程没什么区别。所以，首先要做的自然是 include 相关的头文件。由于在 MacOS X 10.6下OpenCL的头文件命名与其他系统不同，通常使用一个#if defined进行区分，代码如下：

#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.h>
#endif

接下来我们就进入真正的编码流程了。

Platform

查询并选择一个 platform

首先我们要取得系统中所有的 OpenCL platform。所谓的 platform 指的就是硬件厂商提供的 OpenCL 框架，不同的 CPU/GPU 开发商（比如 Intel、AMD、Nvdia）可以在一个系统上分别定义自己的 OpenCL 框架。所以我们需要查询系统中可用的 OpenCL 框架，即 platform。使用 API 函数 clGetPlatformIDs 获取可用 platform 的数量：

cl_int status = 0;
cl_uint numPlatforms;
cl_platform_id platform = NULL;
status = clGetPlatformIDs( 0, NULL, &numPlatforms);

if(status != CL_SUCCESS){
    printf("Error: Getting Platforms\n");
    return EXIT_FAILURE;
}

然后根据数量来分配内存，并得到所有可用的 platform，所使用的 API 还是 clGetPlatformIDs。在 OpenCL 中，类似这样的函数调用很常见：第一次调用以取得数目，便于分配足够的内存；然后调用第二次以获取真正的信息。

if (numPlatforms > 0) {
    cl_platform_id *platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
    status = clGetPlatformIDs(numPlatforms, platforms, NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Platform Ids.(clGetPlatformIDs)\n");
        return -1;
    }

现在，所有的 platform 都存在了变量 platforms 中，接下来需要做的就是取得我们所需的 platform。本人的PC上配置的是 Intel 处理器和 AMD 显卡，专业点的说法叫 Intel 的 CPU 和 AMD 的 GPU :)。所以我这儿有两套 platform，为了体验下 GPU 的快感，所以使用 AMD 的 platform。通过使用 clGetPlatformInfo 来获得 platform 的信息。通过这个 API 可以知晓 platform 的厂商信息，以便我们选出需要的 platform。代码如下：

for (unsigned int i = 0; i < numPlatforms; ++i) {
        char pbuff[100];
        status = clGetPlatformInfo(
                     platforms[i],
                     CL_PLATFORM_VENDOR,
                     sizeof(pbuff),
                     pbuff,
                     NULL);
        platform = platforms[i];
        if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
            break;
        }
    }

不同的厂商信息可以参考 OpenCL Specifications，我这儿只是简单的筛选出 AMD 。

在 platform 上建立 context

第一步是通过 platform 得到相应的 context properties

// 如果我们能找到相应平台，就使用它，否则返回NULL
cl_context_properties cps[3] = {
    CL_CONTEXT_PLATFORM,
    (cl_context_properties)platform,
    0
};

cl_context_properties *cprops = (NULL == platform) ? NULL : cps;

第二步是通过 clCreateContextFromType 函数创建 context。

// 生成 context
cl_context context = clCreateContextFromType(
                         cprops,
                         CL_DEVICE_TYPE_GPU,
                         NULL,
                         NULL,
                         &status);
if (status != CL_SUCCESS) {
    printf("Error: Creating Context.(clCreateContexFromType)\n");
    return EXIT_FAILURE;
} 函数的第二个参数可以设定 context 关联的设备类型。本例使用的是 GPU 作为OpenCL计算设备。目前可以使用的类别包括：

- CL_DEVICE_TYPE_CPU
- CL_DEVICE_TYPE_GPU
- CL_DEVICE_TYPE_ACCELERATOR
- CL_DEVICE_TYPE_DEFAULT
- CL_DEVICE_TYPE_ALL

在 context 上查询 device ###

context 创建好之后，要做的就是查询可用的 device。

status = clGetContextInfo(context,
                          CL_CONTEXT_DEVICES,
                          0,
                          NULL,
                          &deviceListSize);
if (status != CL_SUCCESS) {
    printf("Error: Getting Context Info device list size, clGetContextInfo)\n");
    return EXIT_FAILURE;
}
cl_device_id *devices = (cl_device_id *)malloc(deviceListSize);
if (devices == 0) {
    printf("Error: No devices found.\n");
    return EXIT_FAILURE;
}

status = clGetContextInfo(context,
                          CL_CONTEXT_DEVICES,
                          deviceListSize,
                          devices,
                          NULL);
if (status != CL_SUCCESS) {
    printf("Error: Getting Context Info (device list, clGetContextInfo)\n");
    return EXIT_FAILURE;
}

与获取 platform 类似，我们调用两次 clGetContextInfo 来完成 查询。第一次调用获取关联 context 的 device 个数，并根据个数申请内存；第二次调用获取所有 device 实例。如果你想了解每个 device 的具体信息，可以调用 clGetDeviceInfo 函数来获取，返回的信息有设备类型、生产商以及设备对某些扩展功能的支持与否等等。详细使用情况请参阅 OpenCL Specifications。

到此，platform 相关的程序已经准备就绪了，下面到此的完整代码：

/* OpenCL_01.cpp 
 * (c) by keyring <keyrings@163.com>
 * 2013.10.26
 */

#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.h>
#endif

#include <iostream>

int main(int argc, char const *argv[])
{
    printf("hello OpenCL\n");
    cl_int status = 0;
    size_t deviceListSize;

    // 得到并选择可用平台
    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    if (status != CL_SUCCESS) {
        printf("ERROR: Getting Platforms.(clGetPlatformIDs)\n");
        return EXIT_FAILURE;
    }

    if (numPlatforms > 0) {
        cl_platform_id *platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (status != CL_SUCCESS) {
            printf("Error: Getting Platform Ids.(clGetPlatformIDs)\n");
            return -1;
        }

		// 遍历所有 platform，选择你想用的
        for (unsigned int i = 0; i < numPlatforms; ++i) {
            char pbuff[100];
            status = clGetPlatformInfo(
                         platforms[i],
                         CL_PLATFORM_VENDOR,
                         sizeof(pbuff),
                         pbuff,
                         NULL);
            platform = platforms[i];
            if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
                break;
            }
        }

        delete platforms;
    }

    // 如果我们能找到相应平台，就使用它，否则返回NULL
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    cl_context_properties *cprops = (NULL == platform) ? NULL : cps;


    // 生成 context
    cl_context context = clCreateContextFromType(
                             cprops,
                             CL_DEVICE_TYPE_GPU,
                             NULL,
                             NULL,
                             &status);
    if (status != CL_SUCCESS) {
        printf("Error: Creating Context.(clCreateContexFromType)\n");
        return EXIT_FAILURE;
    }

    // 寻找OpenCL设备

    // 首先得到设备列表的长度
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              0,
                              NULL,
                              &deviceListSize);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Context Info device list size, clGetContextInfo)\n");
        return EXIT_FAILURE;
    }
    cl_device_id *devices = (cl_device_id *)malloc(deviceListSize);
    if (devices == 0) {
        printf("Error: No devices found.\n");
        return EXIT_FAILURE;
    }

    // 然后得到设备列表
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              deviceListSize,
                              devices,
                              NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Context Info (device list, clGetContextInfo)\n");
        return EXIT_FAILURE;
    }

Running time ##

前面写了一大篇，其实还没真正进入具体的程序逻辑中，顶多算配好了 OpenCL 运行环境。真正的逻辑代码，即程序的任务就是运行时模块。本例的任务是在一个 4×4的二维空间上，按一定的规则给每个元素赋值，具体代码如下：

#define KERNEL(...)#__VA_ARGS__

const char *kernelSourceCode = KERNEL(
                                   __kernel void hellocl(__global uint *buffer)
{
    size_t gidx = get_global_id(0);
    size_t gidy = get_global_id(1);
    size_t lidx = get_local_id(0);
    buffer[gidx + 4 * gidy] = (1 << gidx) | (0x10 << gidy);

}
                               );

这一段就是我们真正的逻辑，也就是代码要干的事。使用的是 OpenCL 自定的一门类C语言，具体的语法什么的现在先不纠结。这段代码是直接嵌入我们的 cpp 文件的静态字符串。你也可以将 kernel 程序单独写成一个文件。

加载 OpenCL 内核程序并创建一个 program 对象 ###

接下来要做的就是读入 OpenCL kernel 程序并创建一个 program 对象。

size_t sourceSize[] = {strlen(kernelSourceCode)};
cl_program program = clCreateProgramWithSource(context,
                     1,
                     &kernelSourceCode,
                     sourceSize,
                     &status);
if (status != CL_SUCCESS) {
    printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
    return EXIT_FAILURE;
}

本例中的 kernel 程序是作为静态字符串读入的（单独的文本文件也一样），所以使用的是 clCreateProgramWithSource，如果你不想让 kernel 程序让其他人看见，可以先生成二进制文件，再通过 clCreateProgramWithBinary 函数动态读入二进制文件，做一定的保密。详细请参阅 OpenCL Specifications。

为指定的 device 编译 program 中的 kernel ###

kernel 程序读入完毕，要做的自然是使用 clBuildProgram 编译 kernel：

status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
if (status != CL_SUCCESS) {
    printf("Error: Building Program (clBuildingProgram)\n");
    return EXIT_FAILURE;
}

最终，kernel 将被相应 device 上的 OpenCL 编译器编译成可执行的机器码。

创建指定名字的 kernel 对象 ###

成功编译后，可以通过 clCreateKernel 来创建一个 kernel 对象。

cl_kernel kernel = clCreateKernel(program, "hellocl", &status);
if (status != CL_SUCCESS) {
    printf("Error: Creating Kernel from program.(clCreateKernel)\n");
    return EXIT_FAILURE;
}

引号中的 hellocl 就是 kernel 对象所关联的 kernel 函数的函数名。要注意的是，每个 kernel 对象必须关联且只能关联一个包含于相应 program 对象内的 kernel 程序。实际上，用户可以在 cl 源代码中写任意多个 kernel 程序，但在执行某个 kernel 程序之前必须先建立单独的 kernel 对象，即多次调用 clCreateKernel 函数。

为 kernel 创建内存对象 ###

OpenCL 内存对象是指在 host 中创建，用于 kernel 程序的内存类型。按维度可以分为两类，一类是 buffer，一类是 image。buffer 是一维的，image 可以是二维、三维的 texture、frame-buffer 或 image。本例仅仅使用 buffer，可以通过 clCreateBuffer 函数来创建。

cl_mem outputBuffer = clCreateBuffer(
									context, 
									CL_MEM_ALLOC_HOST_PTR, 
									4 * 4 * 4, 
									NULL, 
									&status);
if (status != CL_SUCCESS) {
    printf("Error: Create Buffer, outputBuffer. (clCreateBuffer)\n");
    return EXIT_FAILURE;
}

为 kernel 设置参数 ###

使用 clSetKernelArg 函数为 kernel 设置参数。传递的参数既可以是常数，变量，也可以是内存对象。本例传递的就是内存对象。

status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputBuffer);
if (status != CL_SUCCESS) {
    printf("Error: Setting kernel argument. (clSetKernelArg)\n");
    return EXIT_FAILURE;
}

该函数每次只能设置一个参数，如有多个参数，需多次调用。而且 kernel 程序中所有的参数都必须被设置，否则在启动 kernel 程序是会报错。指定位置的参数的类型最好和对应 kernel 函数内参数类型一致，以免产生各种未知的错误。在设置好指定参数后，每次运行该 kernel 程序都会使用设置值，直到用户使用次 API 重新设置参数。

在指定的 device 上创建 command queue

command queue 用于光里将要执行的各种命令。可以通过 clCreateCommandQueue 函数创建。其中的 device 必须为 context 的关联设备，所有该 command queue 中的命令都会在这个指定的 device 上运行。

cl_command_queue commandQueue = clCreateCommandQueue(context,
                                devices[0],
                                0,
                                &status);
if (status != CL_SUCCESS) {
    printf("Error: Create Command Queue. (clCreateCommandQueue)\n");
    return EXIT_FAILURE;
}

将要执行的 kernel 放入 command queue ###

创建好 command queue 后，用户可以创建相应的命令并放入 command queue 中执行。OpenCL 提供了三种方案来创建 kernel 执行命令。最常用的即为本例所示的运行在指定工作空间上的 kernel 程序，使用了 clEnqueueNDRangeKernel 函数。

size_t globalThreads[] = {4, 4};
size_t localThreads[] = {2, 2};
status = clEnqueueNDRangeKernel(commandQueue, kernel,
                                2, NULL, globalThreads,
                                localThreads, 0,
                                NULL, NULL);
if (status != CL_SUCCESS) {
    printf("Error: Enqueueing kernel\n");
    return EXIT_FAILURE;
}

clEnqueueNDRangeKernel 函数每次只能将一个 kernel 对象放入 command queue 中，用户可以多次调用该 API 将多个 kernel 对象放置到一个 command queue 中，command queue 中的不同 kernel 对象的工作区域完全不相关。其余两个 API clEnqueueTask 和 clEnqueueNativeKernel 的用法就不多讲了，详情请参阅 OpenCL Specificarions。

最后可以用 clFinish 函数来确认一个 command queue 中所有的命令都执行完毕。函数会在 command queue 中所有 kernel 执行完毕后返回。

// 确认 command queue 中所有命令都执行完毕
status = clFinish(commandQueue);
if (status != CL_SUCCESS) {
    printf("Error: Finish command queue\n");
    return EXIT_FAILURE;
}

将结果读回 host ###

计算完毕，将结果读回 host 端。使用 clEnqueueReadBuffer 函数将 OpenCL buffer 对象中的内容读取到 host 可以访问的内存空间。

// 将内存对象中的结果读回Host
status = clEnqueueReadBuffer(commandQueue,
                             outputBuffer, CL_TRUE, 0,
                             4 * 4 * 4, outbuffer, 0, NULL, NULL);
if (status != CL_SUCCESS) {
    printf("Error: Read buffer queue\n");
    return EXIT_FAILURE;
}

当然，为了看下程序的运行效果，咱们当然得看看运行结果啦。打印一下吧：

// Host端打印结果
printf("out:\n");
for (int i = 0; i < 16; ++i) {
    printf("%x ", outbuffer[i]);
    if ((i + 1) % 4 == 0)
        printf("\n");
}

资源回收 ##

程序的最后是对所有创建的对象进行释放回收，与C/C++的内存回收同理。

// 资源回收
status = clReleaseKernel(kernel);
status = clReleaseProgram(program);
status = clReleaseMemObject(outputBuffer);
status = clReleaseCommandQueue(commandQueue);
status = clReleaseContext(context);

free(devices);
delete outbuffer;

总结 ##

这次使用一个小例子来详细说明了 OpenCL 编程的一般步骤。其实这些步骤一般都是固定的。真正需要我们注意的是 OpenCL Kernel 程序的编写。当然，合理高效的利用 API 也是一门技术活。

最后给出本实例的全部代码：

/*	OpenCL_01.cpp
 *	(c) by keyring <keyrings@163.com>
 *	2013.10.26
 */

#include <iostream>

#if defined(__APPLE__) || defined(__MACOSX)
#include <OpenCL/cl.hpp>
#else
#include <CL/cl.h>
#endif


#define KERNEL(...)#__VA_ARGS__

const char *kernelSourceCode = KERNEL(
                                   __kernel void hellocl(__global uint *buffer)
{
    size_t gidx = get_global_id(0);
    size_t gidy = get_global_id(1);
    size_t lidx = get_local_id(0);
    buffer[gidx + 4 * gidy] = (1 << gidx) | (0x10 << gidy);

}
                               );

int main(int argc, char const *argv[])
{
    printf("hello OpenCL\n");
    cl_int status = 0;
    size_t deviceListSize;

    // 得到并选择可用平台
    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);

    if (status != CL_SUCCESS) {
        printf("ERROR: Getting Platforms.(clGetPlatformIDs)\n");
        return EXIT_FAILURE;
    }

    if (numPlatforms > 0) {
        cl_platform_id *platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if (status != CL_SUCCESS) {
            printf("Error: Getting Platform Ids.(clGetPlatformIDs)\n");
            return -1;
        }

        for (unsigned int i = 0; i < numPlatforms; ++i) {
            char pbuff[100];
            status = clGetPlatformInfo(
                         platforms[i],
                         CL_PLATFORM_VENDOR,
                         sizeof(pbuff),
                         pbuff,
                         NULL);
            platform = platforms[i];
            if (!strcmp(pbuff, "Advanced Micro Devices, Inc.")) {
                break;
            }
        }

        delete platforms;
    }

    // 如果我们能找到相应平台，就使用它，否则返回NULL
    cl_context_properties cps[3] = {
        CL_CONTEXT_PLATFORM,
        (cl_context_properties)platform,
        0
    };

    cl_context_properties *cprops = (NULL == platform) ? NULL : cps;


    // 生成 context
    cl_context context = clCreateContextFromType(
                             cprops,
                             CL_DEVICE_TYPE_GPU,
                             NULL,
                             NULL,
                             &status);
    if (status != CL_SUCCESS) {
        printf("Error: Creating Context.(clCreateContexFromType)\n");
        return EXIT_FAILURE;
    }

    // 寻找OpenCL设备

    // 首先得到设备列表的长度
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              0,
                              NULL,
                              &deviceListSize);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Context Info device list size, clGetContextInfo)\n");
        return EXIT_FAILURE;
    }
    cl_device_id *devices = (cl_device_id *)malloc(deviceListSize);
    if (devices == 0) {
        printf("Error: No devices found.\n");
        return EXIT_FAILURE;
    }

    // 现在得到设备列表
    status = clGetContextInfo(context,
                              CL_CONTEXT_DEVICES,
                              deviceListSize,
                              devices,
                              NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Getting Context Info (device list, clGetContextInfo)\n");
        return EXIT_FAILURE;
    }


    // 装载内核程序，编译CL program ,生成CL内核实例

    size_t sourceSize[] = {strlen(kernelSourceCode)};
    cl_program program = clCreateProgramWithSource(context,
                         1,
                         &kernelSourceCode,
                         sourceSize,
                         &status);
    if (status != CL_SUCCESS) {
        printf("Error: Loading Binary into cl_program (clCreateProgramWithBinary)\n");
        return EXIT_FAILURE;
    }

    // 为指定的设备编译CL program.
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Building Program (clBuildingProgram)\n");
        return EXIT_FAILURE;
    }

    // 得到指定名字的内核实例的句柄
    cl_kernel kernel = clCreateKernel(program, "hellocl", &status);
    if (status != CL_SUCCESS) {
        printf("Error: Creating Kernel from program.(clCreateKernel)\n");
        return EXIT_FAILURE;
    }

    // 创建 OpenCL buffer 对象
    unsigned int *outbuffer = new unsigned int [4 * 4];
    memset(outbuffer, 0, 4 * 4 * 4);
    cl_mem outputBuffer = clCreateBuffer(
    	context, 
    	CL_MEM_ALLOC_HOST_PTR, 
    	4 * 4 * 4, 
    	NULL, 
    	&status);

    if (status != CL_SUCCESS) {
        printf("Error: Create Buffer, outputBuffer. (clCreateBuffer)\n");
        return EXIT_FAILURE;
    }


    //  为内核程序设置参数
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&outputBuffer);
    if (status != CL_SUCCESS) {
        printf("Error: Setting kernel argument. (clSetKernelArg)\n");
        return EXIT_FAILURE;
    }

    // 创建一个OpenCL command queue
    cl_command_queue commandQueue = clCreateCommandQueue(context,
                                    devices[0],
                                    0,
                                    &status);
    if (status != CL_SUCCESS) {
        printf("Error: Create Command Queue. (clCreateCommandQueue)\n");
        return EXIT_FAILURE;
    }


    // 将一个kernel 放入 command queue
    size_t globalThreads[] = {4, 4};
    size_t localThreads[] = {2, 2};
    status = clEnqueueNDRangeKernel(commandQueue, kernel,
                                    2, NULL, globalThreads,
                                    localThreads, 0,
                                    NULL, NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Enqueueing kernel\n");
        return EXIT_FAILURE;
    }

    // 确认 command queue 中所有命令都执行完毕
    status = clFinish(commandQueue);
    if (status != CL_SUCCESS) {
        printf("Error: Finish command queue\n");
        return EXIT_FAILURE;
    }

    // 将内存对象中的结果读回Host
    status = clEnqueueReadBuffer(commandQueue,
                                 outputBuffer, CL_TRUE, 0,
                                 4 * 4 * 4, outbuffer, 0, NULL, NULL);
    if (status != CL_SUCCESS) {
        printf("Error: Read buffer queue\n");
        return EXIT_FAILURE;
    }

    // Host端打印结果
    printf("out:\n");
    for (int i = 0; i < 16; ++i) {
        printf("%x ", outbuffer[i]);
        if ((i + 1) % 4 == 0)
            printf("\n");
    }

    // 资源回收
    status = clReleaseKernel(kernel);
    status = clReleaseProgram(program);
    status = clReleaseMemObject(outputBuffer);
    status = clReleaseCommandQueue(commandQueue);
    status = clReleaseContext(context);

    free(devices);
    delete outbuffer;

    system("pause");

    return 0;
}