cl_vec3_add.c
/*
* This confidential and proprietary software should be used
* under the licensing agreement from Allwinner Technology.
*
* Copyright (C) 2020 Allwinner Technology Limited
* All rights reserved.
*
* Author: Albert Yu <yuxyun@allwinnertech.com>
*
* The entire notice above must be reproduced on all authorised
* copies and copies may only be made to the extent permitted
* by a licensing agreement from Allwinner Technology Limited.
*/
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#define DATA_SIZE 3
#define KERNEL_SRC_FILE "cl_vec3_add.cl"
/*
 * Host program for the vec3_add OpenCL demo.
 *
 * Loads the kernel source from KERNEL_SRC_FILE, builds it for the first GPU
 * device of the first platform, uploads two DATA_SIZE-element integer vectors,
 * runs the "vec3_add" kernel and prints the CPU-side sum next to the value
 * read back from the device.
 *
 * Returns 0 when every step succeeded, -1 as soon as any step fails.
 */
int main(void)
{
    int i;
    int ret = -1;                       /* pessimistic default, cleared on success */
    long file_len;
    FILE *fp = NULL;
    char *source_str = NULL;
    const char *source_ptr;
    int a[DATA_SIZE] = { 1, 2, 3 };
    int b[DATA_SIZE] = { 4, 5, 6 };
    int c[DATA_SIZE] = { 0 };
    size_t source_size;
    cl_int cl_ret;
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    /*
     * Every handle starts out NULL so that the shared cleanup path below can
     * tell which objects were actually created, no matter where we bailed out.
     */
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_mem mem_a = NULL;
    cl_mem mem_b = NULL;
    cl_mem mem_c = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    size_t global_item_size = DATA_SIZE;
    size_t local_item_size = DATA_SIZE;

    /* Binary mode keeps the ftell() offset equal to the number of bytes read. */
    fp = fopen(KERNEL_SRC_FILE, "rb");
    if (!fp) {
        printf("Failed to open %s\n", KERNEL_SRC_FILE);
        goto exit;
    }

    if (fseek(fp, 0, SEEK_END)) {
        printf("Failed to seek in %s\n", KERNEL_SRC_FILE);
        goto exit;
    }

    /* ftell() reports errors as -1; check before converting to size_t. */
    file_len = ftell(fp);
    if (file_len <= 0) {
        printf("Failed to get the size of %s (or file is empty)\n", KERNEL_SRC_FILE);
        goto exit;
    }
    rewind(fp);

    source_size = (size_t)file_len;
    source_str = malloc(source_size);
    if (!source_str) {
        printf("Failed to allocate memory for source_str\n");
        goto exit;
    }

    /* Step 1: read kernel source code from kernel file */
    source_size = fread(source_str, 1, source_size, fp);
    if (source_size == 0 || ferror(fp)) {
        printf("Failed to read kernel source code from kernel file\n");
        goto exit;
    }

    /* The file is no longer needed once the source is in memory. */
    fclose(fp);
    fp = NULL;

    /* Step 2: get the platform id */
    cl_ret = clGetPlatformIDs(1, &platform_id, NULL);
    if (cl_ret != CL_SUCCESS) {
        printf("clGetPlatformIDs failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 3: get the device id */
    cl_ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
    if (cl_ret != CL_SUCCESS) {
        printf("clGetDeviceIDs failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 4: create an OpenCL context */
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &cl_ret);
    if (!context || cl_ret != CL_SUCCESS) {
        printf("clCreateContext failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 5: create a command-queue on the specific device */
    command_queue = clCreateCommandQueue(context, device_id, 0, &cl_ret);
    if (!command_queue || cl_ret != CL_SUCCESS) {
        printf("clCreateCommandQueue failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 6: allocate buffers on the device */
    mem_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(a), NULL, &cl_ret);
    if (!mem_a || cl_ret != CL_SUCCESS) {
        printf("Failed to allocate buffer for a, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    mem_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(b), NULL, &cl_ret);
    if (!mem_b || cl_ret != CL_SUCCESS) {
        printf("Failed to allocate buffer for b, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    mem_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(c), NULL, &cl_ret);
    if (!mem_c || cl_ret != CL_SUCCESS) {
        printf("Failed to allocate buffer for c, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 7: copy the data to the device (blocking writes) */
    cl_ret = clEnqueueWriteBuffer(command_queue, mem_a, CL_TRUE, 0, sizeof(a), a, 0, NULL, NULL);
    if (cl_ret != CL_SUCCESS) {
        printf("clEnqueueWriteBuffer for a failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    cl_ret = clEnqueueWriteBuffer(command_queue, mem_b, CL_TRUE, 0, sizeof(b), b, 0, NULL, NULL);
    if (cl_ret != CL_SUCCESS) {
        printf("clEnqueueWriteBuffer for b failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /*
     * Step 8: create a program object from the kernel source.
     * The buffer is not NUL-terminated, so its length is passed explicitly.
     */
    source_ptr = source_str;
    program = clCreateProgramWithSource(context, 1, &source_ptr, &source_size, &cl_ret);
    if (!program || cl_ret != CL_SUCCESS) {
        printf("clCreateProgramWithSource failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 9: build the program */
    cl_ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (cl_ret != CL_SUCCESS) {
        printf("clBuildProgram failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 10: create a kernel object */
    kernel = clCreateKernel(program, "vec3_add", &cl_ret);
    if (!kernel || cl_ret != CL_SUCCESS) {
        printf("clCreateKernel failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 11: set the arguments of the kernel */
    cl_ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_a);
    if (cl_ret != CL_SUCCESS) {
        printf("clSetKernelArg for mem_a failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    cl_ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_b);
    if (cl_ret != CL_SUCCESS) {
        printf("clSetKernelArg for mem_b failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    cl_ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_c);
    if (cl_ret != CL_SUCCESS) {
        printf("clSetKernelArg for mem_c failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 12: enqueue a command to execute the kernel on the device */
    cl_ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                                    &global_item_size, &local_item_size, 0, NULL, NULL);
    if (cl_ret != CL_SUCCESS) {
        printf("clEnqueueNDRangeKernel failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Step 13: read back the result from the device (blocking read) */
    cl_ret = clEnqueueReadBuffer(command_queue, mem_c, CL_TRUE, 0, sizeof(c), c, 0, NULL, NULL);
    if (cl_ret != CL_SUCCESS) {
        printf("clEnqueueReadBuffer failed, cl_ret=%d\n", cl_ret);
        goto exit;
    }

    /* Show the result */
    for (i = 0; i < DATA_SIZE; i++) {
        printf("%d + %d -> expected result: %d, opencl result: %d\n",
               a[i], b[i], a[i] + b[i], c[i]);
    }

    ret = 0;

exit:
    /* Single cleanup path: only objects that were really created are released. */
    if (fp)
        fclose(fp);
    free(source_str);

    /* Wait for any outstanding commands before tearing the objects down. */
    if (command_queue)
        clFinish(command_queue);
    if (kernel)
        clReleaseKernel(kernel);
    if (program)
        clReleaseProgram(program);
    if (mem_a)
        clReleaseMemObject(mem_a);
    if (mem_b)
        clReleaseMemObject(mem_b);
    if (mem_c)
        clReleaseMemObject(mem_c);
    if (command_queue)
        clReleaseCommandQueue(command_queue);
    if (context)
        clReleaseContext(context);

    return ret;
}
计算脚本:cl_vec3_add.cl
/*
 * vec3_add - element-wise addition of two integer vectors.
 *
 * Each work-item picks the element selected by its global id in dimension 0
 * and stores a[idx] + b[idx] into c[idx].
 */
__kernel void vec3_add(__global const int *a,
                       __global const int *b,
                       __global int *c)
{
    const int idx = get_global_id(0);

    c[idx] = a[idx] + b[idx];
}
编译:
/opt/A133/buildroot/buildroot-20200212-Qt-sunxi-C/output/host/bin/aarch64-linux-gnu-gcc -o /mnt/hgfs/D/opencl_test cl_vec3_add.c -I/opt/A133/tina4/package/libs/libgpu-opencl/ge8300/3rdparty/include/khronos/ -lOpenCL
执行结果:
# /usr/bin/opencl
1 + 4 -> expected result: 5, opencl result: 5
2 + 5 -> expected result: 7, opencl result: 7
3 + 6 -> expected result: 9, opencl result: 9
#
计算脚本改成乘法:cl_vec3_add.cl
/*
 * Multiplication variant of the demo kernel.
 *
 * The kernel keeps the name "vec3_add" so the unchanged host program, which
 * looks the kernel up by that name, can still run it. Each work-item stores
 * the product a[i] * b[i] into c[i] for the element selected by its global id
 * in dimension 0.
 */
__kernel void vec3_add(__global const int *a, __global const int *b, __global int *c)
{
    int i = get_global_id(0);

    c[i] = a[i] * b[i];
}
执行结果:
# /usr/bin/opencl
1 + 4 -> expected result: 5, opencl result: 4
2 + 5 -> expected result: 7, opencl result: 10
3 + 6 -> expected result: 9, opencl result: 18
#
离线
感谢分享。我用a133跑这个opencl_demo 计算正常。但是作为onnxruntime 的后端,跑不过onnxruntime test。失败原因是计算结果不一致
同时跑qt 使用gpu 做后端刷图时候会花屏,请问有遇到过么
离线
opencl 只会跑demo,不会onnxruntime
我这边 Qt跑OpenGL DEMO正常,没有花屏。
离线
opencl 只会跑demo,不会onnxruntime
我这边 Qt跑OpenGL DEMO正常,没有花屏。
opengl demo是格子吧。我看你好像移植了qt5. 试下跑
export QT_QPA_PLATFORM=eglfs && \
export QT_QPA_EGLFS_INTEGRATION=none && \
/usr/share/qt5/examples/opengl/hellogles3/hellogles3
离线
@微凉VeiLiang
https://www.bilibili.com/video/BV1fU4y1z7Rp/
https://www.bilibili.com/video/BV1LK411f7x9
跑这个 cellphone 等Qt demo都是正常的
离线
@微凉VeiLiang
https://www.bilibili.com/video/BV1fU4y1z7Rp/
https://www.bilibili.com/video/BV1LK411f7x9
跑这个 cellphone 等Qt demo都是正常的
嗯呢,难道和我的分辨率有关。使用的是1920*1080分辨率。或者说内存速度。我也有小智的a133的开发板,方便分享个系统烧录镜像么,之前我测试在开发板上面也是跑OpenGL的demo有异常的
离线
离线
感谢,这边看到的是左右错位花屏来回跳。你这个解决方案我试下看看。
最近编辑记录 微凉VeiLiang (2024-08-26 16:17:19)
离线
memory 说:感谢,这边看到的是左右错位花屏来回跳。你这个解决方案我试下看看。
还是一样的,不知道是什么情况。跑手机的demo视频如下
离线
tplayer播放mp4文件是正常的。QT如果不使用到gpu也是正常的
离线
tplayer播放mp4文件是正常的。QT如果不使用到gpu也是正常的
我用的是这个开发板: https://item.taobao.com/item.htm?id=747735085973
cellphone测试固件:
tina_a133-dock2_uart0_buildroot-20200212-Qt-sunxi-C_202408252225.img.7z
离线
@memory
谢谢,我的是这个开发板。不过屏幕没有,我捣鼓捣鼓
离线
离线
我使用上面的板子和固件,找到了一个屏幕800*480的。发现开机后进入系统后没过几秒就会卡死。然后趁着这几秒如果kill掉 /usr/bin/cellphone 就不会卡死。经过处理掉开机自启动后,手动运行,也是会卡死花屏,这个花屏是死机那种花屏。如下视频所示,这个不知道是因为硬件差异还是什么,供电的话也加多了一个供电的usb线了。现象是一样的
离线
@微凉VeiLiang
我问了一下店主,说可能是tina4只适配了1G,2G内存的问题,
4G版本可能需要微调一下 sys_config.fex ddr 参数。
离线
@微凉VeiLiang
我问了一下店主,说可能是tina4只适配了1G,2G内存的问题,4G版本可能需要微调一下 sys_config.fex ddr 参数。
好的,感谢。我买的这个确实是4G版本
离线
把 DDR 频率从600M调低到360M 初步测试OK:
tina_a133-dock2_uart0_buildroot-20200212-Qt-sunxi-C_202408271120.img.7z
离线
@memory
感谢,这个固件在开发板上面可以正常跑了。我看看我的板子是不是也是内存频率或者配置的问题
离线
尝试了好几样操作都不行.....有点难搞
1.降低ddr频率,也降到了360Mhz,问题依旧
2.把开发板上能够使用的两个ko替换过去,重启没啥报错,但是问题依旧
3.把开发板能够运行的 libgpu里面同样的so文件替换,软连接好,后编译出固件,问题依旧
4.修改几项设备树gpu相关参数和开发板的固件一样,依旧无果....
想问下能否单独分享一个kernel镜像和文件系统镜像。boot.img 和rootfs.img 。主要是论坛前面找了好几个解包软件都不好使,非常感谢
离线
gpu频率也调过几个版本,好像也是没有效果
离线
gpu频率也调过几个版本,好像也是没有效果
抱歉,Tina5上面我们没有测试过Qt OpenGL,试一试Tina4(需要自行勾选Qt编译打包):
链接: https://pan.baidu.com/s/16aSGIsIlPa_K2euZVltRKA?pwd=A133
提取码:A133
离线
微凉VeiLiang 说:gpu频率也调过几个版本,好像也是没有效果
抱歉,Tina5上面我们没有测试过Qt OpenGL,试一试Tina4(需要自行勾选Qt编译打包):
链接: https://pan.baidu.com/s/16aSGIsIlPa_K2euZVltRKA?pwd=A133
提取码:A133
好的,感谢,我下载看看
离线
@微凉VeiLiang
找到能够使用的解包工具了
imgRePacker_206.zip
离线
我在使用opencl做并行计算,发现GPU比CPU还慢,原因为内存瓶颈,还有一个,大佬有使用GPU刷屏的demo吗?正好需要一份
离线
不使用QT,使用lvgl刷屏
离线
换成4.0的sdk就好了.....1920*1080也不会花屏
离线