CUDA编程函数声明
CUDA核函数(kernels)在N个不同的CUDA线程上并行执行

// Kernel definition: element-wise vector addition, C = A + B.
// Launch layout: one 1-D block; thread i handles element i, so the
// launch must use exactly as many threads as there are elements.
__global__ void VecAdd(float* A, float* B, float* C)
{
    int i = threadIdx.x;
    C[i] = A[i] + B[i];
}

int main()
{
    // ...
    // Kernel invocation: runs in parallel on N different CUDA threads
    // (1 block of N threads).
    VecAdd<<<1, N>>>(A, B, C);
}

线程层次(Thread Hierarchies)
// ---- Single thread block ----

// Kernel definition: element-wise matrix addition, C = A + B.
// Launch layout: one 2-D block of N x N threads; thread (x, y) handles
// element C[x][y].
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = threadIdx.x;
    int j = threadIdx.y;
    C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    // ...
    // Kernel invocation: runs in parallel on N * N * 1 different CUDA threads.
    int numBlocks = 1;
    dim3 threadsPerBlock(N, N);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
}

// ---- Multiple thread blocks ----

// Kernel definition: element-wise matrix addition, C = A + B.
// Launch layout: a 2-D grid of 2-D blocks. The global index is
// block index * block size + thread index within the block.
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard: the grid may cover more threads than there are elements.
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    // ...
    // Kernel invocation (parallel execution).
    dim3 threadsPerBlock(16, 16);
    // Round up so the grid also covers N when it is not a multiple of 16;
    // the bounds check in the kernel discards the surplus threads.
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    MatAdd<<<numBlocks, threadsPerBlock>>>(A, B, C);
}

/*
 * Example: N = 32 with 16 x 16 blocks
 *   i = [0,1] * 16 + [0,15]
 */

CUDA内存传输
主机端可以从设备端往返传输数据
Global memory 全局存储器
Constant memory 常量存储器
float* Md;  // points to a storage region on the device
// Byte count of a Width x Width float matrix; computed in size_t so the
// product cannot overflow int for large Width.
size_t size = (size_t)Width * Width * sizeof(float);
cudaMalloc((void**)&Md, size);
// ...
cudaFree(Md);
// cudaMemcpy arguments: destination address, source address, size in bytes,
// transfer direction.
cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);  // host M   -> device Md
cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);  // device Pd -> host P
例子:矩阵相乘

// ---- CPU implementation ----

// Computes P = M * N for square width x width matrices stored row-major
// in flat arrays.
void MatrixMulOnHost(float* M, float* N, float* P, int width)
{
    for (int i = 0; i < width; ++i)
        for (int j = 0; j < width; ++j) {
            float sum = 0;
            // Dot product of row i of M with column j of N.
            for (int k = 0; k < width; ++k) {
                float a = M[i * width + k];
                float b = N[k * width + j];
                sum += a * b;
            }
            P[i * width + j] = sum;
        }
}

// ---- Skeleton of a CUDA program (3 steps) ----
int main(void)
{
    // 1. Manage memory: allocate space for the data and copy it to the GPU.
    // 2. Run the computation in parallel on the GPU.
    // 3. Copy the result back to the CPU.
}

// ---- GPU implementation ----

// Kernel: each thread computes one element of Pd = Md * Nd.
// Launch layout: a single 2-D block of Width x Width threads; thread
// (tx, ty) produces Pd[ty][tx]. Matrices are row-major flat arrays in
// device memory.
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    // 2-D thread index within the block addresses the matrix element.
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Each kernel thread accumulates one output value.
    float Pvalue = 0;

    // Dot product of row ty of Md with column tx of Nd.
    for (int k = 0; k < Width; ++k) {
        float Mdelement = Md[ty * Width + k];
        float Ndelement = Nd[k * Width + tx];
        Pvalue += Mdelement * Ndelement;
    }

    Pd[ty * Width + tx] = Pvalue;
}

// Host wrapper: computes P = M * N on the GPU for Width x Width row-major
// matrices held in host memory.
// NOTE: the kernel runs as one block of Width * Width threads, so Width is
// bounded by the device's per-block thread limit (1024 threads on current
// GPUs, i.e. Width <= 32). Return codes are not checked here for brevity;
// check them in real code.
void MatrixMulOnDevice(float* M, float* N, float* P, int Width)
{
    // Byte count computed in size_t to avoid int overflow.
    size_t size = (size_t)Width * Width * sizeof(float);
    float* Md = NULL;
    float* Nd = NULL;
    float* Pd = NULL;

    // 1. Manage memory: allocate device buffers and copy the inputs to the GPU.
    cudaMalloc((void**)&Md, size);
    cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice);
    cudaMalloc((void**)&Nd, size);
    cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice);
    // Output buffer only needs allocating, not initializing.
    cudaMalloc((void**)&Pd, size);

    // 2. Run the computation in parallel on the GPU:
    //    one block containing Width * Width threads.
    dim3 dimBlock(Width, Width);
    dim3 dimGrid(1, 1);
    MatrixMulKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, Width);

    // 3. Copy the result back to the CPU (destination first, then source).
    cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost);

    // Release device memory.
    cudaFree(Md);
    cudaFree(Nd);
    cudaFree(Pd);
}

主要性能问题:访存
来源:https://www./content-1-820301.html |
|