当前位置：首页 > news >正文

【cuda学习日记】2.2 使用2维网格（grid）和2维块（block）对矩阵进行求和

news 2026/4/10 11:12:06

在2.0中，用一维网格和块对一维向量进行了求和。
在2.1中例化了二维的网格和块。
接下来使用2维网格（grid）和2维块（block）对矩阵进行求和。

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#ifdef _WIN32
#include <windows.h>
#endif

typedef unsigned long DWORD;

/*
 * CHECK(call): evaluate a CUDA runtime call and abort the program with the
 * file/line and the CUDA error string if it did not return cudaSuccess.
 * Wrapped in do { } while (0) so that `CHECK(x);` behaves as one statement,
 * including inside an un-braced if/else.
 */
#define CHECK(call)                                                         \
    do {                                                                    \
        const cudaError_t error = (call);                                   \
        if (error != cudaSuccess) {                                         \
            printf("Error: %s: %d\n", __FILE__, __LINE__);                  \
            printf("code :%d reason :%s\n", (int)error,                     \
                   cudaGetErrorString(error));                              \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

/*
 * Compare the host reference result with the result copied back from the GPU.
 *
 * hostRef: N floats computed on the CPU.
 * gpuRef:  N floats computed by the kernel.
 * N:       number of elements to compare.
 *
 * Prints the first mismatching element (if any) and stops there; otherwise
 * prints "array matches".
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    const double epsilon = 1.0E-8;

    for (int i = 0; i < N; i++) {
        /* fabsf, not abs: abs may resolve to the integer overload and
         * truncate a sub-1.0 difference to zero. */
        if (fabsf(hostRef[i] - gpuRef[i]) > epsilon) {
            printf("Array do not match\n");
            printf("host %5.2f gpu % 5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            return;
        }
    }
    printf("array matches\n");
}

/*
 * Fill ip[0 .. size-1] with pseudo-random values in [0.0, 25.5].
 *
 * The generator is seeded only on the first call. Re-seeding with time() on
 * every call would give two buffers filled within the same second exactly
 * the same contents.
 */
void initialData(float *ip, int size)
{
    static bool seeded = false;
    if (!seeded) {
        srand((unsigned int)time(NULL));
        seeded = true;
    }

    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xff) / 10.0f;
    }
}

/*
 * CPU reference: C = A + B for row-major matrices with nx columns and
 * ny rows, stored contiguously.
 */
void sumMatrixOnHost(float *A, float *B, float *C, const int nx, const int ny)
{
    float *ia = A;
    float *ib = B;
    float *ic = C;

    for (int iy = 0; iy < ny; iy++) {
        for (int ix = 0; ix < nx; ix++) {
            ic[ix] = ia[ix] + ib[ix];
        }
        /* advance all three pointers to the start of the next row */
        ia += nx;
        ib += nx;
        ic += nx;
    }
}

/*
 * GPU kernel: MatC = MatA + MatB, one element per thread.
 *
 * Expects a 2D grid of 2D blocks; thread (ix, iy) handles column ix of row iy
 * of a row-major nx-by-ny matrix. Threads that fall outside the matrix (when
 * the grid does not divide it evenly) do nothing.
 */
__global__ void sumMatrixOnGPU2D(float *MatA, float *MatB, float *MatC, int nx, int ny)
{
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int iy = threadIdx.y + blockIdx.y * blockDim.y;

    if (ix < (unsigned int)nx && iy < (unsigned int)ny) {
        /* size_t so the flat index cannot wrap for matrices with more than
         * 2^32 elements */
        size_t idx = (size_t)iy * (size_t)nx + ix;
        MatC[idx] = MatA[idx] + MatB[idx];
    }
}

/*
 * Adds two 16384 x 16384 float matrices on the GPU with a 2D grid of 2D
 * blocks, times the kernel with CUDA events, and verifies the result against
 * a CPU reference.
 */
int main(int argc, char **argv)
{
    (void)argc;
    printf("%s starting\n", argv[0]);

    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device %d : %s\n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));

    /* set up data */
    int nx = 1 << 14;
    int ny = 1 << 14;
    int nxy = nx * ny;
    size_t nBytes = (size_t)nxy * sizeof(float);
    printf("matrix size %d %d\n", nx, ny);

    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
        /* each buffer is 1 GiB, so host allocation failure is realistic */
        printf("host allocation of %llu bytes failed\n", (unsigned long long)nBytes);
        free(h_A);
        free(h_B);
        free(hostRef);
        free(gpuRef);
        return 1;
    }

    initialData(h_A, nxy);
    initialData(h_B, nxy);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    /* allocate device global memory */
    float *d_MatA, *d_MatB, *d_MatC;
    CHECK(cudaMalloc((float **)&d_MatA, nBytes));
    CHECK(cudaMalloc((float **)&d_MatB, nBytes));
    CHECK(cudaMalloc((float **)&d_MatC, nBytes));

    /* transfer data from host to device */
    CHECK(cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice));

    /* launch configuration: ceil-divide so the grid covers the whole matrix */
    int dimx = 32;
    int dimy = 32;
    dim3 block(dimx, dimy);
    dim3 grid((nx + block.x - 1) / block.x, (ny + block.y - 1) / block.y);

    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    CHECK(cudaEventRecord(start));
    sumMatrixOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, nx, ny);
    /* a kernel launch returns nothing; launch errors are fetched explicitly */
    CHECK(cudaGetLastError());
    /* record stop right behind the kernel so the interval does not include a
     * host-side synchronize; then wait for that event to complete */
    CHECK(cudaEventRecord(stop));
    CHECK(cudaEventSynchronize(stop));

    float milliseconds = 0;
    CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
    printf("execution config <<<(%d,%d), (%d,%d)>>>\n", grid.x, grid.y, block.x, block.y);
    printf("Kernel execution time: %f ms\n", milliseconds);

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    /* copy kernel result back to host */
    CHECK(cudaMemcpy(gpuRef, d_MatC, nBytes, cudaMemcpyDeviceToHost));

    sumMatrixOnHost(h_A, h_B, hostRef, nx, ny);
    checkResult(hostRef, gpuRef, nxy);

    CHECK(cudaFree(d_MatA));
    CHECK(cudaFree(d_MatB));
    CHECK(cudaFree(d_MatC));

    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    return 0;
}

基本流程和1维向量求和类似
输出结果：
Using Device 0 : NVIDIA GeForce RTX 4090
matrix size 16384 16384
execution config <<<(512,512), (32,32)>>>
Kernel execution time: 5.351136 ms
array matches

block的尺寸为32x32（由代码中的 dim3 block(dimx, dimy) 定义）。
改变block尺寸为32x16:
execution config <<<(512,1024), (32,16)>>>
Kernel execution time: 3.778752 ms

进一步改变block尺寸为16x16:
execution config <<<(1024,1024), (16,16)>>>
Kernel execution time: 3.712736 ms

之前尝试使用nvprof测试kernel性能时，报告如下警告：
======= Warning: nvprof is not supported on devices with compute capability 8.0 and higher.

参考 https://blog.csdn.net/TH_NUM/article/details/109952643 使用nsys
将C:\Program Files\NVIDIA Corporation\Nsight Systems 2024.5.1\target-windows-x64加入环境变量即可

nsys profile --stats=true .\sum_matrix_on_gpu_timer.exe

输出：

Collecting data...
Generating 'C:\Users\ADMINI~1\AppData\Local\Temp\nsys-report-ffa3.qdstrm'
[1/8] [========================100%] report2.nsys-rep
[2/8] [========================100%] report2.sqlite
[3/8] Executing 'nvtx_sum' stats report
SKIPPED: C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.sqlite does not contain NV Tools Extension (NVTX) data.
[4/8] Executing 'osrt_sum' stats report
SKIPPED: No data available.
[5/8] Executing 'cuda_api_sum' stats report

Time (%)  Total Time (ns)  Num Calls   Avg (ns)     Med (ns)   Min (ns)  Max (ns)   StdDev (ns)           Name
--------  ---------------  ---------  -----------  ----------  --------  ---------  -----------  ----------------------
    93.3        321764988          3  107254996.0  91069908.0  83897570  146797510   34432084.1  cudaMemcpy
     4.0         13772507          3    4590835.7   4393180.0   3984976    5394351     725179.5  cudaFree
     1.5          5118078          3    1706026.0   1249576.0    819401    3049101    1182856.9  cudaMalloc
     1.0          3496955          1    3496955.0   3496955.0   3496955    3496955          0.0  cudaDeviceSynchronize
     0.1           459711          1     459711.0    459711.0    459711     459711          0.0  cudaLaunchKernel
     0.0            49593          2      24796.5     24796.5       707      48886      34067.7  cudaEventCreate
     0.0            22341          1      22341.0     22341.0     22341      22341          0.0  cuLibraryUnload
     0.0            18196          2       9098.0      9098.0      7920      10276       1665.9  cudaEventRecord
     0.0            15060          1      15060.0     15060.0     15060      15060          0.0  cudaEventSynchronize
     0.0             1961          1       1961.0      1961.0      1961       1961          0.0  cuCtxSynchronize
     0.0             1434          1       1434.0      1434.0      1434       1434          0.0  cuModuleGetLoadingMode
     0.0             1012          2        506.0       506.0       205        807        425.7  cudaEventDestroy
     0.0              181          1        181.0       181.0       181        181          0.0  cuDeviceGetLuid

[6/8] Executing 'cuda_gpu_kern_sum' stats report

Time (%)  Total Time (ns)  Instances  Avg (ns)   Med (ns)   Min (ns)  Max (ns)  StdDev (ns)                          Name
--------  ---------------  ---------  ---------  ---------  --------  --------  -----------  -----------------------------------------------------
   100.0          3453326          1  3453326.0  3453326.0   3453326   3453326          0.0  sumMatrixOnGPU2D(float *, float *, float *, int, int)

[7/8] Executing 'cuda_gpu_mem_time_sum' stats report

Time (%)  Total Time (ns)  Count   Avg (ns)    Med (ns)   Min (ns)  Max (ns)  StdDev (ns)           Operation
--------  ---------------  -----  ----------  ----------  --------  --------  -----------  ----------------------------
    68.3        180949528      2  90474764.0  90474764.0  89939258  91010270     757319.8  [CUDA memcpy Host-to-Device]
    31.7         83834368      1  83834368.0  83834368.0  83834368  83834368          0.0  [CUDA memcpy Device-to-Host]

[8/8] Executing 'cuda_gpu_mem_size_sum' stats report

Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)           Operation
----------  -----  --------  --------  --------  --------  -----------  ----------------------------
  2147.484      2  1073.742  1073.742  1073.742  1073.742        0.000  [CUDA memcpy Host-to-Device]
  1073.742      1  1073.742  1073.742  1073.742  1073.742        0.000  [CUDA memcpy Device-to-Host]

Generated:
    C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.nsys-rep
    C:\Users\Administrator\Desktop\edward_temp\chapter2\report2.sqlite

【cuda学习日记】2.2 使用2维网络（grid）和2维块（block）对矩阵进行求和

相关文章：

【cuda学习日记】2.2 使用2维网络（grid）和2维块（block）对矩阵进行求和

深度学习中CUDA环境安装教程

IDEA的常用设置

【VUE+ElementUI】通过接口下载blob流文件设置全局Loading加载进度

算法的五个重要特性和4个基本标准

svelte5中使用react组件

iOS - 自定义引用计数(MRC)

北航现实场景无人机VLN新基准! OpenUAV：面向真实环境的无人机视觉语言导航，平台、基准与方法

OpenCV计算机视觉 08 图像的旋转

C++感受15-Hello STL 泛型启蒙

【Java 学习】对象赋值的艺术：Java中clone方法的浅拷贝与深拷贝解析，教你如何在Java中实现完美复制

基于高斯混合模型的数据分析及其延伸应用（具体代码分析）

无人机+Ai应用场景！

操作手册：集成钉钉审批实例消息监听配置

AI大模型-提示工程学习笔记4

Vue3.5 企业级管理系统实战（一）：项目初始搭建与配置

缓存-Redis-缓存更新策略-主动更新策略-Cache Aside Pattern（全面易理解）

杭州市有哪些大学能够出具论文检索报告？

SpringBootWeb 登录认证（day12）

使用AOP在切面逻辑中无法获取到requesetBody

Chord视频理解工具开源可部署：支持国产昇腾/寒武纪平台适配路线

Zotero-SciPDF：3分钟解锁科研超能力，告别文献下载烦恼

当你的 Agent 会“多轮思考”，Trace 却还停留在单轮：阿里云 CMS OpenClaw 可观测插件升级

HunyuanVideo-Foley 系统资源监控与清理：解决C盘空间不足的实战技巧

StructBERT中文匹配系统效果展示：多轮对话上下文语义一致性分析

Qwen3.5-9B多场景落地：开发者写Python脚本、产品经理写PRD、教师出题批改

# 发散创新：基于WebHID的浏览器端硬件交互实战指南在现代Web开发中，越来越多的应用场

加入csdn 5周年

写程序相册内页分隔卡，复古做旧风，输出:纪念册/影楼增值项目。

突破内容访问限制：从原理到实践的完整指南