1500字范文 > GPU编程 CUDA C++ 分子动力学模拟【GPU加速版】迷你代码

GPU编程 CUDA C++ 分子动力学模拟【GPU加速版】迷你代码

时间：2021-02-02 12:35:23

分子动力学模拟对一个具有一定初始条件和边界条件且具有相互作用（分子力场molecular force feild）的多粒子系统的运动方程进行数值积分，得到系统在相空间（phase space）中的一条离散的轨迹（trajectory），并用统计力学的方法从这条相轨迹中提取出有用的物理结果（例如：温度、压强、自由能、构像转变机理）。

N粒子系统（3N个坐标，3N个动量构成一个6N维相空间）

初始化：

坐标初始化（使用固态氩晶体结构加上超近原子距离预警）氩晶体为单原子且只有LJ势

速度初始化（使用随机速度分布并加上0总动量约束和温度缩放校正，不用Maxwell分布）

边界条件：使用正交盒子加最小镜像约定

相互作用的LJ势：势能表达式，力表达式，两体势的xyz分解，力的xyz分解

数值积分：采用“速度-Verlet”积分方法

原子单位制：四个基本量：电子伏特(eV)、埃(A)、原子单位质量(amu)、温度(K)

推导单位制：力(eV*A^-1)，速度(eV^1/2)*(amu^-1/2)，时间(A*amu^1/2)*(eV^-1/2)飞秒时间步长，波尔兹曼常数 KB=8.617343x10^-5 eV*(K^-1)

编译与运行：

$ make# 双精度版需要在Makefile文件中的CFLAGS选项后加上 -DUSE_DP$ ./ljmd nx Ne # 体系晶胞原子数量为 nx*nx*nx*4 ，Ne 代表平衡过程的步数

程序架构：

主函数（main.cu）

--- 常量、自定义结构体、浮点数类型（common.h）

--- 内存分配与释放（memory.cuh 和 memory.cu）

--- 模拟系统的初始化（initialize.h 和initialize.cu）

--- 构建邻居列表（neighbor.h 和 neighbor.cu）

--------- 使用最小镜像（mic.h）

--- 运动方程的积分（integrate.h 和 integrate.cu）【在GPU中计算】

--------- 求力和势能（force.h 和 force.cu）【在GPU中计算】

--------------- 使用最小镜像（mic.h）

main.cu：

#include "common.h" //单双精度，波尔兹曼常量，时间校正常量，Atom结构体，Box结构体#include "memory.h" //内存分配与释放#include "initialize.h" //坐标与速度初始化#include "neighbor.h"//GPU计算各原子的邻居列表#include "integrate.h" //GPU积分引擎#include <stdlib.h>#include <stdio.h>int main(int argc, char **argv){int nx = 5;//晶胞数量int Ne = 2000; //平衡步数int Np = 2000; //生产步数if (argc != 3) //必须以如下格式的命令运行： $ ./ljmd nx Ne{ printf("Usage: %s nx Ne\n", argv[0]);exit(1);}else{nx = atoi(argv[1]);Ne = atoi(argv[2]);Np = Ne; //默认生产步数和平衡步数一样多，也可以自己修改源码重新编译自己想要的步数}int N = 4 * nx * nx * nx; //总的体系原子数int Ns = 100; //生产时轨迹写入间隔步数int MN = 200; //Max Neighbor 最多邻居数real T_0 = 60.0; //目标温度Kreal ax = 5.385; //初始相邻两原子间距Areal time_step = 5.0 / TIME_UNIT_CONVERSION; //5飞秒时间步长转换为自然单位时间Atom atom;allocate_memory(N, MN, &atom); //memory.cu中一个自定义内存分配函数，分配给atom对象各参数内存for (int n = 0; n < N; ++n) { atom.m[n] = 40.0; } //原子质量initialize_position(nx, ax, &atom); //位置初始化initialize_velocity(N, T_0, &atom); //速度初始化find_neighbor(N, MN, &atom);//计算邻居列表equilibration(Ne, N, MN, T_0, time_step, &atom); //平衡过程production(Np, Ns, N, MN, T_0, time_step, &atom); //生产过程deallocate_memory(&atom); //memory.cu中一个自定义内存释放函数，释放atom对象各参数内存return 0;}

common.h：

#pragma once#ifdef DOUBLE_PRECISIONtypedef double real; //使用双精度浮点数编译#elsetypedef float real;//使用单精度浮点数编译（默认）#endif#define K_B 8.617343e-5 //波尔兹曼常量#define TIME_UNIT_CONVERSION 1.018051e+1 //原子单位制时间 = 1fs/1.018051e+1struct Atom //关键数据存储两遍，两个指针，CPU内存中一遍，GPU显存中一遍{real *m;real *x;real *y;real *z;real *vx;real *vy;real *vz;real *fx;real *fy;real *fz;real *pe;real *ke;real *box;int *g_NN; //Neighbor Numberint *g_NL; //Neighbor Listreal *g_m; //GPU中存储的质量real *g_x;real *g_y;real *g_z;real *g_vx;real *g_vy;real *g_vz;real *g_fx;real *g_fy;real *g_fz;real *g_pe; //GPU中存储的动能real *g_ke; //GPU中存储的势能};struct Box{real lx; //盒子x方向长度real ly;real lz;real lx2; //盒子x方向长度的一半，为了方便mic变换real ly2;real lz2;};

memory.cu：

#include "error.cuh"#include "memory.h"#include <stdlib.h>void allocate_memory(int N, int MN, Atom *atom) //自定义内存分配函数{//分配CPU主存atom->m = (real*) malloc(N * sizeof(real)); //原子质量列表atom->x = (real*) malloc(N * sizeof(real));atom->y = (real*) malloc(N * sizeof(real));atom->z = (real*) malloc(N * sizeof(real));atom->vx = (real*) malloc(N * sizeof(real));atom->vy = (real*) malloc(N * sizeof(real));atom->vz = (real*) malloc(N * sizeof(real));atom->fx = (real*) malloc(N * sizeof(real));atom->fy = (real*) malloc(N * sizeof(real));atom->fz = (real*) malloc(N * sizeof(real));atom->pe = (real*) malloc(N * sizeof(real));atom->ke = (real*) malloc(N * sizeof(real));atom->box = (real*) malloc(6 * sizeof(real)); //盒子大小 x, y, z, x/2, y/2, z/2//分配GPU显存 CHECK(cudaMalloc((void**)&atom->g_NN, sizeof(int) * N)); //Neighbor NumberCHECK(cudaMalloc((void**)&atom->g_NL, sizeof(int) * N * MN)); //Neighbor ListCHECK(cudaMalloc((void**)&atom->g_m, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_x, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_y, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_z, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_vx, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_vy, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_vz, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_fx, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_fy, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_fz, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_pe, sizeof(real) * N));CHECK(cudaMalloc((void**)&atom->g_ke, sizeof(real) * N));}void deallocate_memory(Atom *atom)//自定义内存释放函数{free(atom->m);free(atom->x);free(atom->y);free(atom->z);free(atom->vx);free(atom->vy);free(atom->vz);free(atom->fx);free(atom->fy);free(atom->fz);free(atom->pe);free(atom->ke);free(atom->box);CHECK(cudaFree(atom->g_NN));CHECK(cudaFree(atom->g_NL));CHECK(cudaFree(atom->g_m));CHECK(cudaFree(atom->g_x));CHECK(cudaFree(atom->g_y));CHECK(cudaFree(atom->g_z));CHECK(cudaFree(atom->g_vx));CHECK(cudaFree(atom->g_vy));CHECK(cudaFree(atom->g_vz));CHECK(cudaFree(atom->g_fx));CHECK(cudaFree(atom->g_fy));CHECK(cudaFree(atom->g_fz));CHECK(cudaFree(atom->g_pe));CHECK(cudaFree(atom->g_ke));}

memory.h：

#pragma once#include "common.h"void allocate_memory(int N, int MN, Atom *atom);void deallocate_memory(Atom *atom);

initialize.cu：

#include "initialize.h"#include "error.cuh"#include <stdlib.h>#include <math.h>static void scale_velocity(int N, real T_0, Atom *atom) //校正温度到目标T_0温度{real *m = atom->m;real *vx = atom->vx;real *vy = atom->vy;real *vz = atom->vz;real temperature = 0.0;for (int n = 0; n < N; ++n) {real v2 = vx[n]*vx[n] + vy[n]*vy[n] + vz[n]*vz[n];temperature += m[n] * v2; }temperature /= 3.0 * K_B * N; //根据“能均分定理“确定当前速度下体系的温度real scale_factor = sqrt(T_0 / temperature); //确定校正到T_0温度所需的乘子for (int n = 0; n < N; ++n){ vx[n] *= scale_factor;vy[n] *= scale_factor;vz[n] *= scale_factor;}}void initialize_position(int nx, real ax, Atom *atom) //初始化坐标{atom->box[0] = ax * nx;//盒子在x轴方向上的长度atom->box[1] = ax * nx;atom->box[2] = ax * nx;atom->box[3] = atom->box[0] * 0.5; //盒子在x轴方向上一半的长度，方便用于mic变换atom->box[4] = atom->box[1] * 0.5;atom->box[5] = atom->box[2] * 0.5;real *x = atom->x;real *y = atom->y;real *z = atom->z;//以某点(ix, iy, iz)为中心，射出如下4个向量：real x0[4] = {0.0, 0.0, 0.5, 0.5}; // (0.0, 0.0, 0.0), (0.0, 0.5, 0.5), real y0[4] = {0.0, 0.5, 0.0, 0.5}; // (0.5, 0.0, 0.5), (0.5, 0.5, 0.0)real z0[4] = {0.0, 0.5, 0.5, 0.0}; //以生成4个原子，最后再平移中心点(ix, iy, iz)进行晶胞复制int n = 0;for (int ix = 0; ix < nx; ++ix) //nx是晶胞中原子个数{for (int iy = 0; iy < nx; ++iy){for (int iz = 0; iz < nx; ++iz){for (int i = 0; i < 4; ++i){x[n] = (ix + x0[i]) * ax;y[n] = (iy + y0[i]) * ax;z[n] = (iz + z0[i]) * ax;n++;}}}}int m1 = sizeof(real) * 4 * nx * nx * nx; //内存大小CHECK(cudaMemcpy(atom->g_x, atom->x, m1, cudaMemcpyHostToDevice)); //用法cudaMemcpy(device_array, host_array, size, cudaMemcpyHostToDevice)CHECK(cudaMemcpy(atom->g_y, atom->y, m1, cudaMemcpyHostToDevice));CHECK(cudaMemcpy(atom->g_z, atom->z, m1, cudaMemcpyHostToDevice));}void initialize_velocity(int N, real T_0, Atom *atom) //分配初始速度{real *m = atom->m;real *vx = atom->vx;real *vy = atom->vy;real *vz = atom->vz;real momentum_average[3] = {0.0, 0.0, 0.0};for (int n = 0; n < N; ++n){ vx[n] = -1.0 + (rand() * 2.0) / RAND_MAX; //调用该函数会返回一个介于0 ~ RAND_MAX（可以通过stdlib.h中的宏定义来获取）vy[n] = -1.0 + (rand() * 2.0) / RAND_MAX; // -1.0 是为了把均值调为0vz[n] = -1.0 + (rand() * 2.0) / RAND_MAX; momentum_average[0] += m[n] * vx[n] / N;//动量平均momentum_average[1] += m[n] * vy[n] / N;momentum_average[2] += m[n] * vz[n] / N;} for (int n = 0; n < N; ++n) //消除xyz各方向上的总平动能{ vx[n] -= momentum_average[0] / m[n];//vx[n]是第n个原子的x方向的速度vy[n] -= momentum_average[1] / m[n];vz[n] -= momentum_average[2] / m[n]; }scale_velocity(N, T_0, atom); //调用之前写的缩放函数，对当前速度分布进行缩放CHECK(cudaMemcpy(atom->g_m, atom->m, sizeof(real) * N, cudaMemcpyHostToDevice));CHECK(cudaMemcpy(atom->g_vx, atom->vx, sizeof(real) * N, cudaMemcpyHostToDevice));CHECK(cudaMemcpy(atom->g_vy, atom->vy, sizeof(real) * N, cudaMemcpyHostToDevice));CHECK(cudaMemcpy(atom->g_vz, atom->vz, sizeof(real) * N, cudaMemcpyHostToDevice));}

initialize.h：

#pragma once#include "common.h"void initialize_position(int nx, real ax, Atom *atom);void initialize_velocity(int N, real T_0, Atom *atom);

neighbor.cu：

#include "neighbor.h"#include "mic.h"#include <stdio.h>#include <stdlib.h>static void __global__ gpu_find_neighbor //静态GPU内联核函数(int N, int MN, int *g_NN, int *g_NL, Box box, real *g_x, real *g_y, real *g_z, real cutoff2){int n1 = blockIdx.x * blockDim.x + threadIdx.x;if (n1 < N){int count = 0;real x1 = g_x[n1];real y1 = g_y[n1];real z1 = g_z[n1];for (int n2 = 0; n2 < N; n2++){real x12 = g_x[n2] - x1;real y12 = g_y[n2] - y1;real z12 = g_z[n2] - z1;apply_mic(box, &x12, &y12, &z12); //盒子box是6元数组real d12_square = x12*x12 + y12*y12 + z12*z12;if ((n2 != n1) && (d12_square < cutoff2)){g_NL[count++ * N + n1] = n2; // g_NL[]生成邻居列表 Neighbor List}}g_NN[n1] = count; // g_NN[] Neighbor number 各原子邻居数}}void find_neighbor(int N, int MN, Atom *atom) //包装函数{real cutoff = 11.0;real cutoff2 = cutoff * cutoff;Box box;//盒子有6个元素box.lx = atom->box[0];//盒子x方向的长度box.ly = atom->box[1];box.lz = atom->box[2];box.lx2 = atom->box[3];//盒子x轴方向上一半的长度，方便用于mic变换box.ly2 = atom->box[4];box.lz2 = atom->box[5];int block_size = 128;// RTX 3080Ti 是 1024 1024 64int grid_size = (N - 1) / block_size + 1; //向上取整gpu_find_neighbor<<<grid_size, block_size>>> //调用GPU核函数(N, MN, atom->g_NN, atom->g_NL, box,atom->g_x, atom->g_y, atom->g_z, cutoff2);}

neighbor.h：

#pragma once#include "common.h"void find_neighbor(int N, int MN, Atom *atom);

mic.h：

#pragma oncestatic void __device__ apply_mic // 静态内联函数（高性能），进行最小镜像变换(Box box, real *x12, real *y12, real *z12){if(*x12 < - box.lx2) { *x12 += box.lx; }//box.lx2代表盒子在x轴方向上一半的长度else if (*x12 > + box.lx2) { *x12 -= box.lx; }if(*y12 < - box.ly2) { *y12 += box.ly; }//box.ly2代表盒子在y轴方向上一半的长度else if (*y12 > + box.ly2) { *y12 -= box.ly; }if(*z12 < - box.lz2) { *z12 += box.lz; }//box.lz2代表盒子在z轴方向上一半的长度else if (*z12 > + box.lz2) { *z12 -= box.lz; }}

integrate.cu：

#include "integrate.h"#include "error.cuh"#include "force.h"#include "reduce.h"#include <stdio.h>#include <math.h>#include <time.h>static void __global__ gpu_scale_velocity // GPU核函数：速度缩放(int N, real scale_factor, real *g_vx, real *g_vy, real *g_vz){int n = blockDim.x * blockIdx.x + threadIdx.x;if (n < N){ g_vx[n] *= scale_factor;g_vy[n] *= scale_factor;g_vz[n] *= scale_factor;}}static void scale_velocity(int N, real T_0, Atom *atom) //包装函数{real temperature = sum(N, atom->g_ke) / (1.5 * K_B * N); //通过当前动能计算当前温度real scale_factor = sqrt(T_0 / temperature); //和目标温度T_0之间的缩放因子int block_size = 128;int grid_size = (N - 1) / block_size + 1;gpu_scale_velocity<<<grid_size, block_size>>>// 调用核函数(N, scale_factor, atom->g_vx, atom->g_vy, atom->g_vz);}static void __global__ gpu_integrate(int N, real time_step, real time_step_half,real *g_m, real *g_x, real *g_y, real *g_z,real *g_vx, real *g_vy, real *g_vz,real *g_fx, real *g_fy, real *g_fz, real *g_ke, int flag){int n = blockDim.x * blockIdx.x + threadIdx.x;if (n < N){real mass = g_m[n];real mass_inv = 1.0 / mass; //使用乘法比除法计算效率高real ax = g_fx[n] * mass_inv;//加速度real ay = g_fy[n] * mass_inv;real az = g_fz[n] * mass_inv;real vx = g_vx[n];real vy = g_vy[n];real vz = g_vz[n];vx += ax * time_step_half; //更新v(t) --> v(t+dt/2) 或者 v(t+dt/2) --> v(t+dt)vy += ay * time_step_half;vz += az * time_step_half;g_vx[n] = vx;g_vy[n] = vy;g_vz[n] = vz;if (flag == 1)//上半轮循环计算 v(t+dt/2), r(t+dt), F(t+dt){ g_x[n] += vx * time_step; g_y[n] += vy * time_step; g_z[n] += vz * time_step; }else//下半轮循环计算 v(t+dt) 和动能{g_ke[n] = (vx*vx + vy*vy + vz*vz) * mass * 0.5;}}}static void integrate(int N, real time_step, Atom *atom, int flag) //包装函数{real time_step_half = time_step * 0.5;int block_size = 128;int grid_size = (N - 1) / block_size + 1;gpu_integrate<<<grid_size, block_size>>> //调用核函数(N, time_step, time_step_half,atom->g_m, atom->g_x, atom->g_y, atom->g_z,atom->g_vx, atom->g_vy, atom->g_vz,atom->g_fx, atom->g_fy, atom->g_fz, atom->g_ke, flag);}void equilibration(int Ne, int N, int MN, real T_0, real time_step, Atom *atom){find_force(N, MN, atom);for (int step = 0; step < Ne; ++step){ integrate(N, time_step, atom, 1); //模式1，上半轮积分，v(t+dt/2)，r(t+dt)find_force(N, MN, atom); //调用函数，计算力, F(t+dt)integrate(N, time_step, atom, 2); //模式2，下半轮积分，v(t+dt)，总动能scale_velocity(N, T_0, atom); //热交换控温，缩放速度} }void production(int Np, int Ns, int N, int MN, real T_0, real time_step, Atom *atom){float t_force = 0.0f;CHECK(cudaDeviceSynchronize());clock_t t_total_start = clock(); //总计时开始FILE *fid_e = fopen("energy.txt", "w");//产生一个写模式文件流句柄fidfor (int step = 0; step < Np; ++step){ integrate(N, time_step, atom, 1); //模式1，上半轮积分，v(t+dt/2)，r(t+dt)CHECK(cudaDeviceSynchronize());clock_t t_force_start = clock(); //力计算部分计时开始find_force(N, MN, atom); //调用函数，计算力, F(t+dt)CHECK(cudaDeviceSynchronize());clock_t t_force_stop = clock();//力计算部分计时结束t_force += float(t_force_stop - t_force_start) / CLOCKS_PER_SEC; //力计时差integrate(N, time_step, atom, 2); //模式2，下半轮积分，v(t+dt)，总动能if (0 == step % Ns) //Ns：文件写入间隔{fprintf(fid_e, "%g %g\n", sum(N, atom->g_ke), sum(N, atom->g_pe)); //写入总动能，总势能}}fclose(fid_e); //关闭文件流clock_t t_total_stop = clock(); //总计时结束float t_total = float(t_total_stop - t_total_start) / CLOCKS_PER_SEC; //总计时差printf("Time used for production = %g s\n", t_total);printf("Time used for force part = %g s\n", t_force);}

integrate.h：

#pragma once#include "common.h"void equilibration(int Ne, int N, int MN, real T_0, real time_step, Atom *atom);void production(int Np, int Ns, int N, int MN, real T_0, real time_step, Atom *atom);

force.cu：

#include "error.cuh"#include "force.h"#include "mic.h"struct LJ // LJ 势结构体，储存一些常量，避免反复计算{real cutoff2;real e24s6; real e48s12;real e4s6;real e4s12;};static void __global__ gpu_find_force(LJ lj, int N, int *g_NN, int *g_NL, Box box,real *g_x, real *g_y, real *g_z,real *g_fx, real *g_fy, real *g_fz, real *g_pe){int i = blockIdx.x * blockDim.x + threadIdx.x; //将GPU线程一维序列化if (i < N) //一个原子i开一个线程{real fx = 0.0;real fy = 0.0;real fz = 0.0;real potential = 0.0;int NN = g_NN[i];real x_i = g_x[i];real y_i = g_y[i];real z_i = g_z[i];for (int k = 0; k < NN; ++k) //（单线程内）逃脱不了的对第二个原子j的循环{ int j = g_NL[i + N * k];real x_ij = g_x[j] - x_i;real y_ij = g_y[j] - y_i;real z_ij = g_z[j] - z_i;apply_mic(box, &x_ij, &y_ij, &z_ij); //需要box信息real r2 = x_ij*x_ij + y_ij*y_ij + z_ij*z_ij;if (r2 > lj.cutoff2) { continue; }real r2inv = 1.0 / r2;real r4inv = r2inv * r2inv; //用乘法代替除法，提高计算效率real r6inv = r2inv * r4inv;real r8inv = r4inv * r4inv;real r12inv = r4inv * r8inv;real r14inv = r6inv * r8inv;real f_ij = lj.e24s6 * r8inv - lj.e48s12 * r14inv;//ij之间的力potential += lj.e4s12 * r12inv - lj.e4s6 * r6inv;//ij之间的势fx += f_ij * x_ij; //i原子对j求和，LJ势受力的xyz方向分解fy += f_ij * y_ij;fz += f_ij * z_ij;}g_fx[i] = fx;// i原子x方向上的合力g_fy[i] = fy; g_fz[i] = fz; g_pe[i] = potential * 0.5; // i原子 LJ势}}void find_force(int N, int MN, Atom *atom) //包装函数{const real epsilon = 1.032e-2;//计算存储常量const real sigma = 3.405;const real cutoff = 10.0;const real cutoff2 = cutoff * cutoff;const real sigma_3 = sigma * sigma * sigma;const real sigma_6 = sigma_3 * sigma_3;const real sigma_12 = sigma_6 * sigma_6;const real e24s6 = 24.0 * epsilon * sigma_6; const real e48s12 = 48.0 * epsilon * sigma_12;const real e4s6 = 4.0 * epsilon * sigma_6;const real e4s12 = 4.0 * epsilon * sigma_12;LJ lj; //给LJ结构体赋值lj.cutoff2 = cutoff2;lj.e24s6 = e24s6;lj.e48s12 = e48s12;lj.e4s6 = e4s6;lj.e4s12 = e4s12;Box box;box.lx = atom->box[0];box.ly = atom->box[1];box.lz = atom->box[2];box.lx2 = atom->box[3];box.ly2 = atom->box[4];box.lz2 = atom->box[5];int block_size = 128;int grid_size = (N - 1) / block_size + 1;gpu_find_force<<<grid_size, block_size>>> //调用GPU的核函数(lj, N, atom->g_NN, atom->g_NL, box,atom->g_x, atom->g_y, atom->g_z,atom->g_fx, atom->g_fy, atom->g_fz, atom->g_pe);}

force.h：

#pragma once#include "common.h"void find_force(int N, int MN, Atom *atom);

reduce.cu：

#include "reduce.h"#include "error.cuh"#include <cooperative_groups.h> //使用线程协作组提高性能using namespace cooperative_groups;const int block_size = 128;void __global__ reduce_cp(const real *d_x, real *d_y, const int N)//高性能数组归约求和函数{const int tid = threadIdx.x;const int bid = blockIdx.x;extern __shared__ real s_y[];real y = 0.0;const int stride = blockDim.x * gridDim.x;for (int n = bid * blockDim.x + tid; n < N; n += stride){y += d_x[n];}s_y[tid] = y;__syncthreads();for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1){if (tid < offset){s_y[tid] += s_y[tid + offset];}__syncthreads();}y = s_y[tid];thread_block_tile<32> g = tiled_partition<32>(this_thread_block());for (int i = g.size() >> 1; i > 0; i >>= 1){y += g.shfl_down(y, i);}if (tid == 0){d_y[bid] = y;}}__device__ real static_y[block_size];real sum(const int N, const real *d_x)//包装后的求和函数sum{real *d_y;CHECK(cudaGetSymbolAddress((void**)&d_y, static_y));const int smem = sizeof(real) * block_size;reduce_cp<<<block_size, block_size, smem>>>(d_x, d_y, N);reduce_cp<<<1, block_size, smem>>>(d_y, d_y, block_size);real h_y[1] = {0};CHECK(cudaMemcpy(h_y, d_y, sizeof(real), cudaMemcpyDeviceToHost));return h_y[0];}

reduce.h：

#include "common.h"real sum(const int N, const real *d_x);

error.cuh：

#pragma once //错误检查宏函数#include <stdio.h>#define CHECK(call)\do \{ \const cudaError_t error_code = call; \if (error_code != cudaSuccess)\{ \printf("CUDA Error:\n"); \printf(" File: %s\n", __FILE__);\printf(" Line: %d\n", __LINE__);\printf(" Error code: %d\n", error_code); \printf(" Error text: %s\n",\cudaGetErrorString(error_code));\exit(1); \} \} while (0)

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。