#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
// Kernel: adds a block-dependent constant, in place, to the elements of a
// row-major int matrix.
//
// Parameters:
//   d_A  - device pointer to the matrix; elements are updated in place.
//   size - number of ints per row. The bounds guard below also uses it as the
//          row count, i.e. the matrix is treated as size x size (main() passes
//          8 for its 8x8 matrix).
//
// Launch layout: 2D blocks in a 2D grid, one thread per matrix element.
//   dx (x direction) selects the row, dy (y direction) selects the column.
//   Because threadIdx.x feeds the row index, threads adjacent in x touch
//   addresses that are `size` ints apart.
//
// Only the four blocks with blockIdx.x and blockIdx.y in {0, 1} modify data:
//   (blockIdx.x, blockIdx.y) = (0,0) -> += 1
//                              (0,1) -> += 2
//                              (1,0) -> += 3
//                              (1,1) -> += 4
// Threads of any other block, and threads whose (dx, dy) lies outside the
// size x size matrix, leave memory untouched.
__global__ void testkernel(int *d_A, size_t size)
{
    // Global thread indices in the x and y directions.
    const size_t dx = (size_t)blockDim.x * blockIdx.x + threadIdx.x;
    const size_t dy = (size_t)blockDim.y * blockIdx.y + threadIdx.y;

    // Blocks outside the 2x2 corner of the grid have no increment assigned.
    if (blockIdx.x > 1 || blockIdx.y > 1)
        return;

    // Bounds guard: dy >= size would alias into another row, and dx >= size
    // would run past the last row of a size x size matrix.
    if (dx >= size || dy >= size)
        return;

    // 1 + 2*blockIdx.x + blockIdx.y yields 1, 2, 3, 4 for blocks
    // (0,0), (0,1), (1,0), (1,1) respectively.
    d_A[dx * size + dy] += 1 + 2 * (int)blockIdx.x + (int)blockIdx.y;
}
// Prints a diagnostic and terminates the program when a CUDA runtime call
// did not return cudaSuccess. `what` names the operation for the message.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host driver: uploads an 8x8 int matrix, runs testkernel on a 2x2 grid of
// 4x4-thread blocks, copies the result back and prints it row by row.
// Returns 0 on success; exits with EXIT_FAILURE if an allocation or any CUDA
// call fails.
int main( int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // Input matrix. Each 4x4 quadrant is handled by one thread block
    // (coordinates are blockIdx.x, blockIdx.y; the kernel maps x to rows):
    //   rows 0-3, cols 0-3 : block (0,0)    rows 0-3, cols 4-7 : block (0,1)
    //   rows 4-7, cols 0-3 : block (1,0)    rows 4-7, cols 4-7 : block (1,1)
    int h_A[8][8] = {{1,1,1,1,2,2,2,2},
                     {1,1,1,1,2,2,2,2},
                     {1,1,1,1,2,2,2,2},
                     {1,1,1,1,2,2,2,2},
                     {3,3,3,3,4,4,4,4},
                     {3,3,3,3,4,4,4,4},
                     {3,3,3,3,4,4,4,4},
                     {3,3,3,3,4,4,4,4}};
    int *d_A = NULL, *h_B = NULL;
    size_t size = 8 * 8 * sizeof(int); // matrix size in bytes
    size_t rsize = 8;                  // elements per row

    // Launch configuration: dim3 holds x/y/z extents; unspecified ones are 1.
    dim3 dimgrid(2,2);  // 2x2 blocks in the grid
    dim3 dimblock(4,4); // 4x4 threads in each block

    // Host buffer that receives the result.
    h_B = (int*)malloc(size);
    if (h_B == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)size);
        return EXIT_FAILURE;
    }

    // Allocate device memory and upload the input matrix.
    checkCuda(cudaMalloc( (void **) &d_A, size ), "cudaMalloc");
    checkCuda(cudaMemcpy( d_A, h_A, size, cudaMemcpyHostToDevice ), "host-to-device copy");

    // Launch the kernel; the execution configuration between <<< >>> is
    // (grid dimensions, block dimensions).
    testkernel<<<dimgrid, dimblock>>>(d_A,rsize);
    // A kernel launch returns nothing; configuration errors are reported here.
    checkCuda(cudaGetLastError(), "testkernel launch");

    // Copy the result back to the host. This blocking copy is ordered after
    // the kernel, so errors raised while the kernel ran are reported here too.
    checkCuda(cudaMemcpy( h_B, d_A, size, cudaMemcpyDeviceToHost ), "device-to-host copy");

    // Print the matrix computed on the GPU.
    for(int i = 0; i < 8; i++)
    {
        for(int j = 0; j < 8; j++)
            printf("%2d ",h_B[(size_t)i*rsize+j]);
        printf("\n");
    }

    // Release device and host memory.
    checkCuda(cudaFree(d_A), "cudaFree");
    free(h_B);
    return 0;
}
// Views (343) | Comments (0) | Shares (0) |