#include <stdio.h>
#include <stdlib.h>

// Number of elements in each vector; also the number of blocks launched.
const int N = 20;

// Evaluates a CUDA runtime call and, if it did not return cudaSuccess,
// prints the failing location and error string to stderr and exits.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Element-wise vector addition: c[i] = a[i] + b[i].
//
// Launch layout: 1-D grid, one element per block. blockIdx.x identifies the
// current block and is used directly as the element index; threadIdx is not
// consulted. All three pointers must be device pointers to at least N ints.
__global__ void add( int *a, int *b, int *c ) {
    int i = blockIdx.x;
    // Guard so that a grid with more than N blocks cannot index out of bounds.
    if (i < N) {
        c[i] = a[i] + b[i];
    }
}

// Fills two host vectors with 1s and 2s, adds them on the GPU, and prints the
// N sums separated by spaces followed by a newline.
int main() {
    int *a = new int[N];
    int *b = new int[N];
    int *c = new int[N];
    int *d_a = NULL;
    int *d_b = NULL;
    int *d_c = NULL;

    for (int i = 0; i < N; i++) {
        a[i] = 1;
        b[i] = 2;
    }

    const size_t size = sizeof(int) * N;

    CUDA_CHECK(cudaMalloc((void**) &d_a, size));
    CUDA_CHECK(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaMalloc((void**) &d_b, size));
    CUDA_CHECK(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice));

    // d_c is output-only: the kernel writes every element, so the
    // uninitialised host buffer c is not uploaded.
    CUDA_CHECK(cudaMalloc((void**) &d_c, size));

    // N blocks of one thread each, matching the kernel's blockIdx.x indexing.
    add<<<N, 1>>>(d_a, d_b, d_c);
    // A launch does not return a status; fetch launch-configuration errors here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy on the default stream: waits for the kernel to finish and
    // reports any error raised while it ran.
    CUDA_CHECK(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) {
        printf("%d ", c[i]);
    }
    printf("\n");

    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    delete[] a;
    delete[] b;
    delete[] c;

    return 0;
}