#include <stdio.h>
#include <stdlib.h>

// Number of thread blocks launched; also the number of ints in the buffer.
const int Blocks = 5;

// Abort with a readable message if a CUDA runtime call fails. Unchecked
// errors are sticky and would otherwise surface as garbage output later.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Writes each block's index into the matching slot of `a`.
 *
 * Launch layout: 1-D grid, one thread per block (only blockIdx.x is used,
 * so extra threads in a block would redundantly store the same value).
 *
 * a: device pointer to at least `Blocks` ints.
 */
__global__ void kernel(int *a)
{
    int i = blockIdx.x;
    // Guard so a grid larger than the buffer cannot write out of bounds.
    if (i < Blocks) {
        a[i] = i;
    }
}

/*
 * Allocates a device buffer, fills it with block ids on the GPU, copies it
 * back to the host and prints one greeting per block.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(void)
{
    int a[Blocks];
    int *d_a = NULL;
    const size_t size = sizeof(int) * Blocks;

    // Allocate d_a on the device
    CUDA_CHECK(cudaMalloc((void **)&d_a, size));

    // Fill d_a with the id numbers: one block per element, one thread each
    kernel<<<Blocks, 1>>>(d_a);
    // Launch-configuration errors are only reported through cudaGetLastError
    CUDA_CHECK(cudaGetLastError());

    // Copy d_a onto a on the host. This copy blocks until the kernel has
    // finished, so it also reports any error raised during execution.
    CUDA_CHECK(cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost));

    // Release the device buffer now that the results are on the host
    CUDA_CHECK(cudaFree(d_a));

    // Output the results
    for (int i = 0; i < Blocks; i++) {
        printf("Hello world from %d!\n", a[i]);
    }

    return 0;
}