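// The GPU's memory space is separate from the CPU's (host) memory space, so data has to be transferred back and forth between them. From the host's point of view the GPU is therefore a peripheral device, and the OS does treat it as one: the GPU shows up in the output of `lspci`.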
// Illustrative host-side sequence: upload input, launch a kernel, overlap host work with it, download the result.
// NOTE(review): `...` and `arg1, arg2, ..., arg n` are placeholders, so this snippet is not compilable as written.
//
// Step 1: copy numBytes from the host buffer `a` into the device buffer `d_a` (cudaMemcpy takes dst, src, count, kind).
cudaMemcpy(d_a, a, numBytes, cudaMemcpyHostToDevice); // Synchronous function, no overlapping allowed.
// Step 2: launch the kernel with the execution configuration <<<Grid, ThreadsPerBlock>>>.
// The launch returns control to the host without waiting for the kernel to finish.
kernelFunctionName<<<Grid,ThreadsPerBlock>>>(arg1, arg2, ..., arg n); // Asynchronous, we can take advantage of overlapping.
// Step 3: host code placed here runs while the device is executing the kernel.
//Host code during device execution
...
// Step 4: copy numBytes from `d_a` back into the host buffer `a`.
// End of the overlapped host code. Waiting for the end of the kernel execution to transfer data between the host and the device.
// NOTE(review): no explicit stream is passed, so presumably everything runs on the default stream and this copy
// is ordered after the kernel; confirm if streams are introduced, in which case an explicit synchronization is needed.
cudaMemcpy(a, d_a, numBytes, cudaMemcpyDeviceToHost); // Synchronous function, no overlapping allowed.