// cudaのテスト (CUDA test: SAXPY, y = a*x + y, computed on the GPU)

#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"

#define N (16)

/*
 * SAXPY kernel: computes y[i] = a*x[i] + y[i] for the first N elements.
 *
 * x  pointer to N input floats, only read by the kernel
 * y  pointer to N floats, updated in place
 * a  scalar multiplier
 *
 * Both pointers are dereferenced on the device, so they must be device
 * accessible (main passes cudaMalloc'ed buffers).
 *
 * Launch layout: any 1D grid of 1D blocks. The elements are distributed over
 * the launched threads with a grid-stride loop, so each element is updated by
 * exactly one thread. A <<<1,1>>> launch degenerates to a plain serial loop
 * over 0..N-1. No shared memory is used.
 */
__global__ void saxpy(float* x, float* y, float a)
{
  /* Total number of threads in the (1D) grid; size_t avoids overflow in the
     gridDim.x * blockDim.x product for very large launches. */
  const size_t stride = (size_t)gridDim.x * blockDim.x;

  for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
    y[i] = a*x[i] + y[i];
}

/* Prints a diagnostic naming the failed step and terminates the program if a
   CUDA runtime call did not return cudaSuccess. */
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

/*
 * Test driver: fills two host arrays of N ones, prints y, runs saxpy on the
 * GPU with a = 1.5, copies y back and prints it again.
 *
 * Returns 0 on success; exits with EXIT_FAILURE (after printing a message to
 * stderr) if any CUDA runtime call or the kernel launch fails.
 */
int main(int argc, char *argv[])
{
  const float a = 1.5f;
  const size_t bytes = sizeof(float)*N;
  float *x = NULL, *y = NULL;   /* device buffers */
  float xcpu[N];
  float ycpu[N];
  int i;

  /* Command-line arguments are not used. */
  (void)argc;
  (void)argv;

  for(i=0;i<N;i++){
    xcpu[i] = 1.0f;
    ycpu[i] = 1.0f;
  }

  /* y before the computation */
  for(i=0;i<N;i++)
    printf("%.4f\n",ycpu[i]);

  checkCuda(cudaMalloc(&x, bytes), "cudaMalloc x");
  checkCuda(cudaMalloc(&y, bytes), "cudaMalloc y");
  checkCuda(cudaMemcpy(x,xcpu,bytes,cudaMemcpyHostToDevice), "copy x to device");
  checkCuda(cudaMemcpy(y,ycpu,bytes,cudaMemcpyHostToDevice), "copy y to device");

  saxpy<<<1,1>>>(x,y,a);
  /* A kernel launch returns nothing; configuration errors are reported here. */
  checkCuda(cudaGetLastError(), "saxpy launch");

  /* Blocking copy: waits for the kernel and also reports any error raised
     while it was executing. Only y is copied back, x is not modified by the
     kernel. */
  checkCuda(cudaMemcpy(ycpu,y,bytes,cudaMemcpyDeviceToHost), "copy y to host");

  /* y after the computation */
  for(i=0;i<N;i++)
    printf("%.4f\n",ycpu[i]);

  checkCuda(cudaFree(x), "cudaFree x");
  checkCuda(cudaFree(y), "cudaFree y");

  return 0;
}