#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"

/* Number of elements in each vector. */
#define N (16)

/*
 * Evaluate a CUDA runtime call and, if it did not return cudaSuccess,
 * print the source location and the error string to stderr and exit.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                  \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * saxpy: computes y[i] = a * x[i] + y[i] for every i in [0, N).
 *
 * x : device pointer to N input floats (read only).
 * y : device pointer to N floats, updated in place.
 * a : scalar multiplier.
 *
 * Launch layout: any 1D grid of 1D blocks. Each thread starts at its flat
 * global index and advances by the total number of launched threads, so a
 * <<<1,1>>> launch processes all N elements serially and larger launches
 * split the elements between threads without overlapping writes.
 * Only the x dimension of the launch is used for indexing.
 */
__global__ void saxpy(const float* x, float* y, float a)
{
    const int stride = gridDim.x * blockDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
        y[i] = a * x[i] + y[i];
}

/*
 * Fills two host vectors with ones, prints y, runs saxpy on the device
 * with a = 1.5, then prints the updated y.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const float a = 1.5f;
    const size_t bytes = sizeof(float) * N;
    float *x = NULL;
    float *y = NULL;
    float xcpu[N];
    float ycpu[N];
    int i;

    for (i = 0; i < N; i++) {
        xcpu[i] = 1.0f;
        ycpu[i] = 1.0f;
    }

    /* y before the kernel runs. */
    for (i = 0; i < N; i++)
        printf("%.4f\n", ycpu[i]);

    CUDA_CHECK(cudaMalloc(&x, bytes));
    CUDA_CHECK(cudaMalloc(&y, bytes));

    CUDA_CHECK(cudaMemcpy(x, xcpu, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(y, ycpu, bytes, cudaMemcpyHostToDevice));

    /* Ceil-div so the grid covers N even when N is not a multiple of the block size. */
    const int threads = 32;
    const int blocks = (N + threads - 1) / threads;
    saxpy<<<blocks, threads>>>(x, y, a);
    /* A launch does not return a status; launch errors are read back here. */
    CUDA_CHECK(cudaGetLastError());

    /*
     * Blocking copy: waits for the kernel to finish, so errors raised while
     * the kernel executed are reported by this call. x is not modified by
     * the kernel, so only y is copied back.
     */
    CUDA_CHECK(cudaMemcpy(ycpu, y, bytes, cudaMemcpyDeviceToHost));

    /* y after the kernel has run. */
    for (i = 0; i < N; i++)
        printf("%.4f\n", ycpu[i]);

    CUDA_CHECK(cudaFree(x));
    CUDA_CHECK(cudaFree(y));

    return 0;
}