http://www.gdep.jp/page/view/251
matrix_cpu.c
/*
 * matrix_cpu.c -- single-threaded CPU baseline: C = A * B for square
 * MATRIX_SIZE x MATRIX_SIZE integer matrices stored row-major in flat arrays.
 * Prints the wall-clock time of the multiplication in whole seconds.
 */
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>

/* Edge length n of the square n x n matrices. */
#define MATRIX_SIZE 1024

/*
 * Fills A and B with pseudo-random values, multiplies them with the naive
 * triple loop, and reports the elapsed time.
 *
 * Returns 0 on success, EXIT_FAILURE if a matrix cannot be allocated.
 */
int main(int argc, char** argv){
    const size_t elem_count = (size_t)MATRIX_SIZE * MATRIX_SIZE;
    unsigned int row, col, k;
    int* matA;
    int* matB;
    int* matC;
    /* Wall-clock timestamps taken around the multiplication. */
    time_t Start, Stop;

    (void)argc;
    (void)argv;

    /* Reserve three n x n int matrices on the heap. */
    matA = (int*)malloc(sizeof(int) * elem_count);
    matB = (int*)malloc(sizeof(int) * elem_count);
    matC = (int*)malloc(sizeof(int) * elem_count);
    if (matA == NULL || matB == NULL || matC == NULL) {
        fprintf(stderr, "Failed to allocate %dx%d matrices\n",
                MATRIX_SIZE, MATRIX_SIZE);
        free(matA);   /* free(NULL) is a no-op */
        free(matB);
        free(matC);
        return EXIT_FAILURE;
    }

    /* Inputs: values in [0, MATRIX_SIZE * MATRIX_SIZE). matC needs no
     * initialisation because every element is overwritten below. */
    for (row = 0; row < MATRIX_SIZE; row++) {
        for (col = 0; col < MATRIX_SIZE; col++) {
            matA[row * MATRIX_SIZE + col] = rand() % (MATRIX_SIZE * MATRIX_SIZE);
            matB[row * MATRIX_SIZE + col] = rand() % (MATRIX_SIZE * MATRIX_SIZE);
        }
    }

    time(&Start);

    for (row = 0; row < MATRIX_SIZE; row++) {
        for (col = 0; col < MATRIX_SIZE; col++) {
            /* Operands reach 2^20, so products exceed INT_MAX. Accumulate in
             * unsigned arithmetic, where wrap-around is well defined, instead
             * of overflowing a signed int. */
            unsigned int acc = 0;
            for (k = 0; k < MATRIX_SIZE; k++) {
                acc += (unsigned int)matA[row * MATRIX_SIZE + k]
                     * (unsigned int)matB[k * MATRIX_SIZE + col];
            }
            matC[row * MATRIX_SIZE + col] = (int)acc;
        }
    }

    time(&Stop);

    /* time_t is not guaranteed to be int-sized; difftime() yields the elapsed
     * seconds as a double regardless of the platform's time_t. */
    printf("Processing time: %.0f (sec)\n", difftime(Stop, Start));

    /* Release the matrices. */
    free(matA);
    free(matB);
    free(matC);

    return 0;
}
matrix_gpu.c
/*
 * matrix_gpu -- GPU version of the square integer matrix product C = A * B,
 * with matrices stored row-major in flat arrays.
 *
 * Build note: this file contains device code (__global__, <<<...>>>), so it
 * must be compiled as CUDA: give it a .cu suffix or pass `-x cu` to nvcc.
 *
 * The original relied on <cutil_inline.h> (cutilSafeCall, CUT_SAFE_CALL,
 * cutCreateTimer, cutilExit). That SDK helper header is not shipped with
 * current CUDA toolkits, so it is replaced here by a local CUDA_CHECK macro
 * and CUDA event timing from the runtime API.
 */
#include <stdio.h>
#include <malloc.h>
#include <stdlib.h>
#include <time.h>
#include <cuda_runtime.h>

#define MATRIX_SIZE 1024 /* number of elements along one matrix edge */
#define BLOCK_SIZE 16    /* threads along one edge of a square thread block */

/* Abort with file/line and the CUDA error string if a runtime call fails. */
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

__global__ void matrixMul(int* inMatrixA, int* inMatrixB, int* inMatrixC);

/*
 * Host driver: fills A and B with pseudo-random values, copies them to the
 * device, runs matrixMul, copies C back, and prints the elapsed time of
 * kernel + device-to-host copy in milliseconds.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on allocation or CUDA errors.
 */
int main(int argc, char** argv){
    const size_t matrixSize = sizeof(int) * MATRIX_SIZE * MATRIX_SIZE;

    (void)argc;
    (void)argv;

    /* Host buffers. The result buffer is allocated up front so the host
     * malloc is not part of the timed region. */
    int* hMatrixA = (int*)malloc(matrixSize);
    int* hMatrixB = (int*)malloc(matrixSize);
    int* hMatrixC = (int*)malloc(matrixSize);
    if (hMatrixA == NULL || hMatrixB == NULL || hMatrixC == NULL) {
        fprintf(stderr, "Failed to allocate host matrices\n");
        free(hMatrixA);   /* free(NULL) is a no-op */
        free(hMatrixB);
        free(hMatrixC);
        return EXIT_FAILURE;
    }

    /* Initial values for the two input matrices. */
    unsigned int row, col;
    for (row = 0; row < MATRIX_SIZE; row++){
        for (col = 0; col < MATRIX_SIZE; col++){
            hMatrixA[row * MATRIX_SIZE + col] = rand() % (1024*1024);
            hMatrixB[row * MATRIX_SIZE + col] = rand() % (1024*1024);
        }
    }

    /* Device-side matrices. */
    int* dMatrixA = NULL;
    int* dMatrixB = NULL;
    int* dMatrixC = NULL;

    /* Allocate device memory and upload the inputs. */
    CUDA_CHECK(cudaMalloc((void**)&dMatrixA, matrixSize));
    CUDA_CHECK(cudaMemcpy(dMatrixA, hMatrixA, matrixSize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc((void**)&dMatrixB, matrixSize));
    CUDA_CHECK(cudaMemcpy(dMatrixB, hMatrixB, matrixSize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc((void**)&dMatrixC, matrixSize));

    /* One thread per output element; ceil-div so the grid still covers the
     * matrix if MATRIX_SIZE is not a multiple of BLOCK_SIZE. */
    dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid((MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE,
              (MATRIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE);

    /* Create the timing events and start measuring. */
    cudaEvent_t startEvent, stopEvent;
    CUDA_CHECK(cudaEventCreate(&startEvent));
    CUDA_CHECK(cudaEventCreate(&stopEvent));
    CUDA_CHECK(cudaEventRecord(startEvent, 0));

    /* Launch the kernel. */
    matrixMul<<<grid, block>>>(dMatrixA, dMatrixB, dMatrixC);
    CUDA_CHECK(cudaGetLastError());        /* launch-configuration errors */
    CUDA_CHECK(cudaDeviceSynchronize());   /* errors raised while the kernel ran */

    /* Download the result from the device. */
    CUDA_CHECK(cudaMemcpy(hMatrixC, dMatrixC, matrixSize, cudaMemcpyDeviceToHost));

    /* Stop the timer and print the elapsed time. */
    CUDA_CHECK(cudaEventRecord(stopEvent, 0));
    CUDA_CHECK(cudaEventSynchronize(stopEvent));
    float elapsedMs = 0.0f;
    CUDA_CHECK(cudaEventElapsedTime(&elapsedMs, startEvent, stopEvent));
    printf("Processing time: %f (msec)\n", elapsedMs);
    CUDA_CHECK(cudaEventDestroy(startEvent));
    CUDA_CHECK(cudaEventDestroy(stopEvent));

    /* Release host and device memory. */
    free(hMatrixA);
    free(hMatrixB);
    free(hMatrixC);
    CUDA_CHECK(cudaFree(dMatrixA));
    CUDA_CHECK(cudaFree(dMatrixB));
    CUDA_CHECK(cudaFree(dMatrixC));

    /* Tear down the device context. */
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}

/*
 * Computes one element of C = A * B per thread.
 *
 * Launch layout: 2D grid of 2D blocks covering at least
 * MATRIX_SIZE x MATRIX_SIZE threads; x indexes the output column and
 * y the output row. Threads outside the matrix return immediately.
 * No shared memory is used, so no block-level barrier is required.
 *
 * inMatrixA, inMatrixB: device pointers to the row-major inputs.
 * inMatrixC:            device pointer to the row-major output.
 */
__global__ void matrixMul(int* inMatrixA, int* inMatrixB, int* inMatrixC){
    /* x varies fastest across neighbouring threads, so it is mapped to the
     * column: neighbouring threads then touch adjacent elements of B and C. */
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int scan_idx;
    unsigned int target = 0;

    if (row >= MATRIX_SIZE || col >= MATRIX_SIZE) {
        return;
    }

    /* Dot product of row `row` of A with column `col` of B. Operands are
     * multiplied as unsigned so the wrap-around of large products is well
     * defined rather than a signed overflow. */
    for (scan_idx = 0; scan_idx < MATRIX_SIZE; scan_idx++) {
        target += (unsigned int)inMatrixA[row * MATRIX_SIZE + scan_idx]
                * (unsigned int)inMatrixB[scan_idx * MATRIX_SIZE + col];
    }
    inMatrixC[row * MATRIX_SIZE + col] = target;
}
# CPU version: plain C, nvcc simply forwards it to the host compiler.
nvcc -I/usr/include/malloc -o cpu matrix_cpu.c
# GPU version: nvcc treats a .c file as host-only C, so force CUDA compilation
# with -x cu (or rename the file to matrix_gpu.cu).
nvcc -x cu -I/usr/include/malloc -o gpu matrix_gpu.c
GPU版(matrix_gpu.c)の方はビルドできない。
CUDAのバージョンが上がって、ヘッダファイルの構成が変わっているらしい(おそらく cutil_inline.h が見つからないのが原因)。