1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | #include <stdio.h> #include <stdlib.h> #include <sys/time.h> #include <cuda_runtime.h> #define WIDTH 1536 #define TILE_W 32 float M[WIDTH][WIDTH] = {0}; float N[WIDTH][WIDTH] = {0}; float P[WIDTH][WIDTH] = {0}; float MxN[WIDTH][WIDTH] = {0}; __global__ void MatMulKernel(float *Ad, float *Bd, float *AxBd, int SIZE, int iter); void MatMul(float *M, float *N, float *P, int width); int main(int argc, char *argv[]) { int width = WIDTH; int pass = 1; for (int i = 0; i < width; ++i) { for (int j = 0; j < width; ++j) { M[i][j] = rand() % 30; N[i][j] = rand() % 30; } } struct timeval starttime, endtime; gettimeofday(&starttime, NULL); for (int i = 0; i < width; ++i) { for (int j = 0; j < width; ++j) { for (int k = 0; k < width; ++k) { MxN[i][j] += M[i][k] * N[k][j]; } } } gettimeofday(&endtime, NULL); double executime; executime = (endtime.tv_sec - starttime.tv_sec) * 1000.0; executime += (endtime.tv_usec - starttime.tv_usec) / 1000.0; printf("CPU time: %13lf msec\n", executime); MatMul((float *)M, (float *)N, (float *)P, width); for (int i = 0; i < width; ++i) { for (int j = 0; j < width; ++j) { if(MxN[i][j] != P[i][j]) { printf("MxN[%d][%d] = %2.0f P[%d][%d] = %2.0f\n", i, j, MxN[i][j], i, j, P[i][j]); pass = 0; } } } printf("Test %s\n", (pass)?"PASSED":"FAILED"); return 0; } __global__ void MatMulKernel(float *Ad, float *Bd, float *AxBd, int SIZE, int iter){ int i = threadIdx.y + blockIdx.y*blockDim.y; int j = threadIdx.x + blockIdx.x*blockDim.x; // 我個人比較喜歡上述寫法 int k; float AxB_ij = 0.0f; // 沒有初始化的話, iter=0時會有問題 if(iter == 0){ if (i < SIZE && j < SIZE) { // 超出SIZE的部份不執行,否則答案會錯 for(k=0; k<SIZE; k++){ AxB_ij += Ad[i*SIZE + k] * Bd[k*SIZE +j]; } AxBd[i*SIZE +j] = AxB_ij; } } else { while (i<SIZE){ j = threadIdx.x + blockIdx.x*blockDim.x; // i遞增完之後, 要記得重設j的初始位置 while (j<SIZE){ AxB_ij = 0.0f; for(k=0; k<SIZE; k++){ AxB_ij += Ad[i*SIZE+k] * Bd[k*SIZE+j]; } AxBd[i*SIZE +j] = AxB_ij; j += blockDim.x * gridDim.x; } i += blockDim.y * gridDim.y; } } } // Matrix multiplication - Host code void MatMul(float *M, float *N, float *P, int width) { size_t size = width * width * sizeof(float); float *Md, *Nd, *Pd; // Allocate and Load M, N to device memory cudaMalloc((void **)&Md, size); cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice); cudaMalloc((void **)&Nd, size); cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice); // Allocate P on the device cudaMalloc((void **)&Pd, size); // Setup the execution configuration int ratio = width / TILE_W; int residue = width % TILE_W; int kerIter; int blockSize, thSize; if (ratio < TILE_W){ if(residue==0) blockSize = (int) ratio; else blockSize = ((int) (width / TILE_W)) + 1; kerIter = 0; }else{ blockSize = TILE_W; kerIter = 1; } dim3 block(blockSize, blockSize); if(TILE_W > width) thSize = width; else thSize = TILE_W; dim3 threads(thSize, thSize); // Get start time event cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); // Invoke kernel MatMulKernel<<<block, threads>>>(Md, Nd, Pd, width, kerIter); cudaError_t cuda_err = cudaGetLastError(); if ( cudaSuccess != cuda_err ){ printf("before kernel call: error = %s\n", cudaGetErrorString (cuda_err)); exit(1) ; } // Get stop time event cudaEventRecord(stop, 0); cudaEventSynchronize(stop); // Compute execution time float elapsedTime; cudaEventElapsedTime(&elapsedTime, start, stop); printf("GPU time: %13f msec\n", elapsedTime); cudaEventDestroy(start); cudaEventDestroy(stop); // Read P from device memory cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost); // Free device memory cudaFree(Md); cudaFree(Nd); cudaFree(Pd); } |
Direct link: https://paste.plurk.com/show/408923