1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda_runtime.h>

#define WIDTH 1536   // matrix dimension: all matrices are WIDTH x WIDTH
#define TILE_W 32    // threads per block dimension (TILE_W x TILE_W = 1024 threads/block)

// Host-side matrices, file-scope so they live in static storage rather than
// on the stack (each one is WIDTH*WIDTH*4 bytes = ~9 MB).
// M, N: inputs; P: GPU result; MxN: CPU reference result.
float M[WIDTH][WIDTH] = {0};
float N[WIDTH][WIDTH] = {0};
float P[WIDTH][WIDTH] = {0};
float MxN[WIDTH][WIDTH] = {0};

// Device kernel: AxBd = Ad * Bd for SIZE x SIZE row-major matrices
// (iter selects the indexing scheme chosen by the host launcher).
__global__ void MatMulKernel(float *Ad, float *Bd, float *AxBd, int SIZE, int iter);
// Host wrapper: copies M, N to the device, runs the kernel, copies result into P.
void MatMul(float *M, float *N, float *P, int width);


int main(int argc, char *argv[])
{
    const int width = WIDTH;
    int pass = 1;

    // Fill both input matrices with small pseudo-random integers (0..29).
    for (int row = 0; row < width; ++row) {
        for (int col = 0; col < width; ++col) {
            M[row][col] = rand() % 30;
            N[row][col] = rand() % 30;
        }
    }

    // CPU reference: naive triple-loop multiply, timed with gettimeofday.
    struct timeval starttime, endtime;
    gettimeofday(&starttime, NULL);
    for (int row = 0; row < width; ++row) {
        for (int col = 0; col < width; ++col) {
            float sum = 0.0f;  // same float accumulation order as a direct +=
            for (int k = 0; k < width; ++k) {
                sum += M[row][k] * N[k][col];
            }
            MxN[row][col] = sum;
        }
    }
    gettimeofday(&endtime, NULL);
    double executime = (endtime.tv_sec - starttime.tv_sec) * 1000.0
                     + (endtime.tv_usec - starttime.tv_usec) / 1000.0;
    printf("CPU time: %13lf msec\n", executime);

    // GPU multiply; result lands in P.
    MatMul((float *)M, (float *)N, (float *)P, width);

    // Element-wise comparison. Exact float equality is well-defined here:
    // every entry is an integer-valued float below 2^24 (sums of 1536
    // products, each < 900), so both sides are computed exactly.
    for (int row = 0; row < width; ++row) {
        for (int col = 0; col < width; ++col) {
            if (MxN[row][col] != P[row][col]) {
                printf("MxN[%d][%d] = %2.0f   P[%d][%d] = %2.0f\n",
                       row, col, MxN[row][col], row, col, P[row][col]);
                pass = 0;
            }
        }
    }

    printf("Test %s\n", (pass) ? "PASSED" : "FAILED");

    return 0;
}

// Computes AxBd = Ad * Bd for square SIZE x SIZE row-major matrices.
//
// Grid/block layout: 2D grid of 2D blocks; each thread starts at output
// element (row0, col0) and walks the matrix with a 2D grid-stride loop,
// so correctness does not depend on the launch configuration. This
// subsumes the original's two duplicated paths: when the grid covers the
// matrix (the host's iter = 0 case) every live thread runs exactly one
// iteration and out-of-range threads skip the loop entirely; when the
// grid is smaller (iter = 1) threads stride across the remainder.
// `iter` is retained only so the declared interface is unchanged.
__global__ void MatMulKernel(float *Ad, float *Bd, float *AxBd, int SIZE, int iter){
    (void)iter;  // launch-config hint from the host; no longer needed (see above)

    const int row0 = threadIdx.y + blockIdx.y * blockDim.y;
    const int col0 = threadIdx.x + blockIdx.x * blockDim.x;
    const int rowStride = blockDim.y * gridDim.y;
    const int colStride = blockDim.x * gridDim.x;

    for (int i = row0; i < SIZE; i += rowStride) {
        for (int j = col0; j < SIZE; j += colStride) {
            float acc = 0.0f;
            for (int k = 0; k < SIZE; k++) {
                // Bd[k*SIZE + j] is contiguous across threadIdx.x (coalesced);
                // Ad[i*SIZE + k] is the same address for the whole row of threads.
                acc += Ad[i * SIZE + k] * Bd[k * SIZE + j];
            }
            AxBd[i * SIZE + j] = acc;
        }
    }
}

// Matrix multiplication - Host code
// Matrix multiplication - Host code.
// Copies the width x width row-major matrices M and N to the device,
// launches MatMulKernel, prints the kernel time measured with CUDA events,
// and copies the product back into P. Exits the process on any CUDA error.
void MatMul(float *M, float *N, float *P, int width)
{
    // Local error-check helper: CUDA runtime calls fail silently unless
    // their cudaError_t return value is inspected, and an earlier sticky
    // error would otherwise surface as garbage results much later.
#define CHECK(call)                                                         \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

    if (width <= 0)   // a 0-sized matrix would produce an invalid launch config
        return;

    // Widen before multiplying so width*width cannot overflow int.
    size_t size = (size_t)width * (size_t)width * sizeof(float);
    float *Md, *Nd, *Pd;

    // Allocate and load M, N into device memory.
    CHECK(cudaMalloc((void **)&Md, size));
    CHECK(cudaMemcpy(Md, M, size, cudaMemcpyHostToDevice));

    CHECK(cudaMalloc((void **)&Nd, size));
    CHECK(cudaMemcpy(Nd, N, size, cudaMemcpyHostToDevice));

    // Allocate P on the device.
    CHECK(cudaMalloc((void **)&Pd, size));

    // Execution configuration: blocks of TILE_W x TILE_W threads (shrunk to
    // width x width for tiny matrices). The grid is capped at TILE_W x TILE_W
    // blocks; when that covers the whole matrix kerIter = 0 (one element per
    // thread), otherwise kerIter = 1 and the kernel strides across the rest.
    int ratio = width / TILE_W;
    int residue = width % TILE_W;
    int kerIter;
    int blockSize, thSize;
    if (ratio < TILE_W) {
        // Ceil-divide so a partial tile at the edge still gets a block.
        blockSize = (residue == 0) ? ratio : ratio + 1;
        kerIter = 0;
    } else {
        blockSize = TILE_W;
        kerIter = 1;
    }
    dim3 block(blockSize, blockSize);

    thSize = (TILE_W > width) ? width : TILE_W;
    dim3 threads(thSize, thSize);

    // Time the kernel with CUDA events.
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    CHECK(cudaEventRecord(start, 0));

    // Invoke kernel.
    MatMulKernel<<<block, threads>>>(Md, Nd, Pd, width, kerIter);
    cudaError_t cuda_err = cudaGetLastError();
    if (cudaSuccess != cuda_err) {
        // This catches launch-configuration errors; the original message
        // ("before kernel call") mislabeled where the failure happened.
        printf("kernel launch failed: error = %s\n", cudaGetErrorString(cuda_err));
        exit(1);
    }

    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));  // also waits for the kernel to finish
    // Compute and report execution time.
    float elapsedTime;
    CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("GPU time: %13f msec\n", elapsedTime);
    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    // Read P from device memory (blocking copy, so the result is ready).
    CHECK(cudaMemcpy(P, Pd, size, cudaMemcpyDeviceToHost));

    // Free device memory.
    CHECK(cudaFree(Md));
    CHECK(cudaFree(Nd));
    CHECK(cudaFree(Pd));
#undef CHECK
}