-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmax_gflops.cu
122 lines (103 loc) · 6.25 KB
/
max_gflops.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#include "cuda_utils.h"
/*
nvcc --generate-line-info --keep --keep-dir build -O2 max_gflops.cu
################## 91% peak performance is reached ##################
## because we use (32,32) as thread block size
## we need M >= 32*(num_SM) = 512 to fill all SM work to do and reach peak performance
$ M=1024 K=4096000 N=32 ./a.exe
+0.000 ms cuda_utils.h:20 CUDADevice() cudaSetDevice(0)
cudaGetDeviceProperties(..., 0 ) :
totalGlobalMem : 8589672448 (8191 MB)
sharedMemPerBlock : 49152
regsPerBlock : 65536
warpSize : 32
memPitch : 2147483647
maxThreadsPerBlock : 1024
totalConstMem : 65536
major : 6
minor : 1
clockRate : 1645000(KHz)
multiProcessorCount : 16 (each SM has 128 CUDA-cores)
kernelExecTimeoutEnabled: 1
integrated : 0
canMapHostMemory : 1
computeMode : 0
... peak performance : 6.738(TFLOP/s)
ENV: M = 1024
ENV: K = 4096000
ENV: N = 32
+103.462 ms max_gflops.cu:225 main() test_max_gflops(1024,32,4096000)
gridDim=dim3{1, 32, 1} blockDim=dim3{32, 32, 1}
ENV: CUDATIMEIT =
cuda_timeit #0 test_max_gflops:119 sgemm_max_gflops x 3 0(bytes) 4294967296000(flops)
[AutoCUDATimer # 0] @host 5.750 ms | @device (+ 0.000 us) 734.235 ms 5.850 TFLOP/s sgemm_max_gflops (test_max_gflops:0) 0.000 MB 4294.967 Gflops
[AutoCUDATimer # 0] @host 3.900 us | @device (+ 3.072 us) 699.881 ms 6.137 TFLOP/s sgemm_max_gflops (test_max_gflops:0) 0.000 MB 4294.967 Gflops
[AutoCUDATimer # 0] @host 1.800 us | @device (+ 2.048 us) 699.818 ms 6.137 TFLOP/s sgemm_max_gflops (test_max_gflops:0) 0.000 MB 4294.967 Gflops
+2.136 sec max_gflops.cu:142 test_max_gflops() ==========SM statistics:==========
+0.022 ms max_gflops.cu:149 test_max_gflops() SM 0 clock_range = 439668072864~440811145023 duration: 1143072159 elements: 2048 FMA:117 (FMA/cycle)
+0.013 ms max_gflops.cu:149 test_max_gflops() SM 1 clock_range = 441001454700~442144564657 duration: 1143109957 elements: 2048 FMA:117 (FMA/cycle)
+0.009 ms max_gflops.cu:149 test_max_gflops() SM 2 clock_range = 440748883407~441891746127 duration: 1142862720 elements: 2048 FMA:117 (FMA/cycle)
+0.008 ms max_gflops.cu:149 test_max_gflops() SM 3 clock_range = 440444968141~441588077414 duration: 1143109273 elements: 2048 FMA:117 (FMA/cycle)
+0.008 ms max_gflops.cu:149 test_max_gflops() SM 4 clock_range = 439668072844~440811101779 duration: 1143028935 elements: 2048 FMA:117 (FMA/cycle)
+0.008 ms max_gflops.cu:149 test_max_gflops() SM 5 clock_range = 441001454690~442144563310 duration: 1143108620 elements: 2048 FMA:117 (FMA/cycle)
+0.008 ms max_gflops.cu:149 test_max_gflops() SM 6 clock_range = 440748883389~441891895607 duration: 1143012218 elements: 2048 FMA:117 (FMA/cycle)
+0.008 ms max_gflops.cu:149 test_max_gflops() SM 7 clock_range = 440444968133~441588097121 duration: 1143128988 elements: 2048 FMA:117 (FMA/cycle)
+0.008 ms max_gflops.cu:149 test_max_gflops() SM 8 clock_range = 439668072867~440811221853 duration: 1143148986 elements: 2048 FMA:117 (FMA/cycle)
+0.008 ms max_gflops.cu:149 test_max_gflops() SM 9 clock_range = 441001454712~442144533160 duration: 1143078448 elements: 2048 FMA:117 (FMA/cycle)
+0.008 ms max_gflops.cu:149 test_max_gflops() SM 10 clock_range = 440748883420~441891767179 duration: 1142883759 elements: 2048 FMA:117 (FMA/cycle)
+0.007 ms max_gflops.cu:149 test_max_gflops() SM 11 clock_range = 440444968163~441588103576 duration: 1143135413 elements: 2048 FMA:117 (FMA/cycle)
+0.007 ms max_gflops.cu:149 test_max_gflops() SM 12 clock_range = 439668072851~440811098886 duration: 1143026035 elements: 2048 FMA:117 (FMA/cycle)
+0.007 ms max_gflops.cu:149 test_max_gflops() SM 13 clock_range = 441001454711~442144599108 duration: 1143144397 elements: 2048 FMA:117 (FMA/cycle)
+0.007 ms max_gflops.cu:149 test_max_gflops() SM 14 clock_range = 440748883410~441891787516 duration: 1142904106 elements: 2048 FMA:117 (FMA/cycle)
+0.007 ms max_gflops.cu:149 test_max_gflops() SM 15 clock_range = 440444968157~441588130951 duration: 1143162794 elements: 2048 FMA:117 (FMA/cycle)
+0.125 ms max_gflops.cu:151 test_max_gflops() clock_overall_dur = 1143162794
+0.015 ms max_gflops.cu:152 test_max_gflops() GPU_avg_frequency = 1.60712 (GHz)
+0.008 ms max_gflops.cu:153 test_max_gflops() average FMA = 117.409 (FMA/SM/cycle)
+0.007 ms max_gflops.cu:154 test_max_gflops() average FMA = 3019.05 (GFMA/s)
+11.174 ms misc.hpp:535 ~ChromeTraceDumpper() dumpped 169.52(KB) to ct.json
+0.519 ms cuda_utils.h:46 ~CUDADevice() cudaDeviceReset()
*/
#define FMACNT 32
// mapping between threadIdx & (x,y) are transposed
__global__ void sgemm_max_gflops(thread_info * tinfo, size_t M, size_t N, size_t K) {
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (y < M && x < N) {
float tmp[FMACNT] = {0};
float a = y;
float b = x;
auto off = tinfo->start();
for (int k = 0; k < K; ++k) {
for(int c = 0; c < FMACNT; c++)
tmp[c] = fma(a, b, tmp[c]);
}
tinfo->end(off);
// to prevent optimization
if (tmp[0] == 1.2134f) {
float csum = 0;
for(int c = 0; c < FMACNT; c++)
csum += tmp[c];
printf("%f", csum);
}
}
}
void test_max_gflops(size_t M, size_t N, size_t K) {
tensorND<thread_info> C({M, N}, thread_info());
dim3 gridDim(CEIL_DIV(N, WRAP_SIZE), CEIL_DIV(M, WRAP_SIZE));
dim3 blockDim(WRAP_SIZE, WRAP_SIZE);
std::cout << "gridDim=" << gridDim << " blockDim=" << blockDim << std::endl;
auto flops = M*N*K*2.0*FMACNT;
auto avg_dur_ns = cuda_timeit([&](int i, std::stringstream& ss){
sgemm_max_gflops<<<gridDim, blockDim>>>(C.to_gpu(), M, N, K);
}, __func__, __LINE__, "sgemm_max_gflops", 0, flops, 3);
thread_info::dump(C.to_cpu(), C.numel(), avg_dur_ns, 0, K*2.0*FMACNT);
}
int main() {
CUDADevice dev(0);
auto M = getenv("M", 512);
auto K = getenv("K", 4096000);
auto N = getenv("N", 32);
ECOUT("test_max_gflops(", M, ",", N, ",", K, ")");
test_max_gflops(M, N, K);
return 0;
}