-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbasic.cu
124 lines (108 loc) · 4.13 KB
/
basic.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#include "cuda_utils.h"
#include "Windows.h"
// https://stackoverflow.com/questions/44337309/whats-the-most-efficient-way-to-calculate-the-warp-id-lane-id-in-a-1-d-grid
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#special-registers
// The first problem - as @Patwie suggests - is that %warp_id does not give you what you actually want
// it's not the index of the warp in the context of the grid, but rather in the context of the physical SM
// (which can hold so many warps resident at a time), and those two are not the same.
//
// thus it's actually warp scheduler id instead of warp id.
// that's why we can observe over-subcribing using this example:
//
__forceinline__ __device__ unsigned laneid()
{
unsigned ret;
asm volatile ("mov.u32 %0, %laneid;" : "=r"(ret));
return ret;
}
__forceinline__ __device__ unsigned warpid()
{
// this is not equal to threadIdx.x / 32
unsigned ret;
asm volatile ("mov.u32 %0, %warpid;" : "=r"(ret));
return ret;
}
__forceinline__ __device__ unsigned nwarpid()
{
// this is not equal to threadIdx.x / 32
unsigned ret;
asm volatile ("mov.u32 %0, %nwarpid;" : "=r"(ret));
return ret;
}
struct thread_info {
unsigned blockIdx_x;
unsigned threadIdx_x;
unsigned warpid;
unsigned nwarpid;
unsigned laneid;
};
__global__ void kernel(thread_info * tinfo, int val, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
tinfo[i].blockIdx_x = blockIdx.x; // PTX: ctaid.x
tinfo[i].threadIdx_x = threadIdx.x; // PTX: tid.x
tinfo[i].warpid = warpid(); // PTX: warpid
tinfo[i].nwarpid = nwarpid(); // PTX: nwarpid
tinfo[i].laneid = laneid(); // PTX: laneid
/*
printf("src=%p, blockDim(%d,%d,%d) blockIdx.threadIdx(%d.%d,%d.%d,%d.%d) i=%d wrap.lane=%u.%u\n", src,
blockDim.x, blockDim.y, blockDim.z,
blockIdx.x, threadIdx.x,
blockIdx.y, threadIdx.y,
blockIdx.z, threadIdx.z, i,
warpid(), laneid());
*/
}
bool is_bad_read_ptr(void* src) {
// use SEH to detect bad Host pointer
// https://learn.microsoft.com/en-us/cpp/cpp/try-except-statement?view=msvc-170
__try{
*reinterpret_cast<char*>(src) = 1;
} __except (EXCEPTION_EXECUTE_HANDLER) {
std::cerr << "\tSEH happens on accessing int @ 0x" << std::hex << src << " from host!" << std::endl;
return true;
}
return false;
}
int main()
{
// Choose which GPU to run on, change this on a multi-GPU system.
ASSERT(cudaSetDevice(0) == cudaSuccess);
const int N = 16*(64*32);
const int sz = N*sizeof(thread_info); // 4MB
void *tinfo;
int val = 0;
cudaMalloc(&tinfo, sz);
std::cout << "cudaMalloc " << sz << " bytes @ 0x" << std::hex << tinfo << " is_bad_read_ptr()=" << is_bad_read_ptr(tinfo) << std::dec << std::endl;
cudaMemset(tinfo, 0, sz);
kernel << <16*2, 32*32>> > (reinterpret_cast<thread_info*>(tinfo), val, N);
ASSERT(cudaDeviceSynchronize() == cudaSuccess);
auto* ptinfo = new thread_info[N];
ASSERT(cudaMemcpy(ptinfo, tinfo, sz, cudaMemcpyDeviceToHost) == cudaSuccess);
cudaFree(tinfo);
std::cout << "nwarpid (Wraps per SM): " << ptinfo[0].nwarpid << std::endl;
for(int i = 0; i < N; i+=32) {
std::cout << "[" << i << "]: ";
std::cout << ptinfo[i].blockIdx_x << "." << std::fixed << ptinfo[i].threadIdx_x;
std::cout << " @ \t";
bool laneId_expected = true;
for(int k = i; k < i+32; k++) {
if (ptinfo[k].laneid != k-i) {
laneId_expected = false;
}
}
if (laneId_expected) {
std::cout << ptinfo[i].warpid << ": 0~31\n";
} else {
for(int k = i; k < i+32; k++) {
std::cout << ptinfo[k].warpid << "." << ptinfo[k].laneid << " ";
}
std::cout << std::endl;
}
}
TIMEIT_FINISH();
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
ASSERT(cudaDeviceReset() == cudaSuccess);
return 0;
}