@@ -39,8 +39,10 @@ void GPU_fdinfo::find_fd()
39
39
if (!driver.empty () && driver == module) {
40
40
total++;
41
41
SPDLOG_DEBUG (
42
- " driver = \" {}\" , pdev = \" {}\" , client_id = \" {}\" , client_id_exists = \" {}\" " ,
43
- driver, pdev, client_id, client_ids.find (client_id) != client_ids.end ()
42
+ " driver = \" {}\" , pdev = \" {}\" , "
43
+ " client_id = \" {}\" , client_id_exists = \" {}\" " ,
44
+ driver, pdev,
45
+ client_id, client_ids.find (client_id) != client_ids.end ()
44
46
);
45
47
}
46
48
@@ -55,7 +57,11 @@ void GPU_fdinfo::find_fd()
55
57
open_fdinfo_fd (fd_path);
56
58
}
57
59
58
- SPDLOG_DEBUG (" Found {} total fds. Opened {} unique fds." , total, fdinfo.size ());
60
+ SPDLOG_DEBUG (
61
+ " Found {} total fds. Opened {} unique fds." ,
62
+ total,
63
+ fdinfo.size ()
64
+ );
59
65
}
60
66
61
67
void GPU_fdinfo::open_fdinfo_fd (std::string path) {
@@ -109,73 +115,108 @@ float GPU_fdinfo::get_memory_used()
109
115
return (float )total / 1024 / 1024 ;
110
116
}
111
117
112
- void GPU_fdinfo::find_intel_hwmon ()
118
+ void GPU_fdinfo::find_hwmon ()
113
119
{
114
- std::string device = " /sys/bus/pci/devices/" ;
115
- device += pci_dev;
116
- device += " /hwmon" ;
120
+ std::string device = " /sys/bus/pci/devices/" + pci_dev + " /hwmon" ;
117
121
118
122
if (!fs::exists (device)) {
119
- SPDLOG_DEBUG (" Intel hwmon directory {} doesn't exist." , device);
123
+ SPDLOG_DEBUG (" hwmon: hwmon directory {} doesn't exist." , device);
120
124
return ;
121
125
}
122
126
123
127
auto dir_iterator = fs::directory_iterator (device);
124
128
auto hwmon = dir_iterator->path ().string ();
125
129
126
130
if (hwmon.empty ()) {
127
- SPDLOG_DEBUG (" Intel hwmon directory is empty." );
131
+ SPDLOG_DEBUG (" hwmon: hwmon directory is empty." );
128
132
return ;
129
133
}
130
134
131
- hwmon += module == " i915" ? " /energy1_input" : " /energy2_input" ;
135
+ for (const auto &entry : fs::directory_iterator (hwmon)) {
136
+ auto filename = entry.path ().filename ().string ();
132
137
133
- if (!fs::exists (hwmon)) {
134
- SPDLOG_DEBUG (" Intel hwmon: file {} doesn't exist." , hwmon);
135
- return ;
138
+ for (auto & hs : hwmon_sensors) {
139
+ auto key = hs.first ;
140
+ auto sensor = &hs.second ;
141
+ std::smatch matches;
142
+
143
+ if (
144
+ !std::regex_match (filename, matches, sensor->rx ) ||
145
+ matches.size () != 2
146
+ )
147
+ continue ;
148
+
149
+ auto cur_id = std::stoull (matches[1 ].str ());
150
+
151
+ if (sensor->filename .empty () || cur_id < sensor->id ) {
152
+ sensor->filename = entry.path ().string ();
153
+ sensor->id = cur_id;
154
+ }
155
+ }
136
156
}
137
157
138
- SPDLOG_DEBUG (" Intel hwmon found: hwmon = {}" , hwmon);
158
+ for (auto & hs : hwmon_sensors) {
159
+ auto key = hs.first ;
160
+ auto sensor = &hs.second ;
161
+
162
+ if (sensor->filename .empty ()) {
163
+ SPDLOG_DEBUG (" hwmon: {} reading not found at {}" , key, hwmon);
164
+ continue ;
165
+ }
139
166
140
- energy_stream. open ( hwmon);
167
+ SPDLOG_DEBUG ( " hwmon: {} reading found at {} " , key, sensor-> filename );
141
168
142
- if (!energy_stream.good ())
143
- SPDLOG_DEBUG (" Intel hwmon: failed to open {}" , hwmon);
169
+ sensor->stream .open (sensor->filename );
144
170
145
- // Initialize value for the first time, otherwise delta will be very large
146
- // and your gpu power usage will be like 1 million watts for a second.
147
- this ->last_power = get_current_power ();
171
+ if (!sensor->stream .good ()) {
172
+ SPDLOG_DEBUG (
173
+ " hwmon: failed to open {} reading {}" ,
174
+ key, sensor->filename
175
+ );
176
+ continue ;
177
+ }
178
+ }
148
179
}
149
180
150
- float GPU_fdinfo::get_current_power ()
181
+ void GPU_fdinfo::get_current_hwmon_readings ()
151
182
{
152
- if (!energy_stream.is_open ())
153
- return 0 .f ;
183
+ for (auto & hs : hwmon_sensors) {
184
+ auto key = hs.first ;
185
+ auto sensor = &hs.second ;
154
186
155
- std::string energy_input_str;
156
- uint64_t energy_input;
157
-
158
- energy_stream.seekg (0 );
187
+ if (!sensor->stream .is_open ())
188
+ continue ;
159
189
160
- std::getline (energy_stream, energy_input_str );
190
+ sensor-> stream . seekg ( 0 );
161
191
162
- if (energy_input_str. empty ())
163
- return 0 . f ;
192
+ std::stringstream ss;
193
+ ss << sensor-> stream . rdbuf () ;
164
194
165
- energy_input = std::stoull (energy_input_str);
195
+ if (ss.str ().empty ())
196
+ continue ;
166
197
167
- return (float )energy_input / 1'000'000 ;
198
+ sensor->val = std::stoull (ss.str ());
199
+ }
168
200
}
169
201
170
202
float GPU_fdinfo::get_power_usage ()
171
203
{
172
- float now = get_current_power ();
204
+ if (!hwmon_sensors[" power" ].filename .empty ())
205
+ return (float )hwmon_sensors[" power" ].val / 1'000'000 ;
206
+
207
+ float now = hwmon_sensors[" energy" ].val ;
208
+
209
+ // Initialize value for the first time, otherwise delta will be very large
210
+ // and your gpu power usage will be like 1 million watts for a second.
211
+ if (this ->last_power == 0 .f )
212
+ this ->last_power = now;
213
+
173
214
float delta = now - this ->last_power ;
174
215
delta /= (float )METRICS_UPDATE_PERIOD_MS / 1000 ;
175
216
176
217
this ->last_power = now;
177
218
178
- return delta;
219
+ return delta / 1'000'000 ;
179
220
}
180
221
181
222
int GPU_fdinfo::get_xe_load ()
@@ -349,15 +390,26 @@ void GPU_fdinfo::main_thread()
349
390
cond_var.wait (lock, [this ]() { return !paused || stop_thread; });
350
391
351
392
gather_fdinfo_data ();
393
+ get_current_hwmon_readings ();
352
394
353
395
metrics.load = get_gpu_load ();
354
396
metrics.memoryUsed = get_memory_used ();
355
397
metrics.powerUsage = get_power_usage ();
356
398
metrics.CoreClock = get_gpu_clock ();
399
+ metrics.temp = hwmon_sensors[" temp" ].val / 1000 ;
400
+ metrics.fan_speed = hwmon_sensors[" fan_speed" ].val ;
401
+ metrics.voltage = hwmon_sensors[" voltage" ].val ;
402
+ metrics.fan_rpm = true ; // Fan data is pulled from hwmon
357
403
358
404
SPDLOG_DEBUG (
359
- " pci_dev = {}, pid = {}, module = {}, load = {}, mem = {}, power = {}" ,
360
- pci_dev, pid, module, metrics.load , metrics.memoryUsed , metrics.powerUsage
405
+ " pci_dev = {}, pid = {}, module = {}, "
406
+ " load = {}, mem = {}, power = {}, "
407
+ " core = {}, temp = {}, fan = {}, "
408
+ " voltage = {}" ,
409
+ pci_dev, pid, module,
410
+ metrics.load , metrics.memoryUsed , metrics.powerUsage ,
411
+ metrics.CoreClock , metrics.temp , metrics.fan_speed ,
412
+ metrics.voltage
361
413
);
362
414
363
415
std::this_thread::sleep_for (
0 commit comments