Skip to content

Commit 5f8ee8a

Browse files
17314642 and retrixe committed
gpu_fdinfo: add driver-agnostic hwmon support
This commit removes the hardcoded hwmon sensor IDs that were specific to the i915 and Xe KMDs, thus making `find_intel_hwmon` vendor-independent (it is renamed to `find_hwmon`). Instead, it iterates through the available hwmon sensor files and, for each sensor type, selects the first available sensor (the one with the lowest ID). This should allow fan speed and temp monitoring to work out of the box on Xe DRM in the future as well, once Xe DRM exposes the necessary interfaces via hwmon. Co-authored-by: Ibrahim Ansari <ansari.ibrahim1@gmail.com>
1 parent d09f284 commit 5f8ee8a

File tree

2 files changed

+113
-41
lines changed

2 files changed

+113
-41
lines changed

src/gpu_fdinfo.cpp

+88-36
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,10 @@ void GPU_fdinfo::find_fd()
3939
if (!driver.empty() && driver == module) {
4040
total++;
4141
SPDLOG_DEBUG(
42-
"driver = \"{}\", pdev = \"{}\", client_id = \"{}\", client_id_exists = \"{}\"",
43-
driver, pdev, client_id, client_ids.find(client_id) != client_ids.end()
42+
"driver = \"{}\", pdev = \"{}\", "
43+
"client_id = \"{}\", client_id_exists = \"{}\"",
44+
driver, pdev,
45+
client_id, client_ids.find(client_id) != client_ids.end()
4446
);
4547
}
4648

@@ -55,7 +57,11 @@ void GPU_fdinfo::find_fd()
5557
open_fdinfo_fd(fd_path);
5658
}
5759

58-
SPDLOG_DEBUG("Found {} total fds. Opened {} unique fds.", total, fdinfo.size());
60+
SPDLOG_DEBUG(
61+
"Found {} total fds. Opened {} unique fds.",
62+
total,
63+
fdinfo.size()
64+
);
5965
}
6066

6167
void GPU_fdinfo::open_fdinfo_fd(std::string path) {
@@ -109,73 +115,108 @@ float GPU_fdinfo::get_memory_used()
109115
return (float)total / 1024 / 1024;
110116
}
111117

112-
void GPU_fdinfo::find_intel_hwmon()
118+
void GPU_fdinfo::find_hwmon()
113119
{
114-
std::string device = "/sys/bus/pci/devices/";
115-
device += pci_dev;
116-
device += "/hwmon";
120+
std::string device = "/sys/bus/pci/devices/" + pci_dev + "/hwmon";
117121

118122
if (!fs::exists(device)) {
119-
SPDLOG_DEBUG("Intel hwmon directory {} doesn't exist.", device);
123+
SPDLOG_DEBUG("hwmon: hwmon directory {} doesn't exist.", device);
120124
return;
121125
}
122126

123127
auto dir_iterator = fs::directory_iterator(device);
124128
auto hwmon = dir_iterator->path().string();
125129

126130
if (hwmon.empty()) {
127-
SPDLOG_DEBUG("Intel hwmon directory is empty.");
131+
SPDLOG_DEBUG("hwmon: hwmon directory is empty.");
128132
return;
129133
}
130134

131-
hwmon += module == "i915" ? "/energy1_input" : "/energy2_input";
135+
for (const auto &entry : fs::directory_iterator(hwmon)) {
136+
auto filename = entry.path().filename().string();
132137

133-
if (!fs::exists(hwmon)) {
134-
SPDLOG_DEBUG("Intel hwmon: file {} doesn't exist.", hwmon);
135-
return;
138+
for (auto& hs : hwmon_sensors) {
139+
auto key = hs.first;
140+
auto sensor = &hs.second;
141+
std::smatch matches;
142+
143+
if (
144+
!std::regex_match(filename, matches, sensor->rx) ||
145+
matches.size() != 2
146+
)
147+
continue;
148+
149+
auto cur_id = std::stoull(matches[1].str());
150+
151+
if (sensor->filename.empty() || cur_id < sensor->id) {
152+
sensor->filename = entry.path().string();
153+
sensor->id = cur_id;
154+
}
155+
}
136156
}
137157

138-
SPDLOG_DEBUG("Intel hwmon found: hwmon = {}", hwmon);
158+
for (auto& hs : hwmon_sensors) {
159+
auto key = hs.first;
160+
auto sensor = &hs.second;
161+
162+
if (sensor->filename.empty()) {
163+
SPDLOG_DEBUG("hwmon: {} reading not found at {}", key, hwmon);
164+
continue;
165+
}
139166

140-
energy_stream.open(hwmon);
167+
SPDLOG_DEBUG("hwmon: {} reading found at {}", key, sensor->filename);
141168

142-
if (!energy_stream.good())
143-
SPDLOG_DEBUG("Intel hwmon: failed to open {}", hwmon);
169+
sensor->stream.open(sensor->filename);
144170

145-
// Initialize value for the first time, otherwise delta will be very large
146-
// and your gpu power usage will be like 1 million watts for a second.
147-
this->last_power = get_current_power();
171+
if (!sensor->stream.good()) {
172+
SPDLOG_DEBUG(
173+
"hwmon: failed to open {} reading {}",
174+
key, sensor->filename
175+
);
176+
continue;
177+
}
178+
}
148179
}
149180

150-
float GPU_fdinfo::get_current_power()
181+
void GPU_fdinfo::get_current_hwmon_readings()
151182
{
152-
if (!energy_stream.is_open())
153-
return 0.f;
183+
for (auto& hs : hwmon_sensors) {
184+
auto key = hs.first;
185+
auto sensor = &hs.second;
154186

155-
std::string energy_input_str;
156-
uint64_t energy_input;
157-
158-
energy_stream.seekg(0);
187+
if (!sensor->stream.is_open())
188+
continue;
159189

160-
std::getline(energy_stream, energy_input_str);
190+
sensor->stream.seekg(0);
161191

162-
if (energy_input_str.empty())
163-
return 0.f;
192+
std::stringstream ss;
193+
ss << sensor->stream.rdbuf();
164194

165-
energy_input = std::stoull(energy_input_str);
195+
if (ss.str().empty())
196+
continue;
166197

167-
return (float)energy_input / 1'000'000;
198+
sensor->val = std::stoull(ss.str());
199+
}
168200
}
169201

170202
float GPU_fdinfo::get_power_usage()
171203
{
172-
float now = get_current_power();
204+
if (!hwmon_sensors["power"].filename.empty())
205+
return (float)hwmon_sensors["power"].val / 1'000'000;
206+
207+
float now = hwmon_sensors["energy"].val;
208+
209+
// Initialize value for the first time, otherwise delta will be very large
210+
// and your gpu power usage will be like 1 million watts for a second.
211+
if (this->last_power == 0.f)
212+
this->last_power = now;
213+
173214
float delta = now - this->last_power;
174215
delta /= (float)METRICS_UPDATE_PERIOD_MS / 1000;
175216

176217
this->last_power = now;
177218

178-
return delta;
219+
return delta / 1'000'000;
179220
}
180221

181222
int GPU_fdinfo::get_xe_load()
@@ -349,15 +390,26 @@ void GPU_fdinfo::main_thread()
349390
cond_var.wait(lock, [this]() { return !paused || stop_thread; });
350391

351392
gather_fdinfo_data();
393+
get_current_hwmon_readings();
352394

353395
metrics.load = get_gpu_load();
354396
metrics.memoryUsed = get_memory_used();
355397
metrics.powerUsage = get_power_usage();
356398
metrics.CoreClock = get_gpu_clock();
399+
metrics.temp = hwmon_sensors["temp"].val / 1000;
400+
metrics.fan_speed = hwmon_sensors["fan_speed"].val;
401+
metrics.voltage = hwmon_sensors["voltage"].val;
402+
metrics.fan_rpm = true; // Fan data is pulled from hwmon
357403

358404
SPDLOG_DEBUG(
359-
"pci_dev = {}, pid = {}, module = {}, load = {}, mem = {}, power = {}",
360-
pci_dev, pid, module, metrics.load, metrics.memoryUsed, metrics.powerUsage
405+
"pci_dev = {}, pid = {}, module = {}, "
406+
"load = {}, mem = {}, power = {}, "
407+
"core = {}, temp = {}, fan = {}, "
408+
"voltage = {}",
409+
pci_dev, pid, module,
410+
metrics.load, metrics.memoryUsed, metrics.powerUsage,
411+
metrics.CoreClock, metrics.temp, metrics.fan_speed,
412+
metrics.voltage
361413
);
362414

363415
std::this_thread::sleep_for(

src/gpu_fdinfo.h

+25-5
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,32 @@
11
#pragma once
2+
#include <cstdint>
23
#include <filesystem.h>
34
#include <inttypes.h>
45
#include <sys/stat.h>
6+
#include <sys/types.h>
57
#include <unistd.h>
68
#include <thread>
9+
710
#ifdef TEST_ONLY
811
#include <../src/mesa/util/os_time.h>
912
#else
1013
#include <mesa/util/os_time.h>
1114
#endif
15+
1216
#include "gpu_metrics_util.h"
1317
#include <atomic>
1418
#include <spdlog/spdlog.h>
1519
#include <map>
1620
#include <set>
21+
#include <regex>
22+
23+
struct hwmon_sensor {
24+
std::regex rx;
25+
std::ifstream stream;
26+
std::string filename;
27+
unsigned char id = 0;
28+
uint64_t val = 0;
29+
};
1730

1831
class GPU_fdinfo {
1932
private:
@@ -33,7 +46,8 @@ class GPU_fdinfo {
3346
mutable std::mutex metrics_mutex;
3447

3548
std::vector<std::ifstream> fdinfo;
36-
std::ifstream energy_stream;
49+
50+
std::map<std::string, hwmon_sensor> hwmon_sensors;
3751

3852
std::string drm_engine_type = "EMPTY";
3953
std::string drm_memory_type = "EMPTY";
@@ -56,8 +70,9 @@ class GPU_fdinfo {
5670

5771
float get_memory_used();
5872

59-
void find_intel_hwmon();
60-
float get_current_power();
73+
void find_hwmon();
74+
void get_current_hwmon_readings();
75+
6176
float get_power_usage();
6277
float last_power = 0;
6378

@@ -106,8 +121,13 @@ class GPU_fdinfo {
106121
drm_engine_type, drm_memory_type
107122
);
108123

109-
if (module == "i915" || module == "xe")
110-
find_intel_hwmon();
124+
hwmon_sensors["voltage"] = { .rx = std::regex("in(\\d+)_input") };
125+
hwmon_sensors["fan_speed"] = { .rx = std::regex("fan(\\d+)_input") };
126+
hwmon_sensors["temp"] = { .rx = std::regex("temp(\\d+)_input") };
127+
hwmon_sensors["power"] = { .rx = std::regex("power(\\d+)_input") };
128+
hwmon_sensors["energy"] = { .rx = std::regex("energy(\\d+)_input") };
129+
130+
find_hwmon();
111131

112132
if (module == "i915")
113133
find_i915_gt_dir();

0 commit comments

Comments
 (0)