Skip to content

Commit 0ca8464

Browse files
committed
Merge branch 'main' into dev-mode-timetravel
* main: Dev mode (#637) Hotfix: resolutions -> resolution Adds skip_checks to providers (#566) Bump pytest from 7.4.3 to 7.4.4 (#639) Disabled second PSU provider for VM tests System check providers running (#619) Stderr is now by default UTF-8 (#624) Refactored kill/killpg mechanism to be unified and actually fail on n… (#625) Command fix. Must be list append Refactorings Moved tinyproxy out of if clause Refactoring for error messages and security fix for path echoing (#636)
2 parents 7c5b35c + 807cdd4 commit 0ca8464

File tree

31 files changed

+312
-164
lines changed

31 files changed

+312
-164
lines changed

.github/workflows/tests-vm-main.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
name: 'Setup, Run, and Teardown Tests'
3535
uses: ./.github/actions/gmt-pytest
3636
with:
37-
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS'
37+
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS --providers PsuEnergyAcSdiaMachineProvider'
3838
github-token: ${{ secrets.GITHUB_TOKEN }}
3939

4040
- name: Eco CI Energy Estimation - Get Measurement

.github/workflows/tests-vm-pr.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ jobs:
2525
- name: 'Setup, Run, and Teardown Tests'
2626
uses: ./.github/actions/gmt-pytest
2727
with:
28-
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS'
28+
metrics-to-turn-off: '--categories RAPL Machine Sensors Debug CGroupV2 MacOS --providers PsuEnergyAcSdiaMachineProvider'
2929
github-token: ${{ secrets.GITHUB_TOKEN }}
3030

3131
- name: Eco CI Energy Estimation - Get Measurement

install_linux.sh

+2-4
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,12 @@ git submodule update --init
117117
print_message "Installing needed binaries for building ..."
118118
if lsb_release -is | grep -q "Fedora"; then
119119
sudo dnf -y install lm_sensors lm_sensors-devel glib2 glib2-devel tinyproxy lshw
120-
sudo systemctl stop tinyproxy
121-
sudo systemctl disable tinyproxy
122120
else
123121
sudo apt-get update
124122
sudo apt-get install -y lm-sensors libsensors-dev libglib2.0-0 libglib2.0-dev tinyproxy lshw
125-
sudo systemctl stop tinyproxy
126-
sudo systemctl disable tinyproxy
127123
fi
124+
sudo systemctl stop tinyproxy
125+
sudo systemctl disable tinyproxy
128126

129127
print_message "Building binaries ..."
130128
metrics_subdir="metric_providers"

lib/process_helpers.py

+21-29
Original file line numberDiff line numberDiff line change
@@ -2,36 +2,28 @@
22
import os
33
import subprocess
44

5-
def kill_ps(ps_to_kill):
6-
print('Killing processes')
7-
for ps_info in ps_to_kill:
8-
pid = ps_info['ps'].pid
9-
print(f"Trying to kill {ps_info['cmd']} with PID: {pid}")
10-
try:
11-
if ps_info['ps_group'] is True:
12-
try:
13-
ps_group_id = os.getpgid(pid)
14-
print(f" with process group {ps_group_id}")
15-
os.killpg(os.getpgid(pid), signal.SIGTERM)
16-
try:
17-
ps_info['ps'].wait(timeout=5)
18-
except subprocess.TimeoutExpired:
19-
# If the process hasn't gracefully exited after 5 seconds we kill it
20-
os.killpg(os.getpgid(pid), signal.SIGKILL)
21-
except ProcessLookupError:
22-
# process may be not have been in a process group
23-
print(f"Could not find process-group for {pid}")
24-
# always, just in case the calling process (typically the shell) did not die
25-
os.kill(pid, signal.SIGTERM)
26-
try:
27-
ps_info['ps'].wait(timeout=5)
28-
except subprocess.TimeoutExpired:
29-
# If the process hasn't gracefully exited after 5 seconds we kill it
30-
os.kill(pid, signal.SIGKILL)
5+
def kill_pg(ps, cmd):
6+
pgid = os.getpgid(ps.pid)
7+
print(f"Trying to kill {cmd} with PGID: {pgid}")
8+
9+
os.killpg(pgid, signal.SIGTERM)
10+
try:
11+
ps.wait(timeout=10)
12+
except subprocess.TimeoutExpired as exc:
13+
# If the process hasn't gracefully exited after 5 seconds we kill it
14+
os.killpg(pgid, signal.SIGKILL)
15+
raise RuntimeError(f"Killed the process {cmd} with SIGKILL. This could lead to corrupted data!") from exc
3116

32-
except ProcessLookupError:
33-
# process may already have ended or been killed in the process group
34-
print(f"Could not find process {pid}")
17+
def kill_ps(ps, cmd):
18+
pid = ps.pid
19+
print(f"Trying to kill {cmd} with PID: {pid}")
20+
21+
os.kill(pid, signal.SIGTERM)
22+
try:
23+
ps.wait(timeout=10)
24+
except subprocess.TimeoutExpired as exc:
25+
os.kill(pid, signal.SIGKILL)
26+
raise RuntimeError(f"Killed the process {cmd} with SIGKILL. This could lead to corrupted data!") from exc
3527

3628
# currently unused
3729
def timeout(process, cmd: str, duration: int):

lib/system_checks.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from enum import Enum
1313
import subprocess
1414
import psutil
15+
import locale
1516

1617
from psycopg import OperationalError as psycopg_OperationalError
1718

@@ -68,6 +69,8 @@ def check_docker_daemon():
6869
check=False, encoding='UTF-8')
6970
return result.returncode == 0
7071

72+
def check_utf_encoding():
73+
return locale.getpreferredencoding().lower() == sys.getdefaultencoding().lower() == 'utf-8'
7174

7275
######## END CHECK FUNCTIONS ########
7376

@@ -78,7 +81,8 @@ def check_docker_daemon():
7881
(check_free_disk, Status.ERROR, '1GB free hdd space', 'We recommend to free up some disk space'),
7982
(check_free_memory, Status.ERROR, 'free memory', 'No free memory! Please kill some programs'),
8083
(check_docker_daemon, Status.ERROR, 'docker daemon', 'The docker daemon could not be reached. Are you running in rootless mode or have added yourself to the docker group? See installation: [See https://docs.green-coding.berlin/docs/installation/]'),
81-
(check_containers_running, Status.WARN, 'Running containers', 'You have other containers running on the system. This is usually what you want in local development, but for undisturbed measurements consider going for a measurement cluster [See https://docs.green-coding.berlin/docs/installation/installation-cluster/].'),
84+
(check_containers_running, Status.WARN, 'running containers', 'You have other containers running on the system. This is usually what you want in local development, but for undisturbed measurements consider going for a measurement cluster [See https://docs.green-coding.berlin/docs/installation/installation-cluster/].'),
85+
(check_utf_encoding, Status.ERROR, 'utf file encoding', 'Your system encoding is not set to utf-8. This is needed as we need to parse console output.'),
8286

8387
]
8488

metric_providers/base.py

+32-22
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
import os
22
from pathlib import Path
33
import subprocess
4-
import signal
5-
import sys
64
from io import StringIO
75
import pandas
86

97
from lib.system_checks import ConfigurationCheckError
8+
from lib import process_helpers
109

1110
class MetricProviderConfigurationError(ConfigurationCheckError):
1211
pass
@@ -22,7 +21,8 @@ def __init__(
2221
current_dir,
2322
metric_provider_executable='metric-provider-binary',
2423
sudo=False,
25-
disable_buffer=True
24+
disable_buffer=True,
25+
skip_check=False,
2626
):
2727
self._metric_name = metric_name
2828
self._metrics = metrics
@@ -34,6 +34,7 @@ def __init__(
3434
self._has_started = False
3535
self._disable_buffer = disable_buffer
3636
self._rootless = None
37+
self._skip_check = skip_check
3738

3839
self._tmp_folder = '/tmp/green-metrics-tool'
3940
self._ps = None
@@ -43,16 +44,37 @@ def __init__(
4344

4445
self._filename = f"{self._tmp_folder}/{self._metric_name}.log"
4546

46-
self.check_system()
47+
if not self._skip_check:
48+
self.check_system()
4749

4850
# this is the default function that will be overridden in the children
4951
def check_system(self):
50-
pass
52+
cmd = ['pgrep', '-f', self._metric_provider_executable]
53+
result = subprocess.run(cmd,
54+
stdout=subprocess.PIPE,
55+
stderr=subprocess.PIPE,
56+
check=False, encoding='UTF-8')
57+
if result.returncode == 1:
58+
return True
59+
if result.returncode == 0:
60+
raise MetricProviderConfigurationError(f"Another instance of the {self._metric_name} metrics provider is already running on the system!\nPlease close it before running the Green Metrics Tool.")
61+
# implicit else
62+
raise subprocess.CalledProcessError(result.stderr, cmd)
5163

5264
# implemented as getter function and not direct access, so it can be overloaded
5365
# some child classes might not actually have _ps attribute set
66+
#
67+
# This function has to go through quite some hoops to read the stderr
68+
# The preferred way to communicate with processes is through communicate()
69+
# However this function ALWAYS waits for the process to terminate and it does not allow reading from processes
70+
# in chunks while they are running. Thus we we cannot set encoding='UTF-8' in Popen and must decode here.
5471
def get_stderr(self):
55-
return self._ps.stderr.read()
72+
stderr_read = ''
73+
if self._ps.stderr is not None:
74+
stderr_read = self._ps.stderr.read()
75+
if isinstance(stderr_read, bytes):
76+
stderr_read = stderr_read.decode('utf-8')
77+
return stderr_read
5678

5779
def has_started(self):
5880
return self._has_started
@@ -132,7 +154,9 @@ def start_profiling(self, containers=None):
132154
[call_string],
133155
shell=True,
134156
preexec_fn=os.setsid,
135-
stderr=subprocess.PIPE
157+
stderr=subprocess.PIPE,
158+
#encoding='UTF-8' # we cannot set this option here as reading later will then flake with "can't concat NoneType to bytes"
159+
# see get_stderr() for additional details
136160
# since we are launching the command with shell=True we cannot use ps.terminate() / ps.kill().
137161
# This would just kill the executing shell, but not it's child and make the process an orphan.
138162
# therefore we use os.setsid here and later call os.getpgid(pid) to get process group that the shell
@@ -146,20 +170,6 @@ def start_profiling(self, containers=None):
146170
def stop_profiling(self):
147171
if self._ps is None:
148172
return
149-
try:
150-
print(f"Killing process with id: {self._ps.pid}")
151-
ps_group_id = os.getpgid(self._ps.pid)
152-
print(f" and process group {ps_group_id}")
153-
os.killpg(ps_group_id, signal.SIGTERM)
154-
try:
155-
self._ps.wait(timeout=5)
156-
except subprocess.TimeoutExpired:
157-
# If the process hasn't gracefully exited after 5 seconds we kill it
158-
os.killpg(ps_group_id, signal.SIGKILL)
159-
print("Killed the process with SIGKILL. This could lead to corrupted metric log files!")
160-
161-
except ProcessLookupError:
162-
print(f"Could not find process-group for {self._ps.pid}",
163-
file=sys.stderr) # process/-group may have already closed
164173

174+
process_helpers.kill_pg(self._ps, self._metric_provider_executable)
165175
self._ps = None

metric_providers/cpu/energy/RAPL/MSR/component/provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
from metric_providers.base import BaseMetricProvider
44

55
class CpuEnergyRaplMsrComponentProvider(BaseMetricProvider):
6-
def __init__(self, resolution):
6+
def __init__(self, resolution, skip_check=False):
77
super().__init__(
88
metric_name='cpu_energy_rapl_msr_component',
99
metrics={'time': int, 'value': int, 'package_id': str},
1010
resolution=resolution,
1111
unit='mJ',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
13+
skip_check=skip_check,
1314
)

metric_providers/cpu/frequency/sysfs/core/provider.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,20 @@
33
from metric_providers.base import MetricProviderConfigurationError, BaseMetricProvider
44

55
class CpuFrequencySysfsCoreProvider(BaseMetricProvider):
6-
def __init__(self, resolution):
6+
def __init__(self, resolution, skip_check=False):
77
super().__init__(
88
metric_name='cpu_frequency_sysfs_core',
99
metrics={'time': int, 'value': int, 'core_id': int},
1010
resolution=0.001*resolution,
1111
unit='Hz',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
1313
metric_provider_executable='get-scaling-cur-freq.sh',
14+
skip_check=skip_check,
1415
)
1516

1617
def check_system(self):
18+
super().check_system()
19+
1720
file_path = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_cur_freq"
1821
if os.path.exists(file_path):
1922
try:

metric_providers/cpu/time/cgroup/container/provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33
from metric_providers.base import BaseMetricProvider
44

55
class CpuTimeCgroupContainerProvider(BaseMetricProvider):
6-
def __init__(self, resolution, rootless=False):
6+
def __init__(self, resolution, rootless=False, skip_check=False):
77
super().__init__(
88
metric_name='cpu_time_cgroup_container',
99
metrics={'time': int, 'value': int, 'container_id': str},
1010
resolution=resolution,
1111
unit='us',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
13+
skip_check=skip_check,
1314
)
1415
self._rootless = rootless

metric_providers/cpu/time/cgroup/system/provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
from metric_providers.base import BaseMetricProvider
44

55
class CpuTimeCgroupSystemProvider(BaseMetricProvider):
6-
def __init__(self, resolution):
6+
def __init__(self, resolution, skip_check=False):
77
super().__init__(
88
metric_name='cpu_time_cgroup_system',
99
metrics={'time': int, 'value': int},
1010
resolution=resolution,
1111
unit='us',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
13+
skip_check=skip_check,
1314
)

metric_providers/cpu/time/procfs/system/provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
from metric_providers.base import BaseMetricProvider
44

55
class CpuTimeProcfsSystemProvider(BaseMetricProvider):
6-
def __init__(self, resolution):
6+
def __init__(self, resolution, skip_check=False):
77
super().__init__(
88
metric_name='cpu_time_procfs_system',
99
metrics={'time': int, 'value': int},
1010
resolution=resolution,
1111
unit='us',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
13+
skip_check = skip_check
1314
)

metric_providers/cpu/utilization/cgroup/container/provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33
from metric_providers.base import BaseMetricProvider
44

55
class CpuUtilizationCgroupContainerProvider(BaseMetricProvider):
6-
def __init__(self, resolution, rootless=False):
6+
def __init__(self, resolution, rootless=False, skip_check=False):
77
super().__init__(
88
metric_name='cpu_utilization_cgroup_container',
99
metrics={'time': int, 'value': int, 'container_id': str},
1010
resolution=resolution,
1111
unit='Ratio',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
13+
skip_check = skip_check,
1314
)
1415
self._rootless = rootless

metric_providers/cpu/utilization/procfs/system/provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,12 @@
33
from metric_providers.base import BaseMetricProvider
44

55
class CpuUtilizationProcfsSystemProvider(BaseMetricProvider):
6-
def __init__(self, resolution):
6+
def __init__(self, resolution, skip_check=False):
77
super().__init__(
88
metric_name='cpu_utilization_procfs_system',
99
metrics={'time': int, 'value': int},
1010
resolution=resolution,
1111
unit='Ratio',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
13+
skip_check = skip_check,
1314
)

metric_providers/lm_sensors/abstract_provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def _create_options(self):
2222
return ['-c'] + [f"'{i}'" for i in provider_config['chips']] \
2323
+ ['-f'] + [f"'{i}'" for i in provider_config['features']]
2424

25-
def __init__(self, metric_name, resolution, unit):
25+
def __init__(self, metric_name, resolution, unit, skip_check=False):
2626
if __name__ == '__main__':
2727
# If you run this on the command line you will need to set this in the config
2828
# This is separate so it is always clear what config is used.
@@ -35,5 +35,6 @@ def __init__(self, metric_name, resolution, unit):
3535
resolution=resolution,
3636
unit=unit,
3737
current_dir=os.path.dirname(os.path.abspath(__file__)),
38+
skip_check=skip_check,
3839
)
3940
self._extra_switches = self._create_options()

metric_providers/memory/energy/RAPL/MSR/component/provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44

55

66
class MemoryEnergyRaplMsrComponentProvider(BaseMetricProvider):
7-
def __init__(self, resolution):
7+
def __init__(self, resolution, skip_check=False):
88
super().__init__(
99
metric_name='memory_energy_rapl_msr_component',
1010
metrics={'time': int, 'value': int, 'package_id': str},
1111
resolution=resolution,
1212
unit='mJ',
1313
current_dir=os.path.dirname(os.path.abspath(__file__)),
14+
skip_check=skip_check,
1415
)
1516
self._extra_switches = ['-d']

metric_providers/memory/total/cgroup/container/provider.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,13 @@
33
from metric_providers.base import BaseMetricProvider
44

55
class MemoryTotalCgroupContainerProvider(BaseMetricProvider):
6-
def __init__(self, resolution, rootless=False):
6+
def __init__(self, resolution, rootless=False, skip_check=False):
77
super().__init__(
88
metric_name='memory_total_cgroup_container',
99
metrics={'time': int, 'value': int, 'container_id': str},
1010
resolution=resolution,
1111
unit='Bytes',
1212
current_dir=os.path.dirname(os.path.abspath(__file__)),
13+
skip_check=skip_check,
1314
)
1415
self._rootless = rootless

0 commit comments

Comments
 (0)