From b0175c9fb68883362f5230c3361643d11af84f02 Mon Sep 17 00:00:00 2001
From: Taewon Kim
Date: Fri, 15 Aug 2025 00:20:56 +0000
Subject: [PATCH] add default log rotation for Slurm daemon logs

---
Review notes (text in this section is ignored when the patch is applied):
  * Line structure was rebuilt from the hunk headers; blank context lines
    and indentation are reconstructed and should be confirmed against the
    target tree before applying.
  * enable_slurm_log_rotation.sh: the here-document operator was missing
    ("cat <>FILE" opens FILE read/write on stdin and writes nothing). It is
    restored as "cat <<EOF >FILE"; the truncating redirect keeps the script
    idempotent, so a re-run cannot leave a duplicate stanza that logrotate
    would reject.

 .../LifecycleScripts/base-config/config.py    |  2 ++
 .../base-config/lifecycle_script.py           |  3 +++
 .../utils/enable_slurm_log_rotation.sh        | 27 +++++++++++++++++++
 3 files changed, 32 insertions(+)
 create mode 100755 1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh

diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py
index 1aee3b103..8d74c9ea3 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/config.py
@@ -32,6 +32,8 @@ class Config:
 
     # Set true if you want to use FSx OpenZFS in addition to FSxL.
     enable_fsx_openzfs = False
+    # Set false if you want to disable log rotation of Slurm daemon logs
+    enable_slurm_log_rotation = True
 
     s3_bucket = "" # required when enable_mount_s3 = True, replace with your actual data bucket name in quotes, ie. "my-dataset-bucket"
 
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py
index a3c68ac13..bb5992d2b 100644
--- a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/lifecycle_script.py
@@ -271,6 +271,9 @@ def main(args):
         if Config.enable_mount_s3:
             ExecuteBashScript("./utils/mount-s3.sh").run(Config.s3_bucket)
 
+        if Config.enable_slurm_log_rotation:
+            ExecuteBashScript("./utils/enable_slurm_log_rotation.sh").run()
+
     print("[INFO]: Success: All provisioning scripts completed")
 
 
diff --git a/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh
new file mode 100755
index 000000000..e2042e3c4
--- /dev/null
+++ b/1.architectures/5.sagemaker-hyperpod/LifecycleScripts/base-config/utils/enable_slurm_log_rotation.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+LOGROTATE_CONF_FILEPATH="/etc/logrotate.d/sagemaker-hyperpod-slurm"
+
+echo "[$(hostname)] Adding Slurm log rotation configuration to ${LOGROTATE_CONF_FILEPATH}"
+
+cat <<EOF >${LOGROTATE_CONF_FILEPATH}
+"/var/log/slurm/*.log" {
+    rotate 2
+    size 50M
+    copytruncate
+    nocompress
+
+    missingok
+    nodelaycompress
+    nomail
+    notifempty
+    noolddir
+    sharedscripts
+    postrotate
+        pkill -x --signal SIGUSR2 slurmctld
+        pkill -x --signal SIGUSR2 slurmd
+        pkill -x --signal SIGUSR2 slurmdbd
+        exit 0
+    endscript
+}
+EOF