From cbd27c8fbcc3b9efc4e79c4eb1aedde80e90af84 Mon Sep 17 00:00:00 2001
From: Dhvani Sheth
Date: Wed, 6 Mar 2024 10:26:25 -0800
Subject: [PATCH 01/36] add a prompt when removing nodes from the cluster

---
 bin/remove_nodes_prompt.txt |  4 ++++
 bin/resize.sh               | 20 +++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 bin/remove_nodes_prompt.txt

diff --git a/bin/remove_nodes_prompt.txt b/bin/remove_nodes_prompt.txt
new file mode 100644
index 00000000..dcc6792d
--- /dev/null
+++ b/bin/remove_nodes_prompt.txt
@@ -0,0 +1,4 @@
+Does your cluster run a file system such as Ceph or NFS on the GPU/HPC nodes themselves, using their local NVMe SSDs?
+If yes, terminating the nodes that store your data can result in permanent data loss, so before proceeding make sure any important data is copied to a persistent file system outside of the cluster, such as object storage or file storage.
+Once the data is backed up or migrated, come back and rerun the script. For now, select 2 to exit.
+Remember: once the nodes are terminated, all data on them is lost forever and cannot be recovered.
\ No newline at end of file
diff --git a/bin/resize.sh b/bin/resize.sh
index 92dea986..1080ca25 100755
--- a/bin/resize.sh
+++ b/bin/resize.sh
@@ -15,6 +15,12 @@ then
 	exit
 fi
 
+if [ $USER != "ubuntu" ] && [ $USER != "opc" ]
+then
+	echo "Run this script as opc or ubuntu"
+	exit
+fi
+
 if [ $# -eq 0 ]
 then
 	python3 $folder/resize.py --help
@@ -51,6 +57,18 @@ for (( i=1; i<=$#; i++)); do
   fi
 done
 
+if [ $resize_type == "remove" ] || [ $resize_type == "remove_unreachable" ]
+then
+	echo "$(cat $folder/remove_nodes_prompt.txt)"
+	echo "Do you confirm that you have completed all of the steps above and wish to proceed with terminating the nodes? Enter 1 for Yes or 2 for No (to exit)."
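+	# The select menu below accepts 1 (Yes) to continue with terminating the nodes, or 2 (No) to exit without removing anything.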
+ select yn in "Yes" "No"; do + case $yn in + Yes ) break;; + No ) exit;; + esac + done +fi + if [ $resize_type != "default" ] then if [ $permanent -eq 0 ] @@ -148,5 +166,5 @@ then rm currently_resizing fi else - python3 $folder/resize.py ${@} + python3 $folder/resize.py ${@} & fi From da709849b6a7e77d1558219e3f5ec95db7684b34 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 15 Mar 2024 17:03:33 -0700 Subject: [PATCH 02/36] in progress for adding silent mode --- autoscaling/crontab/autoscale_slurm.sh | 6 +++--- bin/delete_cluster.sh | 2 +- bin/resize.py | 1 + bin/resize.sh | 6 +++++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index 9882b44c..b08e639e 100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -364,7 +364,7 @@ try: initial_nodes=[] unreachable_nodes=[] if cluster_name == "NOCLUSTERFOUND": - subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name]) + subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name],'--quiet') continue for node in nodes_to_destroy[cluster_name]: try: @@ -376,9 +376,9 @@ try: except: unreachable_nodes.append(node) if len(initial_nodes) > 0: - subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes) + subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes,'--quiet') if len(unreachable_nodes) > 0: - subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes) + subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes,'--quiet') time.sleep(1) for index,cluster in enumerate(cluster_to_build): diff --git a/bin/delete_cluster.sh b/bin/delete_cluster.sh index 7328a206..983e10a3 100755 --- a/bin/delete_cluster.sh +++ b/bin/delete_cluster.sh @@ -103,7 +103,7 @@ else for node in `scontrol show hostname $nodes 2>&1` do echo "Cleaning up node " $node - /opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node + /opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node --quiet done fi cd diff --git a/bin/resize.py b/bin/resize.py index 9525fee4..8710b61f 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -577,6 +577,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index parser.add_argument('--force', help='If present. Nodes will be removed even if the destroy playbook failed',action='store_true',default=False) parser.add_argument('--ansible_crucial', help='If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. 
Non live nodes will be removed',action='store_true',default=False) parser.add_argument('--remove_unreachable', help='If present, nodes that are not sshable will be terminated before running the action that was requested (Example Adding a node) ',action='store_true',default=False) +parser.add_argument('--quiet', help='If present, the script will not prompt for a response when removing nodes and will not give a reminder to save data from nodes that are being removed ',action='store_true',default=False) args = parser.parse_args() diff --git a/bin/resize.sh b/bin/resize.sh index 1080ca25..cbcb4232 100755 --- a/bin/resize.sh +++ b/bin/resize.sh @@ -32,6 +32,7 @@ permanent=1 controllerName=`hostname` cluster_name=${controllerName/-controller/} nodes=NULL +quietMode=False for (( i=1; i<=$#; i++)); do if [ ${!i} == "--cluster_name" ] then @@ -54,10 +55,13 @@ for (( i=1; i<=$#; i++)); do then j=$((i+1)) nodes=${@:j} + elif [ ${!i} == "--quiet" ] + then + quietMode=True fi done -if [ $resize_type == "remove" ] || [ $resize_type == "remove_unreachable" ] +if [ $resize_type == "remove" ] || [ $resize_type == "remove_unreachable" ] && [ $quietMode == "False" ] then echo "$(cat $folder/remove_nodes_prompt.txt)" echo "Do you confirm you have done all of the above steps and wish to proceed for the termination of the nodes? Enter 1 for Yes and 2 for No (to exit)." From 599bf403f9da4736658bfc8e99bf9dfa93d4e14e Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 15 Mar 2024 17:10:43 -0700 Subject: [PATCH 03/36] fix for resize.sh remove failing when no nfs is defined --- bin/resize.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/resize.py b/bin/resize.py index 9525fee4..9c9f7217 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -447,7 +447,10 @@ def getNFSnode(inventory): return '' if len(dict['nfs']) == 0: return '' - return dict['nfs'][0].split()[0] + if dict['nfs'][0] == '\n': + return '' + else: + return dict['nfs'][0].split()[0] def get_summary(comp_ocid,cluster_name): CN = "CN" From 39b1db4abded7af025ca88552c09645f4b3caf6a Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 19 Mar 2024 13:38:37 -0700 Subject: [PATCH 04/36] added the check for autoscaling value in /etc/ansible/hosts --- autoscaling/crontab/autoscale_slurm.sh | 224 +++++++++++++------------ 1 file changed, 120 insertions(+), 104 deletions(-) diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index b08e639e..8ccde4ef 100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -91,8 +91,8 @@ def getIdleTime(node): return ( datetime.datetime.now() - right_time ).total_seconds() # Get the last time a node state was changed. 
This is used to get how long a cluster has been idle for -def getQueueConf(file): - with open(queues_conf_file) as file: +def getQueueConf(queue_file): + with open(queue_file) as file: try: data = yaml.load(file,Loader=yaml.FullLoader) except: @@ -328,109 +328,125 @@ def getstatus_slurm(): cluster_destroying.append(clusterName) return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes -if os.path.isfile(lockfile): - print( "Lockfile "+lockfile + " is present, exiting" ) - exit() -open(lockfile,'w').close() -try: - path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0]))) - clusters_path = os.path.join(path,'clusters') - config = getQueueConf(queues_conf_file) - - cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm() - - print (time.strftime("%Y-%m-%d %H:%M:%S")) - print (cluster_to_build,'cluster_to_build') - print (cluster_to_destroy,'cluster_to_destroy') - print (nodes_to_destroy,'nodes_to_destroy') - print (cluster_building,'cluster_building') - print (cluster_destroying,'cluster_destroying') - print (current_nodes,'current_nodes') - print (building_nodes,'building_nodes') - - for i in cluster_building: - for j in cluster_to_build: - if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]: - cluster_to_build.remove(j) - break - for cluster in cluster_to_destroy: - cluster_name=cluster[0] - print ("Deleting cluster "+cluster_name) - subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name]) - time.sleep(5) - - for cluster_name in nodes_to_destroy.keys(): - print ("Resizing cluster "+cluster_name) - initial_nodes=[] - unreachable_nodes=[] - if cluster_name == "NOCLUSTERFOUND": - subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name],'--quiet') - continue - for node in nodes_to_destroy[cluster_name]: +def getAutoscaling(): + out = subprocess.Popen(["cat /etc/ansible/hosts | grep 'autoscaling =' | awk -F '= ' '{print $2}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + output = stdout.split("\n") + autoscaling_value=False + for i in range(0,len(output)-1): + autoscaling_value=output[i] + return autoscaling_value + +autoscaling = getAutoscaling() + +if autoscaling == "true": + + if os.path.isfile(lockfile): + print( "Lockfile "+lockfile + " is present, exiting" ) + exit() + open(lockfile,'w').close() + try: + path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0]))) + clusters_path = os.path.join(path,'clusters') + config = getQueueConf(queues_conf_file) + + cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm() + + print (time.strftime("%Y-%m-%d %H:%M:%S")) + print (cluster_to_build,'cluster_to_build') + print (cluster_to_destroy,'cluster_to_destroy') + print (nodes_to_destroy,'nodes_to_destroy') + print (cluster_building,'cluster_building') + print (cluster_destroying,'cluster_destroying') + print (current_nodes,'current_nodes') + print (building_nodes,'building_nodes') + + for i in cluster_building: + for j in cluster_to_build: + if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]: + cluster_to_build.remove(j) + break + for cluster in cluster_to_destroy: + cluster_name=cluster[0] + print ("Deleting cluster "+cluster_name) + subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name]) + 
time.sleep(5) + + for cluster_name in nodes_to_destroy.keys(): + print ("Resizing cluster "+cluster_name) + initial_nodes=[] + unreachable_nodes=[] + if cluster_name == "NOCLUSTERFOUND": + subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name],'--quiet') + continue + for node in nodes_to_destroy[cluster_name]: + try: + alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True) + for alt_name in alt_names.split("\n")[0].split(): + if alt_name.startswith('inst-'): + initial_nodes.append(alt_name) + break + except: + unreachable_nodes.append(node) + if len(initial_nodes) > 0: + subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes,'--quiet') + if len(unreachable_nodes) > 0: + subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes,'--quiet') + time.sleep(1) + + for index,cluster in enumerate(cluster_to_build): + nodes=cluster[0] + instance_type = cluster[1] + queue=cluster[2] + jobID=str(cluster[3]) + user=str(cluster[4]) + jobconfig=getJobConfig(config,queue,instance_type) + limits=getQueueLimits(config,queue,instance_type) try: - alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True) - for alt_name in alt_names.split("\n")[0].split(): - if alt_name.startswith('inst-'): - initial_nodes.append(alt_name) - break + clusterCount=len(used_index[queue][instance_type]) except: - unreachable_nodes.append(node) - if len(initial_nodes) > 0: - subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes,'--quiet') - if len(unreachable_nodes) > 0: - subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes,'--quiet') - time.sleep(1) - - for index,cluster in enumerate(cluster_to_build): - nodes=cluster[0] - instance_type = cluster[1] - queue=cluster[2] - jobID=str(cluster[3]) - user=str(cluster[4]) - jobconfig=getJobConfig(config,queue,instance_type) - limits=getQueueLimits(config,queue,instance_type) - try: - clusterCount=len(used_index[queue][instance_type]) - except: - clusterCount=0 - if clusterCount>=limits["max_cluster_count"]: - print ("This would go over the number of running clusters, you have reached the max number of clusters") - continue - nextIndex=None - if clusterCount==0: - if queue in used_index.keys(): - used_index[queue][instance_type]=[1] + clusterCount=0 + if clusterCount>=limits["max_cluster_count"]: + print ("This would go over the number of running clusters, you have reached the max number of clusters") + continue + nextIndex=None + if clusterCount==0: + if queue in used_index.keys(): + used_index[queue][instance_type]=[1] + else: + used_index[queue]={instance_type:[1]} + nextIndex=1 else: - used_index[queue]={instance_type:[1]} - nextIndex=1 - else: - for i in range(1,10000): - if not i in used_index[queue][instance_type]: - nextIndex=i - used_index[queue][instance_type].append(i) - break - clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"] - if not queue in current_nodes.keys(): - current_nodes[queue]={instance_type:0} - else: - if not instance_type in current_nodes[queue].keys(): - current_nodes[queue][instance_type]=0 - if not queue in building_nodes.keys(): - building_nodes[queue]={instance_type:0} - else: - if not 
instance_type in building_nodes[queue].keys(): - building_nodes[queue][instance_type]=0 - if nodes > limits["max_cluster_size"]: - print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit") - elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]: - print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit") - else: - current_nodes[queue][instance_type]+=nodes - clusterCount+=1 - print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes") - subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user]) - time.sleep(5) + for i in range(1,10000): + if not i in used_index[queue][instance_type]: + nextIndex=i + used_index[queue][instance_type].append(i) + break + clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"] + if not queue in current_nodes.keys(): + current_nodes[queue]={instance_type:0} + else: + if not instance_type in current_nodes[queue].keys(): + current_nodes[queue][instance_type]=0 + if not queue in building_nodes.keys(): + building_nodes[queue]={instance_type:0} + else: + if not instance_type in building_nodes[queue].keys(): + building_nodes[queue][instance_type]=0 + if nodes > limits["max_cluster_size"]: + print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit") + elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]: + print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit") + else: + current_nodes[queue][instance_type]+=nodes + clusterCount+=1 + print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes") + subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user]) + time.sleep(5) -except Exception: - traceback.print_exc() -os.remove(lockfile) + except Exception: + traceback.print_exc() + os.remove(lockfile) +else: + print("Autoscaling is false") + exit() From 2d8e3bacbc922ef0a949f1e4faac1b4a1000d5d4 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 19 Mar 2024 15:06:35 -0700 Subject: [PATCH 05/36] Update Readme --- README.md | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6320d108..3685991c 100644 --- a/README.md +++ b/README.md @@ -104,12 +104,12 @@ optional arguments: --ansible_crucial If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. Non live nodes will be removed - --remove_unreachable If present, nodes that are not sshable will be removed - from the config. They will however not be removed from - Slurm to avoid losing track of the down nodes. If you - need to remove them from Slurm after terminating the - nodes in the console. 
Run sudo scontrol update - nodename=name state=Future + --remove_unreachable If present, nodes that are not sshable will be terminated + before running the action that was requested + (Example Adding a node) + --quiet If present, the script will not prompt for a response when + removing nodes and will not give a reminder to save data + from nodes that are being removed ``` **Add nodes** @@ -161,6 +161,13 @@ Remove 3 nodes randomly from compute-1-hpc: ``` /opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc +``` +or +Remove 3 nodes randomly from compute-1-hpc but do not prompt for a response when removing the nodes and do not give a reminder to save data +from nodes that are being removed : +``` +/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --quiet + ``` **Reconfigure nodes** @@ -208,6 +215,10 @@ Uncomment the line in `crontab -e`: ``` * * * * * /opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm.log 2>&1 ``` +And in /etc/ansible/hosts, below value should be true +``` +autoscaling = true +``` # Submit How to submit jobs: From 48eb7cc2de46583e43950c1bb4f9129886400167 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Mon, 25 Mar 2024 15:27:01 -0600 Subject: [PATCH 06/36] Fix visibility of Private Zone Name --- schema.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/schema.yaml b/schema.yaml index be651072..e48577fd 100755 --- a/schema.yaml +++ b/schema.yaml @@ -1062,8 +1062,11 @@ variables: title: Private Zone Name description: "The zone needs to be private for the stack to be able to add entries" type: string - visible: ${use_existing_vcn} - required: true + visible: + and: + - ${dns_entries} + - ${use_existing_vcn} + required: ${dns_entries} vcn_subnet: type: string title: "VCN IP range" From cfb330d4ff7ed48144a50f5757661868726a030d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:06:59 -0600 Subject: [PATCH 07/36] Update terraform provider version --- autoscaling/tf_init/versions.tf | 2 +- versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 57e63004..28a169ec 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "5.30.0" + version = "5.34.0" } } } \ No newline at end of file diff --git a/versions.tf b/versions.tf index 57e63004..28a169ec 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "5.30.0" + version = "5.34.0" } } } \ No newline at end of file From eb84c37c7e42a9ef867199d0959346364e7315bb Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:08:33 -0600 Subject: [PATCH 08/36] Add healthcheck in prolog --- autoscaling/tf_init/controller_update.tf | 3 +- autoscaling/tf_init/inventory.tpl | 3 +- conf/variables.tpl | 1 + controller.tf | 6 +- inventory.tpl | 3 +- playbooks/new_nodes.yml | 2 + playbooks/resize_add.yml | 2 + .../healthchecks/files/check_h100_setup.py | 507 ++++++++++++++++++ .../roles/healthchecks/files/gpu_bw_test.py | 167 ++++++ .../healthchecks/files/rdma_link_flapping.py | 171 ++++++ .../healthchecks/files/shared_logging.py | 5 + .../roles/healthchecks/files/xid_checker.py | 200 
+++++++ playbooks/roles/healthchecks/tasks/main.yml | 22 + playbooks/roles/slurm/files/healthchecks.sh | 11 + playbooks/roles/slurm/tasks/common.yml | 20 +- playbooks/roles/slurm/tasks/common_pyxis.yml | 12 +- .../templates/{prolog.sh.j2 => pyxis.sh.j2} | 0 playbooks/roles/slurm/templates/slurm.conf.j2 | 4 +- playbooks/site.yml | 2 + schema.yaml | 8 + slurm_ha.tf | 6 +- variables.tf | 4 +- 22 files changed, 1146 insertions(+), 13 deletions(-) create mode 100644 playbooks/roles/healthchecks/files/check_h100_setup.py create mode 100644 playbooks/roles/healthchecks/files/gpu_bw_test.py create mode 100644 playbooks/roles/healthchecks/files/rdma_link_flapping.py create mode 100644 playbooks/roles/healthchecks/files/shared_logging.py create mode 100644 playbooks/roles/healthchecks/files/xid_checker.py create mode 100755 playbooks/roles/healthchecks/tasks/main.yml create mode 100644 playbooks/roles/slurm/files/healthchecks.sh rename playbooks/roles/slurm/templates/{prolog.sh.j2 => pyxis.sh.j2} (100%) diff --git a/autoscaling/tf_init/controller_update.tf b/autoscaling/tf_init/controller_update.tf index 5d58f76e..ec4ec5ac 100755 --- a/autoscaling/tf_init/controller_update.tf +++ b/autoscaling/tf_init/controller_update.tf @@ -77,7 +77,8 @@ resource "local_file" "inventory" { compute_username = var.compute_username, pam = var.pam, sacct_limits = var.sacct_limits, - use_compute_agent=var.use_compute_agent + use_compute_agent=var.use_compute_agent, + healthchecks=var.healthchecks }) filename = "${local.controller_path}/inventory" } diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 56c20cb9..24a2355d 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -73,4 +73,5 @@ pam = ${pam} sacct_limits=${sacct_limits} use_compute_agent=${use_compute_agent} zone_name=${zone_name} -dns_entries=${dns_entries} \ No newline at end of file +dns_entries=${dns_entries} +healthchecks=${healthchecks} \ No newline at end of file diff --git a/conf/variables.tpl b/conf/variables.tpl index 97fc9eb2..c8bde956 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -31,6 +31,7 @@ variable "private_subnet" {default = "##PRIVATE_SUBNET##"} variable "rdma_subnet" { default = "${rdma_subnet}" } variable "zone_name" {default = "${zone_name}"} variable "dns_entries" {default = "${dns_entries}"} +variable "healthchecks" {default = "${healthchecks}"} variable "slurm" { default = ${slurm} } variable "rack_aware" { default = ${rack_aware} } variable "pyxis" { default = ${pyxis} } diff --git a/controller.tf b/controller.tf index b215ab82..ad45196e 100644 --- a/controller.tf +++ b/controller.tf @@ -297,7 +297,8 @@ resource "null_resource" "cluster" { region = var.region, tenancy_ocid = var.tenancy_ocid, api_fingerprint = var.api_fingerprint, - api_user_ocid = var.api_user_ocid + api_user_ocid = var.api_user_ocid, + healthchecks = var.healthchecks }) destination = "/opt/oci-hpc/playbooks/inventory" @@ -445,7 +446,8 @@ resource "null_resource" "cluster" { virt_instr = var.virt_instr, access_ctrl = var.access_ctrl, numa_nodes_per_socket = var.numa_nodes_per_socket, - percentage_of_cores_enabled = var.percentage_of_cores_enabled + percentage_of_cores_enabled = var.percentage_of_cores_enabled, + healthchecks = var.healthchecks }) destination = "/opt/oci-hpc/conf/variables.tf" diff --git a/inventory.tpl b/inventory.tpl index f39e534e..c0824ecd 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -78,4 +78,5 @@ api_user_ocid = ${api_user_ocid} sacct_limits=${sacct_limits} 
use_compute_agent=${use_compute_agent} zone_name=${zone_name} -dns_entries=${dns_entries} \ No newline at end of file +dns_entries=${dns_entries} +healthchecks=${healthchecks} \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index b160873f..39efe46f 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -54,6 +54,8 @@ when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem + - include_role: + name: healthchecks - hosts: controller,slurm_backup,login,compute become: true diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 09be3ecd..47b7071b 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -52,6 +52,8 @@ when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem + - include_role: + name: healthchecks - hosts: controller,slurm_backup,login,compute become: true diff --git a/playbooks/roles/healthchecks/files/check_h100_setup.py b/playbooks/roles/healthchecks/files/check_h100_setup.py new file mode 100644 index 00000000..99b0498d --- /dev/null +++ b/playbooks/roles/healthchecks/files/check_h100_setup.py @@ -0,0 +1,507 @@ +#!/usr/bin/env python3 + +import subprocess +import re +import argparse +from datetime import datetime +from shared_logging import logger +from gpu_bw_test import BandwidthTest +from rdma_link_flapping import LinkFlappingTest +from xid_checker import XidChecker +import platform +import os +import sys + +def is_user_root(): + # Check if the user is root + if os.geteuid() != 0: + logger.debug("User is root") + return False + return True + +def get_oca_version(): + # Run the shell command + os_name = platform.system() + + + if os_name == 'Linux': + try: + distro = platform.linux_distribution()[0] + except: + import distro + distro = distro.name() + + if 'Ubuntu' in distro: + if not is_user_root(): + result = subprocess.run(['sudo', 'snap', 'info', 'oracle-cloud-agent'], stdout=subprocess.PIPE) + else: + result = subprocess.run(['snap', 'info', 'oracle-cloud-agent'], stdout=subprocess.PIPE) + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + + # Define the regular expression pattern for the version + pattern = r'installed:\s+(\d+\.\d+\.\d+)' + match = re.search(pattern, output) + if match: + version = match.group(1) + + elif 'Oracle' in distro: + result = subprocess.run(['rpm', '-qa'], stdout=subprocess.PIPE) + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + + # Define the regular expression pattern for the version + pattern = r'oracle-cloud-agent-(\d+\.\d+\.\d+)' + match = re.search(pattern, output) + if match: + version = match.group(1) + + + if version < "1.39.0": + logger.error(f"Oracle Cloud Agent: {version} needs to be updated to 1.38.0 or higher") + else: + logger.info(f"Oracle Cloud Agent: {version}") + + # Return the version + return version + +def check_rttcc_status(): + link_status = [] + devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + status = "disabled" + status_dict = {"devices": {}} + for device in devices: + if not is_user_root(): + command = ['sudo', 'mlxreg', '-d', device, '-y', '--get', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] + else: + command = ['mlxreg', '-d', device, '-y', '--set', 'cmd_type=3', 
'--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] + result = subprocess.run(command, stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + filtered_output = [line for line in output.split('\n') if line.startswith('value')] + for line in filtered_output: + logger.debug(line) + if "0x00000001" in line: + status_dict["devices"][device] = "enabled" + + for device in status_dict["devices"]: + if status_dict["devices"][device] == "enabled": + logger.warning(f"RTTCC enabled on {device}") + status = "enabled" + link_status.append(f"RTTCC enabled on: {device}") + else: + logger.info(f"RTTCC status for {device}: disabled") + if status == "disabled": + logger.info(f"RTTCC disabled check: Passed") + else: + logger.error(f"RTTCC disabled check: Failed") + + return link_status + +def check_ecc_errors(): + ecc_issues = [] + try: + # Run the nvidia-smi -q command + result = subprocess.run(['nvidia-smi', '-q'], stdout=subprocess.PIPE) + except FileNotFoundError: + logger.warning("Skipping SRAM/DRAM ECC Test: nvidia-smi command not found") + return [] + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + + # Find the lines containing "SRAM Correctable" and "DRAM Correctable" + sram_matches = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', output) + if len(sram_matches)==0: + sram_matches = re.findall(r'SRAM Uncorrectable SEC-DED\s+:\s+(\d+)', output) + dram_matches = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', output) + gpu_matches = re.findall(r'\nGPU\s+(.*)\n', output) + vol_sram_line = sram_matches[0::2] + vol_dram_line = dram_matches[0::2] + agg_sram_line = sram_matches[1::2] + agg_dram_line = dram_matches[1::2] + + for i, gpu in enumerate(gpu_matches): + logger.debug(f"GPU: {gpu}") + if vol_sram_line[i] != "0": + logger.debug(f"Volatile SRAM Uncorrectable: {vol_sram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Volatile SRAM Uncorrectable: {vol_sram_line[i]}") + if vol_dram_line[i] != "0": + logger.debug(f"Volatile DRAM Uncorrectable: {vol_dram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Volatile DRAM Uncorrectable: {vol_dram_line[i]}") + if agg_sram_line[i] != "0": + logger.debug(f"Aggregate SRAM Uncorrectable: {agg_sram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Aggregate SRAM Uncorrectable: {agg_sram_line[i]}") + if agg_dram_line[i] != "0": + logger.debug(f"Aggregate DRAM Uncorrectable: {agg_dram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Aggregate DRAM Uncorrectable: {agg_dram_line[i]}") + + + # Check if there are ecc_issues + if len(ecc_issues) == 0: + logger.info("GPU ECC Test: Passed") + else: + logger.warning("GPU ECC Test: Failed") + + return ecc_issues + +def check_row_remap_errors(): + remap_issues = [] + try: + # Run the nvidia-smi -q command + result = subprocess.run(['nvidia-smi', '--query-remapped-rows=remapped_rows.pending,remapped_rows.failure,remapped_rows.uncorrectable', '--format=csv,noheader'], stdout=subprocess.PIPE) + + if result.returncode != 0: + logger.debug(f"Check row remap command exited with error code: {result.returncode}") + + except FileNotFoundError: + logger.warning("Skipping Row Remap Test: nvidia-smi command not found") + return [] + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + logger.debug("Output: {}".format(output)) + for i, line in enumerate(output.split('\n')): + if line == "": + continue + tmp_data = line.split(",") + tmp_data = [x.strip() for x in tmp_data] + if tmp_data[0] != "0": + 
logger.debug(f"GPU: {i} - Row Remap Pending: {tmp_data[0]}") + remap_issues.append(f"GPU: {i} Row Remap Pending: {tmp_data[0]}") + if tmp_data[1] != "0": + logger.debug(f"GPU: {i} - Row Remap Failure: {tmp_data[1]}") + #remap_issues.append(f"GPU: {i} Row Remap Failure: {tmp_data[1]}") + if tmp_data[2] != "0": + logger.debug(f"GPU: {i} - Row Remap Uncorrectable: {tmp_data[2]}") + if int(tmp_data[2]) > 512: + remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable >512: {tmp_data[2]}") + else: + remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable <512: {tmp_data[2]}")# Check if there are ecc_issues + + if len(remap_issues) == 0: + logger.info("GPU Remap Test: Passed") + else: + logger.warning("GPU Remap Test: Failed") + + return remap_issues + +def check_rdma_link_status(): + status = True + devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + + link_issues = [] + for device in devices: + # Run the mlxlink command + if not is_user_root(): + command = ['sudo', 'mlxlink', '-d', device, '-m', '-c', '-e'] + else: + command = ['mlxlink', '-d', device, '-m', '-c', '-e'] + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + stderr = result.stderr.decode('utf-8') + + if stderr and stderr.find("-E-") != -1: + stderr = stderr.split("\n") + stderr_line = ", ".join(stderr) + logger.debug(f"{device}: {stderr_line}") + link_issues.append(f"{device}: {stderr[0]}") + status = "False" + continue + + # Find the line containing "Recommendation" + color_pattern = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') + link_state = re.search(r'\nState.*', output).group().split(":")[1].strip() + recommendation = re.search(r'Recommendation.*', output).group().split(":")[1].strip() + vendor_serial_num = re.search(r'Vendor Serial Number.*', output).group().split(":")[1].strip() + nic_fw_version = re.search(r'Firmware Version.*', output).group().split(":")[1].strip() + cable_fw_version = re.search(r'FW Version.*', output).group().split(":")[1].strip() + + # Remove hidden characters from the output + link_state = re.sub(color_pattern, '', link_state) + nic_fw_version = re.sub(color_pattern, '', nic_fw_version) + recommendation = re.sub(color_pattern, '', recommendation) + + logger.debug(f"{device}: {vendor_serial_num} - {cable_fw_version} - {nic_fw_version} - {link_state} - {recommendation}") + + # Extract the part after the ":" and print it along with the device name + if link_state != "Active": + logger.debug(f"{device}: {link_state}") + link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {link_state}") + status = False + if recommendation != "No issue was observed": + logger.debug(f"{device}: {recommendation}") + link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}") + status = False + else: + logger.debug(f"{device}: {recommendation}") + + if status: + logger.info(f"RDMA Link Status Check: Passed") + else: + logger.warning(f"RDMA Link Status Check: Failed") + return link_issues + +def get_host_serial(): + # Run the shell command + if not is_user_root(): + result = subprocess.run(['sudo', 'dmidecode', '-s', 'system-serial-number'], stdout=subprocess.PIPE) + else: + result = subprocess.run(['dmidecode', '-s', 'system-serial-number'], stdout=subprocess.PIPE) + + # Decode the 
output from bytes to string + output = result.stdout.decode('utf-8') + + # Return the serial number + return output.strip() + +def check_bus(): + # Check to see if any devices have fallen of the bus + command = ['lspci', '-v'] + result = subprocess.run(command, stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + lines = output.split('\n') + bus_issues = [] + for line in lines: + if line.find('(rev ff)') != -1: + bus_issues.append(line) + if len(bus_issues) > 0: + logger.error(f"Devices have fallen off the bus") + else: + logger.info("No devices have fallen off the bus") + if len(bus_issues) == 0: + logger.info("Bus Check Test: Passed") + return(bus_issues) + else: + logger.warning("Bus Check Test: Failed") + return(bus_issues) + +def check_gpu_count(): + + lspci_expected_results = [ '0f:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + '2d:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + '44:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + '5b:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + '89:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + 'a8:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + 'c0:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + 'd8:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)' + ] + + # Check the number of GPUs + try: + result = subprocess.run(['nvidia-smi', '--list-gpus'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + lines = output.split('\n') + tmp_results = [] + # remove empty lines + lines = [line for line in lines if line] + if len(lines) == 8: + logger.info("GPU Count Test: Passed") + else: + logger.warning("GPU Count Test: Failed") + tmp_results.append(f"Expected 8 GPUs, found {len(lines)} using nvidia-smi command") + return tmp_results + + except FileNotFoundError: + try: + # Check if lspci is available + result = subprocess.run(['lspci', '-v'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + + # Check if the expected results are in the output + lines = output.split('\n') + tmp_results = [] + missing_gpus = [] + for line in lines: + if line.find("NVIDIA") != -1 and line.find("2330") != -1: + tmp_results.append(line) + if not len(tmp_results) == 8: + logger.debug(f"Expected 8 GPUs, found {len(tmp_results)} in lspci output") + for line in lspci_expected_results: + if line not in tmp_results: + missing_gpus.append(f"Missing GPU: {line}") + if len(tmp_results) == 8: + logger.info("GPU Count Test: Passed") + else: + logger.warning("GPU Count Test: Failed") + return missing_gpus + except FileNotFoundError: + logger.warning("Skipping GPU count test: nvidia-smi and lspci commands not found") + return None + +def slurm_reason(message): + global slurm_drain_reason + global slurm_error_count + slurm_drain_reason+=(message+"\n") + slurm_error_count+=1 + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Check H100 setup') + parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level default: INFO") + parser.add_argument('--bw-test', dest='bw_test', action='store_true', default=False, help='Run GPU bandwidth test (default: False)') + parser.add_argument('--bw-test-exe', dest='bw_test_exe', help='Location to cuda-sampels bandwidthTest') + parser.add_argument('--lf-interval', dest='lf_interval', default=6, type=int, help='Link flapping interval with no flapping or link down events (default: 6 
(hours))') + parser.add_argument('-a','--all', dest='run_all', action='store_true', default=False, help='Run all checks (default: False)') + parser.add_argument('-slurm','--slurm', dest='slurm', action='store_true', default=False, help='Add a Slurm message') + args = parser.parse_args() + + logger.setLevel(args.log_level) + + datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') + logger.info(f"Started H100 setup check at: {datetime_str}") + try: + oca_version = get_oca_version() + except Exception as e: + logger.warning(f"Failed to get Oracle Cloud Agent version with error: {e}") + oca_version = "Unknown" + try: + rttcc_issues = check_rttcc_status() + except Exception as e: + logger.warning(f"Failed to check RTTCC status with error: {e}") + rttcc_issues = [] + + # Check for ECC errors + try: + ecc_issues = check_ecc_errors() + except Exception as e: + logger.warning(f"Failed to check ECC errors with error: {e}") + ecc_issues = [] + + # Check for row remap errors + try: + remap_results = check_row_remap_errors() + except Exception as e: + logger.warning(f"Failed to check row remap errors with error: {e}") + remap_results = [] + + # Check RDMA link status + try: + rdma_link_issues = check_rdma_link_status() + except Exception as e: + logger.warning(f"Failed to check RDMA link status with error: {e}") + rdma_link_issues = [] + + # Check for RDMA link flapping + try: + lft = LinkFlappingTest(time_interval=args.lf_interval) + lft.get_rdma_link_failures() + lft_issues = lft.process_rdma_link_flapping() + except Exception as e: + logger.warning(f"Failed to check RDMA link flapping with error: {e}") + lft_issues = {"failures": [], "link_down": []} + + # Check for GPU Xid errors + try: + xc = XidChecker() + xid_results = xc.check_gpu_xid() + except Exception as e: + logger.warning(f"Failed to check GPU Xid errors with error: {e}") + xid_results = {"status": "None", "results": {}} + + # Check GPU bandwidth + bwt_results = None + try: + if args.bw_test == True or args.run_all == True: + if args.bw_test_exe: + bwt = BandwidthTest(bw_test_exe=args.bw_test_exe) + else: + bwt = BandwidthTest() + bwt.measure_gpu_bw() + bwt_results = bwt.validate_results() + except Exception as e: + logger.warning(f"Failed to check GPU bandwidth with error: {e}") + bwt_results = None + + # Check the bus + try: + bus_results = check_bus() + except Exception as e: + logger.warning(f"Failed to check the bus with error: {e}") + bus_results = None + + # Check the number of GPUs + try: + gpu_results = check_gpu_count() + except Exception as e: + logger.warning(f"Failed to check the number of GPUs with error: {e}") + gpu_results = None + + # Summarize the results + try: + host_serial = get_host_serial() + except Exception as e: + logger.warning(f"Failed to get host serial number with error: {e}") + host_serial = "Unknown" + + slurm_drain_reason = "" + slurm_error_count = 0 + + logger.info(f"--------- Summary of H100 setup check for {host_serial} ---------") + if oca_version < "1.39.0": + logger.error(f"Oracle Cloud Agent: {oca_version} needs to be updated to 1.39.0 or higher") + slurm_reason("OCA version Error") + if len(rttcc_issues) > 0: + logger.error(f"RTTCC issues: {rttcc_issues}") + slurm_reason("RTTCC Error") + if len(ecc_issues) > 0: + ecc_error=False + for issue in ecc_issues: + if "Skipped" in issue: + logger.warning(f"{host_serial} - {issue}") + else: + if "Aggregate" in issue: + logger.warning(f"{host_serial} - ECC issues: {issue}") + else: + logger.error(f"{host_serial} - ECC issues: {issue}") + ecc_error=True + 
if ecc_error: + slurm_reason("ECC Error") + if len(remap_results) > 0: + remap_error=False + for issue in remap_results: + if "<512" in issue: + logger.warning(f"{host_serial} - {issue}") + else: + logger.error(f"{host_serial} - {issue}") + remap_error=True + if remap_error: + slurm_reason("Remap Error") + if xid_results["status"] == "Failed": + for xid in xid_results["results"]: + for pci in xid_results["results"][xid]["results"]: + logger.error(f"{host_serial} - GPU Xid {xid} device: {pci}, {xid_results['results'][xid]['description']}") + slurm_reason("XID Error") + if len(rdma_link_issues) > 0: + for issue in rdma_link_issues: + logger.error(f"{host_serial} - RDMA link issues: {issue}") + slurm_reason("RDMA Link Error") + if len(lft_issues["failures"]) > 0 or len(lft_issues["link_down"]) > 0: + if len(lft_issues["failures"]) > 0: + for issue in lft_issues["failures"]: + logger.error(f"{host_serial} - RDMA link flapping issues: {issue}") + slurm_reason("RDMA Link Flapping Error") + if len(lft_issues["link_down"]) > 0: + for issue in lft_issues["link_down"]: + logger.error(f"{host_serial} - RDMA link down issues: {issue}") + slurm_reason("RDMA Link Down Error") + if bwt_results != None: + if bwt_results["status"] == "Failed": + for issue in bwt_results["issues"]: + logger.error(f"{host_serial} - GPU bandwidth issues: {issue}") + slurm_reason("GPU Bwt Error") + if bus_results: + logger.error(f"{host_serial} - Bus issues: {bus_results}") + slurm_reason("GPU Bus Error") + if gpu_results: + logger.error(f"{host_serial} - Missing GPU(s): {gpu_results}") + slurm_reason("Missing GPU Error") + + datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') + logger.info(f"Finished H100 setup check at: {datetime_str}") + + if slurm_error_count > 0 and args.slurm: + print("Healthcheck:: "+slurm_drain_reason[:-1]) \ No newline at end of file diff --git a/playbooks/roles/healthchecks/files/gpu_bw_test.py b/playbooks/roles/healthchecks/files/gpu_bw_test.py new file mode 100644 index 00000000..369556c3 --- /dev/null +++ b/playbooks/roles/healthchecks/files/gpu_bw_test.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +import argparse +import subprocess +import os +import socket +import time +import json +from shared_logging import logger +import re + + +class BandwidthTest: + def __init__(self, iteration=1, size=32000000, bw_test_exe="/opt/oci-hpc/cuda-samples/bin/x86_64/linux/release/bandwidthTest"): + self.iteration = iteration + self.size = size + self.bw_test_exe = bw_test_exe + self.results = None + self.dtoh_threshold = 52.0 + self.htod_threshold = 52.0 + + def get_numa_nodes(self): + result = subprocess.run(['numactl', '-H'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + filtered_output = [line for line in output.split('\n') if line.startswith('available:')] + return int(filtered_output[0].split()[1].strip()) + + def get_gpus(self): + result = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + filtered_output = [line for line in output.split('\n') if line.startswith('GPU')] + return len(filtered_output) + + def measure_gpu_bw(self): + numas = 2 + gpus = 8 + iterations = 1 + size = "32000000" + + gpus = self.get_gpus() + numas = self.get_numa_nodes() + gpus_per_numa = gpus // numas + + logger.debug("GPUs: {}".format(gpus)) + logger.debug("NUMAs: {}".format(numas)) + logger.debug("GPUs per NUMA: {}".format(gpus_per_numa)) + + logger.debug("Iteration: Device: DtoH : HtoD") + hostname = socket.gethostname() + results = {"gpus": {}, 
"host": hostname} + + # Check if any processes are running on the GPUs before running the test + result = subprocess.run(["nvidia-smi", "-q", "-d", "PIDS"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + # Define the regular expression pattern for the GPU ID and the Processes + pattern = r'\nGPU\s(.*)\s+Processes\s+:\s+(.*)' + + # Find all matches in the output + matches = re.findall(pattern, result.stdout) + + # For each match, extract the GPU ID and the number of processes + gpu_idle_count = 0 + for match in matches: + gpu_id, processes = match + # If processes is 'None', set it to 0 + if processes == 'None': + gpu_idle_count += 1 + else: + logger.debug("GPU {} has processes running on it".format(gpu_id)) + + + logger.debug("GPU Idle Count: {}".format(gpu_idle_count)) + if gpu_idle_count != 8: + logger.error("GPU processes are running on the host. Please make sure no processes are running on the GPU before you re-test") + self.results = None + return self.results + + for i in range(iterations): + for device in range(gpus): + os.environ["CUDA_VISIBLE_DEVICES"] = str(device) + logger.debug("ENV: {}".format(os.environ["CUDA_VISIBLE_DEVICES"])) + logger.debug("Iteration: {} Device: {} gpus_per_numa: {}".format(i, device, gpus_per_numa)) + logger.debug("CMD: {}".format(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-dtoh"])) + result = subprocess.run(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-dtoh"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + logger.debug("Output: {}".format(result.stdout)) + logger.debug("Error: {}".format(result.stderr)) + if result.stdout.find(size) != -1: + result = result.stdout.split("\n") + tmp = [x for x in result if size in x] + tmp = tmp[0].split() + dtoh = float(tmp[1]) + + result = subprocess.run(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-htod"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + result = result.stdout.split("\n") + tmp = [x for x in result if size in x] + tmp = tmp[0].split() + htod = float(tmp[1]) + else: + dtoh = -1.0 + htod = -1.0 + + if device not in results["gpus"]: + results["gpus"][device] = {"dtoh": [dtoh], "htod": [htod]} + else: + results["gpus"][device]["dtoh"].append(dtoh) + results["gpus"][device]["htod"].append(htod) + + logger.debug(str(i) + " : " +str(device) + " : " + str(dtoh) + " : " + str(htod)) + + if i > 1 and i != iterations - 1: + # Sleep for 5 seconds and rerun + time.sleep(5) + + logger.debug(json.dumps(results)) + self.results = results + + def validate_results(self): + gpu_issues = {"status": "Passed", "issues": []} + if self.results == None: + gpu_issues["issues"].append("GPU bandwidth test did not run since processes are running on the GPU") + gpu_issues["status"] = "Failed" + return gpu_issues + status = True + for device in self.results["gpus"]: + dtoh = self.results["gpus"][device]["dtoh"] + htod = self.results["gpus"][device]["htod"] + dtoh_avg = sum(dtoh) / len(dtoh) + htod_avg = sum(htod) / len(htod) + logger.debug("Device: {} DtoH: {} HtoD: {}".format(device, dtoh_avg, htod_avg)) + if dtoh_avg < self.dtoh_threshold: + logger.debug("Device: {} DtoH: {} is below threshold: {}".format(device, dtoh_avg, self.dtoh_threshold)) + gpu_issues["issues"].append("Device: {} DtoH: {} is below threshold: {}".format(device, dtoh_avg, self.dtoh_threshold)) + 
gpu_issues["status"] = "Failed" + if htod_avg < self.htod_threshold: + logger.debug("Device: {} HtoD: {} is below threshold: {}".format(device, htod_avg, self.htod_threshold)) + gpu_issues["issues"].append("Device: {} HtoD: {} is below threshold: {}".format(device, htod_avg, self.htod_threshold)) + gpu_issues["status"] = "Failed" + if gpu_issues["status"] == "Passed": + logger.info("GPU bandwidth test passed") + return gpu_issues + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Run GPU bandwidth test') + parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level default: INFO") + parser.add_argument('-i', dest='iterations', default='1', help='Number of iterations to run Ex. -i 3') + parser.add_argument('-s', dest='size', default='32000000', help='Message size to run Ex. -s 32000000') + parser.add_argument('--bw-test-exe', dest='bw_test_exe', default='/opt/oci-hpc/cuda-samples/bin/x86_64/linux/release/bandwidthTest', help='Path to the bw_test executable') + args = parser.parse_args() + + logger.setLevel(args.log_level) + if args.iterations != 'NONE': + iterations = int(args.iterations) + if args.size != 'NONE': + size = args.size + if args.bw_test_exe != 'NONE': + bw_test_exe = args.bw_test_exe + + bwt = BandwidthTest(iteration=iterations, size=size, bw_test_exe=bw_test_exe) + bwt.measure_gpu_bw() + bwt_results = bwt.validate_results() + if bwt_results["status"] == "Passed": + logger.info("GPU bandwidth test passed") + else: + logger.error("GPU bandwidth test failed") + for issue in bwt_results["issues"]: + logger.error(issue) + diff --git a/playbooks/roles/healthchecks/files/rdma_link_flapping.py b/playbooks/roles/healthchecks/files/rdma_link_flapping.py new file mode 100644 index 00000000..425ec54e --- /dev/null +++ b/playbooks/roles/healthchecks/files/rdma_link_flapping.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +import os +import sys +import time +import datetime +import re +import argparse +import socket +import subprocess +from shared_logging import logger + + +class LinkFlappingTest: + def __init__(self, time_interval=6): + self.results = None + self.time_interval = int(time_interval) + self.link_data = None + + + # Check if the log file exists + msg_file = "/var/log/messages" + if not os.path.exists(msg_file): + msg_file = "/var/log/syslog" + self.log_file = msg_file + + def get_rdma_link_failures(self): + + pattern = r"(\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+\S+\s+wpa_supplicant(?:\[\d+\])?: (\w+): CTRL-EVENT-EAP-FAILURE EAP authentication failed" + pattern2 = r"(\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+\S+\s+kernel: (?:\[\d+\.\d+\]\s)?mlx5_core \S+ (\w+): Link down" + + self.link_data = {} + with open(self.log_file, "r") as f: + for line in f: + match = re.search(pattern, line) + if match: + time_str = match.group(1) + interface = match.group(2) + logger.debug(f"time: {time_str}, interface: {interface}") + if interface not in self.link_data: + self.link_data[interface] = {"failures": [time_str], "link_down": []} + else: + self.link_data[interface]["failures"].append(time_str) + + + match = re.search(pattern2, line) + if match: + time_str = match.group(1) + interface = match.group(2) + logger.debug(f"time: {time_str}, interface: {interface}") + if interface not in self.link_data: + self.link_data[interface] = {"failures": [], "link_down": [time_str]} + else: + self.link_data[interface]["link_down"].append(time_str) + + logger.debug("Link Data: 
{}".format(self.link_data)) + return self.link_data + + def process_rdma_link_flapping(self): + + link_issues = {"failures": [], "link_down": []} + + # Get the time stamp when the host came up + bootup_time = subprocess.run(['uptime', '-s'], stdout=subprocess.PIPE) + bootup_time = bootup_time.stdout.decode('utf-8').strip() + bootup_time_str = datetime.datetime.strptime(bootup_time, "%Y-%m-%d %H:%M:%S") + bootup_time_sec = int(time.mktime(bootup_time_str.timetuple())) + bootup_time_grace_period = bootup_time_sec + 1800 + + status = 0 + if len(self.link_data) >= 0: + current_date = datetime.datetime.now() + current_date_str = current_date.strftime("%Y-%b-%d %H:%M:%S") + current_date_sec = int(time.mktime(datetime.datetime.strptime(current_date_str, "%Y-%b-%d %H:%M:%S").timetuple())) + + link_failures = False + for interface in self.link_data: + if len(self.link_data[interface]["failures"]) > 0: + link_failures = True + logger.debug(f"{interface}: {len(self.link_data[interface]['failures'])} RDMA link failure entries in {self.log_file}") + logger.debug(f"{interface}: {self.link_data[interface]['failures']}") + last_date_failure_str = None + + if len(self.link_data[interface]["failures"]) > 0: + last_date_failure_str = self.link_data[interface]["failures"][-1] + last_date_failure = datetime.datetime.strptime(last_date_failure_str, "%b %d %H:%M:%S") + + # Compare the month of the last failure date with the current month + if last_date_failure.month > current_date.month: + # If the last failure month is greater than the current month, subtract one from the current year + last_date_failure = last_date_failure.replace(year=current_date.year - 1) + else: + # Otherwise, set the year of the last failure date to the current year + last_date_failure = last_date_failure.replace(year=current_date.year) + + # Convert the last failure date to seconds since the epoch + last_date_failure_sec = int(time.mktime(last_date_failure.timetuple())) + + if last_date_failure_str != None and last_date_failure_str != current_date_str: + diff_secs = current_date_sec - last_date_failure_sec + diff_hours = diff_secs // (60 * 60) + logger.debug(f"RDMA link ({interface}) failed {diff_hours} hours ago") + + logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") + if diff_hours < self.time_interval and current_date_sec > bootup_time_grace_period: + logger.debug(f"{interface}: one or more RDMA link flapping events within {self.time_interval} hours. 
Last flapping event: {last_date_failure_str})") + link_issues["failures"].append(f"{interface}: {len(self.link_data[interface]['failures'])}") + status = -1 + + for interface in self.link_data: + if len(self.link_data[interface]["link_down"]) > 0: + logger.debug(f"{interface}: {len(self.link_data[interface]['link_down'])} RDMA link down entries in {self.log_file}") + logger.debug(f"{interface}: {self.link_data[interface]['link_down']}") + last_date_down_str = None + + if len(self.link_data[interface]["link_down"]) > 0: + last_date_down_str = self.link_data[interface]["link_down"][-1] + last_date_down = datetime.datetime.strptime(last_date_down_str, "%b %d %H:%M:%S") + + # Compare the month of the last failure date with the current month + if last_date_down.month > current_date.month: + # If the last failure month is greater than the current month, subtract one from the current year + last_date_down = last_date_down.replace(year=current_date.year - 1) + else: + # Otherwise, set the year of the last failure date to the current year + last_date_down = last_date_down.replace(year=current_date.year) + + # Convert the last failure date to seconds since the epoch + last_date_down_sec = int(time.mktime(last_date_down.timetuple())) + + + if last_date_down_str != None and last_date_down_str != current_date_str: + diff_secs = current_date_sec - last_date_down_sec + diff_hours = diff_secs // (60 * 60) + logger.debug(f"RDMA link ({interface}) down {diff_hours} hours ago") + + logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") + if diff_hours < self.time_interval and current_date_sec > bootup_time_grace_period: + logger.debug(f"{interface}, one or more RDMA link down events within {self.time_interval} hours. 
Last link down event: {last_date_down_str}") + link_issues["link_down"].append(f"{interface}: {len(self.link_data[interface]['link_down'])}") + status = -2 + if status == -1: + logger.debug(f"One or more RDMA link flapping events within the past {self.time_interval} hours") + if status == -2: + logger.debug(f"One or more RDMA link down events within the past {self.time_interval} hours") + + else: + logger.info("No RDMA link failures entry in /var/log/messages") + if status == 0: + logger.info("RDMA link flapping/down test: Passed") + else: + logger.warning("RDMA link flapping/down test: Failed") + return link_issues + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Process RDMA link flapping data") + parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level") + args = parser.parse_args() + + logger.setLevel(args.log_level) + + auth_failure_file = "/tmp/last_auth_failure_date" + msg_file = "/var/log/messages" + if not os.path.exists(msg_file): + msg_file = "/var/log/syslog" + time_interval_hours = 6 + lft = LinkFlappingTest(time_interval=time_interval_hours) + link_data = lft.get_rdma_link_failures() + lft.process_rdma_link_flapping() diff --git a/playbooks/roles/healthchecks/files/shared_logging.py b/playbooks/roles/healthchecks/files/shared_logging.py new file mode 100644 index 00000000..af87bc2d --- /dev/null +++ b/playbooks/roles/healthchecks/files/shared_logging.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 + +import logging +logging.basicConfig(level="INFO", format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger('nhc') diff --git a/playbooks/roles/healthchecks/files/xid_checker.py b/playbooks/roles/healthchecks/files/xid_checker.py new file mode 100644 index 00000000..eaa6360b --- /dev/null +++ b/playbooks/roles/healthchecks/files/xid_checker.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 + +import argparse +from shared_logging import logger +import subprocess +import sys +import re + +class XidChecker: + def __init__(self, dmesg_cmd="dmesg", time_interval=60): + self.dmesg_cmd = dmesg_cmd + self.results = {} + + + # Check for the following GPU Xid errors in dmesg + self.XID_EC = { + "1": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "2": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "3": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "4": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "5": {"description": "Unused", "severity": "Critical"}, + "6": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "7": {"description": "Invalid or corrupted push buffer address", "severity": "Critical"}, + "8": {"description": "GPU stopped processing", "severity": "Critical"}, + "9": {"description": "Driver error programming GPU", "severity": "Critical"}, + "10": {"description": "Unused", "severity": "Critical"}, + "11": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "12": {"description": "Driver error handling GPU exception", "severity": "Critical"}, + "13": {"description": "Graphics Engine Exception", "severity": "Critical"}, + "14": {"description": "Unused", "severity": "Warn"}, + "15": {"description": "Unused", "severity": "Warn"}, + "16": {"description": "Display engine hung", "severity": "Warn"}, + "17": {"description": "Unused", 
"severity": "Warn"}, + "18": {"description": "Bus mastering disabled in PCI Config Space", "severity": "Warn"}, + "19": {"description": "Display Engine error", "severity": "Warn"}, + "20": {"description": "Invalid or corrupted Mpeg push buffer", "severity": "Warn"}, + "21": {"description": "Invalid or corrupted Motion Estimation push buffer", "severity": "Warn"}, + "22": {"description": "Invalid or corrupted Video Processor push buffer", "severity": "Warn"}, + "23": {"description": "Unused", "severity": "Warn"}, + "24": {"description": "GPU semaphore timeout", "severity": "Warn"}, + "25": {"description": "Invalid or illegal push buffer stream", "severity": "Warn"}, + "26": {"description": "Framebuffer timeout", "severity": "Warn"}, + "27": {"description": "Video processor exception", "severity": "Warn"}, + "28": {"description": "Video processor exception", "severity": "Warn"}, + "29": {"description": "Video processor exception", "severity": "Warn"}, + "30": {"description": "GPU semaphore access error", "severity": "Warn"}, + "31": {"description": "GPU memory page fault", "severity": "Critical"}, + "32": {"description": "Invalid or corrupted push buffer stream", "severity": "Warn"}, + "33": {"description": "Internal micro-controller error", "severity": "Warn"}, + "34": {"description": "Video processor exception", "severity": "Warn"}, + "35": {"description": "Video processor exception", "severity": "Warn"}, + "36": {"description": "Video processor exception", "severity": "Warn"}, + "37": {"description": "Driver firmware error", "severity": "Warn"}, + "38": {"description": "Driver firmware error", "severity": "Warn"}, + "39": {"description": "Unused", "severity": "Warn"}, + "40": {"description": "Unused", "severity": "Warn"}, + "41": {"description": "Unused", "severity": "Warn"}, + "42": {"description": "Video processor exception", "severity": "Warn"}, + "43": {"description": "GPU stopped processing", "severity": "Warn"}, + "44": {"description": "Graphics Engine fault during context switch", "severity": "Warn"}, + "45": {"description": "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE", "severity": "Warn"}, + "46": {"description": "GPU stopped processing", "severity": "Warn"}, + "47": {"description": "Video processor exception", "severity": "Warn"}, + "48": {"description": "Double Bit ECC Error", "severity": "Critical"}, + "49": {"description": "Unused", "severity": "Warn"}, + "50": {"description": "Unused", "severity": "Warn"}, + "51": {"description": "Unused", "severity": "Warn"}, + "52": {"description": "Unused", "severity": "Warn"}, + "53": {"description": "Unused", "severity": "Warn"}, + "54": {"description": "Auxiliary power is not connected to the GPU board", "severity": "Warn"}, + "55": {"description": "Unused", "severity": "Warn"}, + "56": {"description": "Display Engine error", "severity": "Critical"}, + "57": {"description": "Error programming video memory interface", "severity": "Critical"}, + "58": {"description": "Unstable video memory interface detected", "severity": "Critical"}, + "59": {"description": "Internal micro-controller error (older drivers)", "severity": "Warn"}, + "60": {"description": "Video processor exception", "severity": "Warn"}, + "61": {"description": "Internal micro-controller breakpoint/warning (newer drivers)", "severity": "Warn"}, + "62": {"description": "Internal micro-controller halt", "severity": "Critical"}, + "63": {"description": "ECC page retirement or row remapping recording 
event", "severity": "Critical"}, + "64": {"description": "ECC page retirement or row remapper recording failure", "severity": "Critical"}, + "65": {"description": "Video processor exception", "severity": "Critical"}, + "66": {"description": "Illegal access by driver", "severity": "Warn"}, + "67": {"description": "Illegal access by driver", "severity": "Warn"}, + "68": {"description": "NVDEC0 Exception", "severity": "Critical"}, + "69": {"description": "Graphics Engine class error", "severity": "Critical"}, + "70": {"description": "CE3: Unknown Error", "severity": "Warn"}, + "71": {"description": "CE4: Unknown Error", "severity": "Warn"}, + "72": {"description": "CE5: Unknown Error", "severity": "Warn"}, + "73": {"description": "NVENC2 Error", "severity": "Critical"}, + "74": {"description": "NVLINK Error", "severity": "Critical"}, + "75": {"description": "CE6: Unknown Error", "severity": "Warn"}, + "76": {"description": "CE7: Unknown Error", "severity": "Warn"}, + "77": {"description": "CE8: Unknown Error", "severity": "Warn"}, + "78": {"description": "vGPU Start Error", "severity": "Warn"}, + "79": {"description": "GPU has fallen off the bus", "severity": "Critical"}, + "80": {"description": "Corrupted data sent to GPU", "severity": "Critical"}, + "81": {"description": "VGA Subsystem Error", "severity": "Critical"}, + "82": {"description": "NVJPGO Error", "severity": "Warn"}, + "83": {"description": "NVDEC1 Error", "severity": "Warn"}, + "84": {"description": "NVDEC2 Error", "severity": "Warn"}, + "85": {"description": "CE9: Unknown Error", "severity": "Warn"}, + "86": {"description": "OFA Exception", "severity": "Warn"}, + "87": {"description": "Reserved", "severity": "Warn"}, + "88": {"description": "NVDEC3 Error", "severity": "Warn"}, + "89": {"description": "NVDEC4 Error", "severity": "Warn"}, + "90": {"description": "Reserved", "severity": "Warn"}, + "91": {"description": "Reserved", "severity": "Warn"}, + "92": {"description": "High single-bit ECC error rate", "severity": "Critical"}, + "93": {"description": "Non-fatal violation of provisioned InfoROM wear limit", "severity": "Warn"}, + "94": {"description": "Contained ECC error", "severity": "Critical"}, + "95": {"description": "Uncontained ECC error", "severity": "Critical"}, + "96": {"description": "NVDEC5 Error", "severity": "Warn"}, + "97": {"description": "NVDEC6 Error", "severity": "Warn"}, + "98": {"description": "NVDEC7 Error", "severity": "Warn"}, + "99": {"description": "NVJPG1 Error", "severity": "Warn"}, + "100": {"description": "NVJPG2 Error", "severity": "Warn"}, + "101": {"description": "NVJPG3 Error", "severity": "Warn"}, + "102": {"description": "NVJPG4 Error", "severity": "Warn"}, + "103": {"description": "NVJPG5 Error", "severity": "Warn"}, + "104": {"description": "NVJPG6 Error", "severity": "Warn"}, + "105": {"description": "NVJPG7 Error", "severity": "Warn"}, + "106": {"description": "SMBPBI Test Message", "severity": "Warn"}, + "107": {"description": "SMBPBI Test Message Silent", "severity": "Warn"}, + "108": {"description": "Reserved", "severity": "Warn"}, + "109": {"description": "Context Switch Timeout Error", "severity": "Critical"}, + "110": {"description": "Security Fault Error", "severity": "Warn"}, + "111": {"description": "Display Bundle Error Event", "severity": "Warn"}, + "112": {"description": "Display Supervisor Error", "severity": "Warn"}, + "113": {"description": "DP Link Training Error", "severity": "Warn"}, + "114": {"description": "Display Pipeline Underflow Error", "severity": "Warn"}, + 
"115": {"description": "Display Core Channel Error", "severity": "Warn"}, + "116": {"description": "Display Window Channel Error", "severity": "Warn"}, + "117": {"description": "Display Cursor Channel Error", "severity": "Warn"}, + "118": {"description": "Display Pixel Pipeline Error", "severity": "Warn"}, + "119": {"description": "GSP RPC Timeout", "severity": "Critical"}, + "120": {"description": "GSP Error", "severity": "Critical"}, + "121": {"description": "C2C Link Error", "severity": "Critical"}, + "122": {"description": "SPI PMU RPC Read Failure", "severity": "Warn"}, + "123": {"description": "SPI PMU RPC Write Failure", "severity": "Warn"}, + "124": {"description": "SPI PMU RPC Erase Failure", "severity": "Warn"}, + "125": {"description": "Inforom FS Failure", "severity": "Warn"}, + "126": {"description": "Reserved", "severity": "Warn"}, + "127": {"description": "Reserved", "severity": "Warn"}, + "128": {"description": "Reserved", "severity": "Warn"}, + "129": {"description": "Reserved", "severity": "Warn"}, + "130": {"description": "Reserved", "severity": "Warn"}, + "131": {"description": "Reserved", "severity": "Warn"}, + "132": {"description": "Reserved", "severity": "Warn"}, + "133": {"description": "Reserved", "severity": "Warn"}, + "134": {"description": "Reserved", "severity": "Warn"}, + "135": {"description": "Reserved", "severity": "Warn"}, + "136": {"description": "Reserved", "severity": "Warn"}, + "137": {"description": "Reserved", "severity": "Warn"}, + "138": {"description": "Reserved", "severity": "Warn"}, + "139": {"description": "Reserved", "severity": "Warn"}, + "140": {"description": "Unrecovered ECC Error", "severity": "Warn"}, + "141": {"description": "Reserved", "severity": "Warn"}, + "142": {"description": "Reserved", "severity": "Warn"}, + "143": {"description": "GPU Initialization Failure", "severity": "Warn"} + } + + def check_gpu_xid(self): + status = "Pass" + dmesg_output = subprocess.check_output([self.dmesg_cmd]).decode("utf-8") + if "NVRM: Xid" in dmesg_output: + for XID in self.XID_EC.keys(): + logger.debug(f"Checking for GPU Xid {XID} error in dmesg") + + matches = re.findall(f"NVRM: Xid \(PCI:(.*?): {XID},", dmesg_output) + tmp_dict = {} + for match in matches: + if match not in tmp_dict: + tmp_dict[match] = 1 + else: + tmp_dict[match] = tmp_dict[match] + 1 + for x in tmp_dict.keys(): + logger.info(f"{XID} : count: {tmp_dict[x]}, {self.XID_EC[XID]['description']} - PCI: {x}") + if not matches: + logger.debug(f"No GPU Xid {XID} error found in dmesg") + if tmp_dict != {}: + if self.XID_EC[XID]['severity'] == "Critical": + status = "Failed" + self.results[XID] = {"results": tmp_dict, "description": self.XID_EC[XID]['description']} + else: + logger.info("Xid Check: Passed") + return {"status": status, "results": self.results} + + +if __name__ == '__main__': + # Argument parsing + parser = argparse.ArgumentParser(description='Check for GPU Xid errors.') + parser.add_argument('--dmesg_cmd', default='dmesg', help='Dmesg file to check. 
Default is dmesg.') + args = parser.parse_args() + + + logger.debug(f"Using dmesg command: {args.dmesg_cmd}") + + xc = XidChecker(dmesg_cmd=args.dmesg_cmd) + results = xc.check_gpu_xid() + logger.debug("Status: {}, Results: {}".format(results["status"], results["results"])) diff --git a/playbooks/roles/healthchecks/tasks/main.yml b/playbooks/roles/healthchecks/tasks/main.yml new file mode 100755 index 00000000..7ed13754 --- /dev/null +++ b/playbooks/roles/healthchecks/tasks/main.yml @@ -0,0 +1,22 @@ +- name: Create systemd unit dirs + become: true + file: + name: '/opt/oci-hpc/healthchecks' + state: directory + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' + +- name: Copy files + become: true + copy: + src: '{{ item }}' + dest: '/opt/oci-hpc/healthchecks/{{ item }}' + force: no + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' + with_items: + - check_h100_setup.py + - gpu_bw_test.py + - rdma_link_flapping.py + - xid_checker.py + - shared_logging.py \ No newline at end of file diff --git a/playbooks/roles/slurm/files/healthchecks.sh b/playbooks/roles/slurm/files/healthchecks.sh new file mode 100644 index 00000000..a2a9cec0 --- /dev/null +++ b/playbooks/roles/slurm/files/healthchecks.sh @@ -0,0 +1,11 @@ +#!/bin/sh +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ "${shape}" = \"BM.GPU.H100.8\" ] +then + sudo python3 /opt/oci-hpc/healthchecks/check_h100_setup.py --slurm > /tmp/latest_healthcheck.log 2>&1 + DRAIN_MSG=`cat /tmp/latest_healthcheck.log | grep "Healthcheck::"` + if [ "$DRAIN_MSG" != "" ] + then + scontrol update nodename=`hostname` state=drain reason="${DRAIN_MSG}" + fi +fi diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 9f0d0729..755bc51f 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -192,4 +192,22 @@ - name: Include pyxis prolog files include_tasks: common_pmix.yml - when: ansible_os_family == 'RedHat' \ No newline at end of file + when: ansible_os_family == 'RedHat' + +- name: Ensure prolog directory exists + become: true + file: + path: "{{ slurm_conf_path }}/prolog.d" + state: directory + owner: root + group: root + when: healthchecks|bool + +- name: copy healthchecks + become: true + copy: + src: healthchecks.sh + dest: "{{ slurm_conf_path }}/prolog.d/healthchecks.sh" + owner: root + group: root + mode: '0755' \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/common_pyxis.yml b/playbooks/roles/slurm/tasks/common_pyxis.yml index a200ad54..dbad6c54 100644 --- a/playbooks/roles/slurm/tasks/common_pyxis.yml +++ b/playbooks/roles/slurm/tasks/common_pyxis.yml @@ -7,13 +7,21 @@ - set_fact: enroot_top_path_checked: "{{enroot_top_path}}" when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + +- name: Ensure prolog directory exists + become: true + file: + path: "{{ slurm_conf_path }}/prolog.d" + state: directory + owner: root + group: root - name: copy files become: true become_method: sudo template: - src: prolog.sh.j2 - dest: "{{ slurm_conf_path }}/prolog.sh" + src: pyxis.sh.j2 + dest: "{{ slurm_conf_path }}/prolog.d/pyxis.sh" owner: root group: root mode: 0755 diff --git a/playbooks/roles/slurm/templates/prolog.sh.j2 b/playbooks/roles/slurm/templates/pyxis.sh.j2 similarity index 100% rename from playbooks/roles/slurm/templates/prolog.sh.j2 rename to playbooks/roles/slurm/templates/pyxis.sh.j2 diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 
b/playbooks/roles/slurm/templates/slurm.conf.j2 index 3bb57372..19bf5502 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -12,8 +12,8 @@ SlurmdPidFile=/var/run/slurmd.pid SlurmdPort=6818 SlurmdSpoolDir=/var/spool/slurmd SlurmUser=slurm -{% if pyxis|bool %} -Prolog=/etc/slurm/prolog.sh +{% if pyxis|bool or healthchecks|bool%} +Prolog=/etc/slurm/prolog.d/* {% endif %} SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdLogFile=/var/log/slurm/slurmd.log diff --git a/playbooks/site.yml b/playbooks/site.yml index e869fd36..72cfcca0 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -64,6 +64,8 @@ when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem + - include_role: + name: healthchecks - hosts: controller become: true diff --git a/schema.yaml b/schema.yaml index e48577fd..033117fd 100755 --- a/schema.yaml +++ b/schema.yaml @@ -181,6 +181,7 @@ variableGroups: - ${pyxis} - ${pam} - ${sacct_limits} + - ${healthchecks} - title: "Hidden" variables: @@ -1196,6 +1197,13 @@ variables: description: "Enable Limits for the Slurm cluster When enabled, users will not be able to submit jobs if the right limits are not set" visible: ${slurm} + healthchecks: + type: boolean + title: "Turn on Healthchecks for GPU nodes" + default: true + description: "Will run tests on GPU nodes before starting a job. Nodes that are showing issues will be set in drain state" + visible: ${slurm} + monitoring: type: boolean title: "Install HPC Cluster Monitoring Tools" diff --git a/slurm_ha.tf b/slurm_ha.tf index 36dc60db..b6d9f72a 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -251,7 +251,8 @@ resource "null_resource" "cluster_backup" { region = var.region, tenancy_ocid = var.tenancy_ocid, api_fingerprint = var.api_fingerprint, - api_user_ocid = var.api_user_ocid + api_user_ocid = var.api_user_ocid, + healthchecks = var.healthchecks }) destination = "/opt/oci-hpc/playbooks/inventory" @@ -399,7 +400,8 @@ resource "null_resource" "cluster_backup" { virt_instr = var.virt_instr, access_ctrl = var.access_ctrl, numa_nodes_per_socket = var.numa_nodes_per_socket, - percentage_of_cores_enabled = var.percentage_of_cores_enabled + percentage_of_cores_enabled = var.percentage_of_cores_enabled, + healthchecks = var.healthchecks }) destination = "/opt/oci-hpc/conf/variables.tf" diff --git a/variables.tf b/variables.tf index 0cc7e5df..2dd46e16 100755 --- a/variables.tf +++ b/variables.tf @@ -261,7 +261,9 @@ variable "zone_name" { variable "dns_entries" { default = true } - +variable "healthchecks" { + default = true +} variable "BIOS" { default = false } From fef6533274904b494c8f4842c0fbab3171ed7fae Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:30:43 -0600 Subject: [PATCH 09/36] Add healthcheck every 5 minutes on all idle nodes --- playbooks/roles/slurm/templates/slurm.conf.j2 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 19bf5502..0ea9259f 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -46,7 +46,11 @@ TopologyPlugin=topology/tree TopologyParam=SwitchAsNodeRank TreeWidth=2048 SlurmctldParameters=enable_configless - +{% if healthchecks|bool %} +HealthCheckProgram=/etc/slurm/prolog.d/healthchecks.sh +HealthCheckInterval=300 
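For reference, the HealthCheckProgram wired up here is the healthchecks.sh prolog added earlier in this series: every 300 seconds Slurm runs it on eligible nodes, the script invokes the GPU setup checker (check_h100_setup.py at this point, renamed to check_gpu_setup.py later in the series) with --slurm, and the node is drained whenever the checker prints a "Healthcheck::" line. A minimal Python sketch of that run-and-drain flow, assuming the /opt/oci-hpc/healthchecks install path and the /tmp/latest_healthcheck.log file used by the shell script; the shape guard from healthchecks.sh is omitted, and this is an illustration, not part of the patch:

    #!/usr/bin/env python3
    # Sketch of the periodic healthcheck flow configured above; not part of the patch.
    import socket
    import subprocess

    CHECKER = "/opt/oci-hpc/healthchecks/check_gpu_setup.py"   # path after the later rename
    LOG = "/tmp/latest_healthcheck.log"                        # same log file as healthchecks.sh

    def run_healthcheck_and_maybe_drain():
        # Run the GPU setup check and keep its full output for later inspection.
        with open(LOG, "w") as log:
            subprocess.run(["sudo", "python3", CHECKER, "--slurm"],
                           stdout=log, stderr=subprocess.STDOUT)
        # The checker prints "Healthcheck:: <reasons>" only when the node should be drained.
        with open(LOG) as log:
            reasons = [line.strip() for line in log if line.startswith("Healthcheck::")]
        if reasons:
            subprocess.run(["scontrol", "update", "nodename=" + socket.gethostname(),
                            "state=drain", "reason=" + reasons[0]])

    if __name__ == "__main__":
        run_healthcheck_and_maybe_drain()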
+HealthCheckNodeState=NONDRAINED_IDLE,CYCLE +{% endif %} {% if sacct_limits|bool %} AccountingStorageTRES=gres/gpu AccountingStorageEnforce=limits,associations,qos,safe From e3418121958c48b16ce3dcd7ac4bd1e7dc620fe2 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:01:26 -0600 Subject: [PATCH 10/36] Fix SRAM errors on drivers 535.161.07 and above --- playbooks/roles/healthchecks/files/check_h100_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/healthchecks/files/check_h100_setup.py b/playbooks/roles/healthchecks/files/check_h100_setup.py index 99b0498d..408e9f8a 100644 --- a/playbooks/roles/healthchecks/files/check_h100_setup.py +++ b/playbooks/roles/healthchecks/files/check_h100_setup.py @@ -114,7 +114,7 @@ def check_ecc_errors(): # Find the lines containing "SRAM Correctable" and "DRAM Correctable" sram_matches = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', output) if len(sram_matches)==0: - sram_matches = re.findall(r'SRAM Uncorrectable SEC-DED\s+:\s+(\d+)', output) + sram_matches = re.findall(r'SRAM Uncorrectable Parity\s+:\s+(\d+)', output) dram_matches = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', output) gpu_matches = re.findall(r'\nGPU\s+(.*)\n', output) vol_sram_line = sram_matches[0::2] From 90a1fb063f83721796324f444678009749f8bd0e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:42:40 -0600 Subject: [PATCH 11/36] Remove old images versions --- conf/variables.tpl | 8 +------- variables.tf | 6 +----- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/conf/variables.tpl b/conf/variables.tpl index c8bde956..d8b0047f 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -53,10 +53,6 @@ variable "marketplace_listing" { variable "marketplace_version_id" { type = map(string) default = { - "1" = "OL7.9-OFED5.3-1.0.0.1-RHCK-20210607" - "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" - "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" - "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" "GPU_OL7_CUDA12.2" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" @@ -69,9 +65,7 @@ variable "marketplace_version_id" { # To find the Appcatalog OCID, run # oci compute pic listing list --display-name "Oracle Linux 7 - HPC Cluster Networking Image" -variable "old_marketplace_listing_id" { - default = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa" -} + variable "marketplace_listing_id_HPC" { default = "ocid1.appcataloglisting.oc1..aaaaaaaahz2xiwfcsbebmqg7sp6lhdt6r2vsjro5jfukkl5cntlqvfhkbzaq" } diff --git a/variables.tf b/variables.tf index 2dd46e16..c6067f23 100755 --- a/variables.tf +++ b/variables.tf @@ -86,11 +86,7 @@ variable "marketplace_listing" { } variable "marketplace_version_id" { type = map(string) - default = { - "1" = "OL7.9-OFED5.3-1.0.0.1-RHCK-20210607" - "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" - "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" - "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" + default = { "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" "GPU_OL7_CUDA12.2" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" From 936e11e36b71fa10e2d2011eaa411443c46259c3 Mon Sep 17 
00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:44:21 -0600 Subject: [PATCH 12/36] Fix RTTCC Check on A100 Blocks --- playbooks/roles/healthchecks/files/check_h100_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/healthchecks/files/check_h100_setup.py b/playbooks/roles/healthchecks/files/check_h100_setup.py index 408e9f8a..d61e8b6e 100644 --- a/playbooks/roles/healthchecks/files/check_h100_setup.py +++ b/playbooks/roles/healthchecks/files/check_h100_setup.py @@ -77,7 +77,7 @@ def check_rttcc_status(): command = ['sudo', 'mlxreg', '-d', device, '-y', '--get', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] else: command = ['mlxreg', '-d', device, '-y', '--set', 'cmd_type=3', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] - result = subprocess.run(command, stdout=subprocess.PIPE) + result = subprocess.run(command, stdout=subprocess.PIPE,stderr=subprocess.PIPE) output = result.stdout.decode('utf-8') filtered_output = [line for line in output.split('\n') if line.startswith('value')] for line in filtered_output: From 42fc627cd72ae82c3aefca5de2f98dabf7dea163 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:49:14 -0600 Subject: [PATCH 13/36] Make healthchecks check valid for A100 --- .../files/{check_h100_setup.py => check_gpu_setup.py} | 6 +++--- playbooks/roles/healthchecks/tasks/main.yml | 2 +- playbooks/roles/slurm/files/healthchecks.sh | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) rename playbooks/roles/healthchecks/files/{check_h100_setup.py => check_gpu_setup.py} (99%) diff --git a/playbooks/roles/healthchecks/files/check_h100_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py similarity index 99% rename from playbooks/roles/healthchecks/files/check_h100_setup.py rename to playbooks/roles/healthchecks/files/check_gpu_setup.py index d61e8b6e..eb904a9f 100644 --- a/playbooks/roles/healthchecks/files/check_h100_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -340,7 +340,7 @@ def slurm_reason(message): slurm_error_count+=1 if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Check H100 setup') + parser = argparse.ArgumentParser(description='Check Host setup') parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level default: INFO") parser.add_argument('--bw-test', dest='bw_test', action='store_true', default=False, help='Run GPU bandwidth test (default: False)') parser.add_argument('--bw-test-exe', dest='bw_test_exe', help='Location to cuda-sampels bandwidthTest') @@ -352,7 +352,7 @@ def slurm_reason(message): logger.setLevel(args.log_level) datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') - logger.info(f"Started H100 setup check at: {datetime_str}") + logger.info(f"Started GPU host setup check at: {datetime_str}") try: oca_version = get_oca_version() except Exception as e: @@ -440,7 +440,7 @@ def slurm_reason(message): slurm_drain_reason = "" slurm_error_count = 0 - logger.info(f"--------- Summary of H100 setup check for {host_serial} ---------") + logger.info(f"--------- Summary of Host setup check for {host_serial} ---------") if oca_version < "1.39.0": logger.error(f"Oracle Cloud Agent: {oca_version} needs to be updated to 1.39.0 or higher") 
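The summary section shown here is what feeds that prolog: each failed check calls slurm_reason(), which records a short reason and increments slurm_error_count, and a single "Healthcheck:: ..." line is printed at the end when --slurm is passed. A compact sketch of that accumulation pattern; the separator character is an assumption, since the script only shows that the final character is stripped with [:-1]:

    # Sketch of the drain-reason accumulation used by the summary section; not part of the patch.
    slurm_drain_reason = ""
    slurm_error_count = 0

    def slurm_reason(message):
        # Each failed check records a short reason and bumps the error counter.
        global slurm_drain_reason, slurm_error_count
        slurm_drain_reason += message + ","   # separator assumed; the script strips the last char
        slurm_error_count += 1

    # Hypothetical failures, mirroring the reason strings used in this file.
    slurm_reason("OCA version Error")
    slurm_reason("Missing GPU Error")

    if slurm_error_count > 0:
        # This is the line healthchecks.sh greps for before draining the node.
        print("Healthcheck:: " + slurm_drain_reason[:-1])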
slurm_reason("OCA version Error") diff --git a/playbooks/roles/healthchecks/tasks/main.yml b/playbooks/roles/healthchecks/tasks/main.yml index 7ed13754..d42e3997 100755 --- a/playbooks/roles/healthchecks/tasks/main.yml +++ b/playbooks/roles/healthchecks/tasks/main.yml @@ -15,7 +15,7 @@ owner: '{{ ansible_user }}' group: '{{ ansible_user }}' with_items: - - check_h100_setup.py + - check_gpu_setup.py - gpu_bw_test.py - rdma_link_flapping.py - xid_checker.py diff --git a/playbooks/roles/slurm/files/healthchecks.sh b/playbooks/roles/slurm/files/healthchecks.sh index a2a9cec0..d54dd837 100644 --- a/playbooks/roles/slurm/files/healthchecks.sh +++ b/playbooks/roles/slurm/files/healthchecks.sh @@ -1,8 +1,8 @@ #!/bin/sh shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` -if [ "${shape}" = \"BM.GPU.H100.8\" ] +if [ "${shape}" = \"BM.GPU.H100.8\" ] || [ "${shape}" == \"BM.GPU.A100-v2.8\" ] || [ "${shape}" == \"BM.GPU4.8\" ] || [ "${shape}" == \"BM.GPU.B4.8\" ] then - sudo python3 /opt/oci-hpc/healthchecks/check_h100_setup.py --slurm > /tmp/latest_healthcheck.log 2>&1 + sudo python3 /opt/oci-hpc/healthchecks/check_gpu_setup.py --slurm > /tmp/latest_healthcheck.log 2>&1 DRAIN_MSG=`cat /tmp/latest_healthcheck.log | grep "Healthcheck::"` if [ "$DRAIN_MSG" != "" ] then From c32e9064c3688b974b70edbc7c641758e36c7edb Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:33:21 -0600 Subject: [PATCH 14/36] Check the right device per shape --- .../healthchecks/files/check_gpu_setup.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index eb904a9f..1aac93a3 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -10,7 +10,15 @@ from xid_checker import XidChecker import platform import os -import sys +import requests + +def get_metadata(): + """ Make a request to metadata endpoint """ + headers = { 'Authorization' : 'Bearer Oracle' } + metadata_url = "http://169.254.169.254/opc/" + metadata_ver = "2" + request_url = metadata_url + "v" + metadata_ver + "/instance/" + return requests.get(request_url, headers=headers).json() def is_user_root(): # Check if the user is root @@ -189,8 +197,14 @@ def check_row_remap_errors(): def check_rdma_link_status(): status = True - devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] - + metadata=get_metadata() + shape=metadata['shape'] + if shape == "BM.GPU.H100.8": + devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + elif shape == "BM.GPU.B4.8" or shape == "BM.GPU.A100-v2.8": + devices = ["mlx5_1", "mlx5_2", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + elif shape == "BM.GPU.4.8": + devices = ["mlx5_0", "mlx5_1", "mlx5_2", "mlx5_3", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] link_issues = [] for device in devices: # Run the mlxlink command @@ -501,7 +515,7 @@ def slurm_reason(message): 
slurm_reason("Missing GPU Error") datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') - logger.info(f"Finished H100 setup check at: {datetime_str}") + logger.info(f"Finished GPU host setup check at: {datetime_str}") if slurm_error_count > 0 and args.slurm: print("Healthcheck:: "+slurm_drain_reason[:-1]) \ No newline at end of file From b682f68789b9b9435b1ef264c43fa464a7962b5f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:21:57 -0600 Subject: [PATCH 15/36] Correct GPU4.8 Shape in Healthcheck --- playbooks/roles/healthchecks/files/check_gpu_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index 1aac93a3..a57703e4 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -203,7 +203,7 @@ def check_rdma_link_status(): devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] elif shape == "BM.GPU.B4.8" or shape == "BM.GPU.A100-v2.8": devices = ["mlx5_1", "mlx5_2", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] - elif shape == "BM.GPU.4.8": + elif shape == "BM.GPU4.8": devices = ["mlx5_0", "mlx5_1", "mlx5_2", "mlx5_3", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] link_issues = [] for device in devices: From e01daae32b3c14823a3c368ca381f46c6b04d428 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:55:19 -0600 Subject: [PATCH 16/36] Fix link flapping if the flapping happened close to boot --- playbooks/roles/healthchecks/files/rdma_link_flapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/healthchecks/files/rdma_link_flapping.py b/playbooks/roles/healthchecks/files/rdma_link_flapping.py index 425ec54e..2b4b8e8a 100644 --- a/playbooks/roles/healthchecks/files/rdma_link_flapping.py +++ b/playbooks/roles/healthchecks/files/rdma_link_flapping.py @@ -102,7 +102,7 @@ def process_rdma_link_flapping(self): logger.debug(f"RDMA link ({interface}) failed {diff_hours} hours ago") logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") - if diff_hours < self.time_interval and current_date_sec > bootup_time_grace_period: + if diff_hours < self.time_interval and last_date_failure_sec > bootup_time_grace_period: logger.debug(f"{interface}: one or more RDMA link flapping events within {self.time_interval} hours. 
Last flapping event: {last_date_failure_str})") link_issues["failures"].append(f"{interface}: {len(self.link_data[interface]['failures'])}") status = -1 @@ -135,7 +135,7 @@ def process_rdma_link_flapping(self): logger.debug(f"RDMA link ({interface}) down {diff_hours} hours ago") logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") - if diff_hours < self.time_interval and current_date_sec > bootup_time_grace_period: + if diff_hours < self.time_interval and last_date_down_sec > bootup_time_grace_period: logger.debug(f"{interface}, one or more RDMA link down events within {self.time_interval} hours. Last link down event: {last_date_down_str}") link_issues["link_down"].append(f"{interface}: {len(self.link_data[interface]['link_down'])}") status = -2 From 4fd28977bee116fb6f0bfc6e3a7aba001272e901 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Sat, 13 Apr 2024 00:27:40 -0600 Subject: [PATCH 17/36] Update NCCL tuning parameters --- samples/gpu/nccl_run_allreduce_H100.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/gpu/nccl_run_allreduce_H100.sh b/samples/gpu/nccl_run_allreduce_H100.sh index 2fd714ea..1113978b 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sh +++ b/samples/gpu/nccl_run_allreduce_H100.sh @@ -57,7 +57,7 @@ do --bind-to numa \ -npernode 8 \ --mca coll ^hcoll \ - -x NCCL_CROSS_NIC=1 \ + -x NCCL_CROSS_NIC=2 \ -x NCCL_DEBUG=WARN \ -x NCCL_CUMEM_ENABLE=0 \ -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ @@ -78,6 +78,7 @@ do -x NCCL_IGNORE_CPU_AFFINITY=1 \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ -x NCCL_TOPO_FILE=~/H100-topology.xml \ + -x NCCL_MIN_NCHANNELS=32 \ --np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile tail -n 32 $logfile From ec221f416469020b7201147c11b6e896c5ba72f1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:04:20 -0600 Subject: [PATCH 18/36] Add root check --- playbooks/roles/healthchecks/files/xid_checker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/playbooks/roles/healthchecks/files/xid_checker.py b/playbooks/roles/healthchecks/files/xid_checker.py index eaa6360b..3c46f36a 100644 --- a/playbooks/roles/healthchecks/files/xid_checker.py +++ b/playbooks/roles/healthchecks/files/xid_checker.py @@ -5,9 +5,14 @@ import subprocess import sys import re +import os class XidChecker: def __init__(self, dmesg_cmd="dmesg", time_interval=60): + # if user is root + if not os.geteuid() == 0: + logger.info("The XidChecker script did not run since it must be run as root") + sys.exit(1) self.dmesg_cmd = dmesg_cmd self.results = {} From bf9af6e40a9fb8665046a5c51e82adaf73bf5f71 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 16 Apr 2024 14:45:40 -0600 Subject: [PATCH 19/36] Update to latest version --- autoscaling/tf_init/versions.tf | 2 +- versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 28a169ec..ec66572c 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "5.34.0" + version = "5.37.0" } } } \ No newline at end of file diff --git a/versions.tf 
b/versions.tf index 28a169ec..ec66572c 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "5.34.0" + version = "5.37.0" } } } \ No newline at end of file From 6bdf4d404ef365ec3b5470be133cf703cddd8031 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:50:17 -0600 Subject: [PATCH 20/36] Check Physical Error in case of bad signal integrity --- playbooks/roles/healthchecks/files/check_gpu_setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index a57703e4..35663e3d 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -233,7 +233,7 @@ def check_rdma_link_status(): vendor_serial_num = re.search(r'Vendor Serial Number.*', output).group().split(":")[1].strip() nic_fw_version = re.search(r'Firmware Version.*', output).group().split(":")[1].strip() cable_fw_version = re.search(r'FW Version.*', output).group().split(":")[1].strip() - + physical_BER = re.search(r'Raw Physical BER.*', output).group().split(":")[1].strip() # Remove hidden characters from the output link_state = re.sub(color_pattern, '', link_state) nic_fw_version = re.sub(color_pattern, '', nic_fw_version) @@ -248,8 +248,12 @@ def check_rdma_link_status(): status = False if recommendation != "No issue was observed": logger.debug(f"{device}: {recommendation}") - link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}") - status = False + if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-09: + logger.debug(f"Recommandation is {recommendation} but the Physical error are low enough that it can be ignored") + else : + logger.debug(f"Recommandation is {recommendation} and the Physical error count is too high to be ignored: {physical_BER}") + link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}") + status = False else: logger.debug(f"{device}: {recommendation}") From c929c9b47b466be13af8ca51c2fa0efdbe085988 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:58:43 -0600 Subject: [PATCH 21/36] Change default H100 values --- samples/gpu/nccl_run_allreduce_H100.sbatch | 16 +++++++++++----- samples/gpu/nccl_run_allreduce_H100.sh | 14 ++++++++++---- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/samples/gpu/nccl_run_allreduce_H100.sbatch b/samples/gpu/nccl_run_allreduce_H100.sbatch index efef481e..333da972 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sbatch +++ b/samples/gpu/nccl_run_allreduce_H100.sbatch @@ -43,11 +43,11 @@ fi --bind-to numa \ -npernode 8 \ --mca coll ^hcoll \ - -x NCCL_CROSS_NIC=1 \ + -x NCCL_CROSS_NIC=2 \ -x NCCL_DEBUG=WARN \ -x NCCL_CUMEM_ENABLE=0 \ -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ - -x NCCL_IB_QPS_PER_CONNECTION=16 \ + -x NCCL_IB_QPS_PER_CONNECTION=1 \ -x NCCL_IB_GID_INDEX=3 \ -x NCCL_IB_TC=41 \ -x NCCL_IB_SL=0 \ @@ -59,11 +59,17 @@ fi -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ -x RX_QUEUE_LEN=8192 \ -x IB_RX_QUEUE_LEN=8192 \ - -x NCCL_BUFFSIZE=16777216 \ - -x NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_SOCKET_IFNAME=${var_UCX_NET_DEVICES} \ -x NCCL_IGNORE_CPU_AFFINITY=1 \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ --np 
$((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 # If NCCL version is lower than 2.20.3, it is recommended to use the topology filefor optimal performances - # -x NCCL_TOPO_FILE=~/H100-topology.xml \ \ No newline at end of file + # -x NCCL_TOPO_FILE=~/H100-topology.xml \ + + # If NCCL version is lower than 2.20.3, it is recommended to use + # -x NCCL_CROSS_NIC=0 for multiple subnets and large scale jobs (>16 nodes) + # -x NCCL_CROSS_NIC=1 for single subnets and small scale jobs (<16 nodes) + + # If NCCL version is higher than 2.20.3, the absolute max NCCL throughput at large message size will be obtained with + # -x NCCL_MIN_NCHANNELS=32 \ But it does take some processing power away from the GPU for networking gains and is not recommended while running jobs. \ No newline at end of file diff --git a/samples/gpu/nccl_run_allreduce_H100.sh b/samples/gpu/nccl_run_allreduce_H100.sh index 1113978b..56207fc7 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sh +++ b/samples/gpu/nccl_run_allreduce_H100.sh @@ -61,7 +61,7 @@ do -x NCCL_DEBUG=WARN \ -x NCCL_CUMEM_ENABLE=0 \ -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ - -x NCCL_IB_QPS_PER_CONNECTION=16 \ + -x NCCL_IB_QPS_PER_CONNECTION=1 \ -x NCCL_IB_GID_INDEX=3 \ -x NCCL_IB_TC=41 \ -x NCCL_IB_SL=0 \ @@ -74,11 +74,10 @@ do -x RX_QUEUE_LEN=8192 \ -x IB_RX_QUEUE_LEN=8192 \ -x NCCL_BUFFSIZE=16777216 \ - -x NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_SOCKET_IFNAME=${var_UCX_NET_DEVICES} \ -x NCCL_IGNORE_CPU_AFFINITY=1 \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ -x NCCL_TOPO_FILE=~/H100-topology.xml \ - -x NCCL_MIN_NCHANNELS=32 \ --np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile tail -n 32 $logfile @@ -87,4 +86,11 @@ done # If NCCL version is lower than 2.20.3, it is recommended to use the topology filefor optimal performances - # -x NCCL_TOPO_FILE=~/H100-topology.xml \ \ No newline at end of file + # -x NCCL_TOPO_FILE=~/H100-topology.xml \ + + # If NCCL version is lower than 2.20.3, it is recommended to use + # -x NCCL_CROSS_NIC=0 for multiple subnets and large scale jobs (>16 nodes) + # -x NCCL_CROSS_NIC=1 for single subnets and small scale jobs (<16 nodes) + + # If NCCL version is higher than 2.20.3, the absolute max NCCL throughput at large message size will be obtained with + # -x NCCL_MIN_NCHANNELS=32 \ But it does take some processing power away from the GPU for networking gains and is not recommended while running jobs. 
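A small sketch of how the version-dependent recommendations in the comments above can be applied programmatically, reusing the ldconfig-based NCCL version detection that the tuner samples later in this series rely on. The exact threshold handling and the single-subnet default are illustrative assumptions, not part of the patch:

    # Sketch only: pick NCCL_CROSS_NIC from the installed NCCL version, following the
    # guidance in the comments above. Not part of the patch.
    import re
    import subprocess

    def nccl_version():
        # Same idea as the ldconfig / grep libnccl.so / sed pipeline in the tuner scripts.
        proc = subprocess.run(["ldconfig", "-v"], stdout=subprocess.PIPE,
                              stderr=subprocess.DEVNULL, text=True)
        found = re.findall(r"libnccl\.so\.(\d+(?:\.\d+)+)", proc.stdout)
        return tuple(int(x) for x in found[-1].split(".")) if found else None

    def pick_cross_nic(version, nodes, single_subnet=True):
        if version is not None and version >= (2, 20, 3):
            return "2"   # newer NCCL: matches the NCCL_CROSS_NIC=2 default set in these samples
        if single_subnet and nodes < 16:
            return "1"   # older NCCL, single subnet, small job
        return "0"       # older NCCL, multiple subnets or large scale

    ver = nccl_version()
    print("NCCL:", ver, "-> NCCL_CROSS_NIC=" + pick_cross_nic(ver, nodes=8))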
\ No newline at end of file From b5cd23bc05543bdfbe26ce4e3230184ccc88c721 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:10:18 -0600 Subject: [PATCH 22/36] Remove duplicate spack --- playbooks/resize_add.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 47b7071b..8a599590 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -113,9 +113,6 @@ - include_role: name: sssd when: ldap|default(true)|bool - - include_role: - name: spack - when: spack|default(false)|bool - hosts: compute_to_add become: true From 3b3624b46144f16af57112c6a7e75325489e1531 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 1 May 2024 10:13:41 -0600 Subject: [PATCH 23/36] Change the BER limits to 10E-7 --- playbooks/roles/healthchecks/files/check_gpu_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index 35663e3d..ed9132c0 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -248,7 +248,7 @@ def check_rdma_link_status(): status = False if recommendation != "No issue was observed": logger.debug(f"{device}: {recommendation}") - if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-09: + if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-07: logger.debug(f"Recommandation is {recommendation} but the Physical error are low enough that it can be ignored") else : logger.debug(f"Recommandation is {recommendation} and the Physical error count is too high to be ignored: {physical_BER}") From 9d8050a6e59288b02b467bdac9e7ca5ac174bbb8 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 1 May 2024 10:14:16 -0600 Subject: [PATCH 24/36] Change default NCCL.conf for H100 --- playbooks/roles/nccl-conf/files/h100 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/nccl-conf/files/h100 b/playbooks/roles/nccl-conf/files/h100 index d199d0fb..9fcc8296 100644 --- a/playbooks/roles/nccl-conf/files/h100 +++ b/playbooks/roles/nccl-conf/files/h100 @@ -1,8 +1,8 @@ -NCCL_CROSS_NIC=1 +NCCL_CROSS_NIC=2 NCCL_DEBUG=WARN NCCL_CUMEM_ENABLE=0 NCCL_IB_SPLIT_DATA_ON_QPS=0 -NCCL_IB_QPS_PER_CONNECTION=16 +NCCL_IB_QPS_PER_CONNECTION=1 NCCL_IB_GID_INDEX=3 NCCL_IB_TC=41 NCCL_IB_SL=0 From 7f499aeca614c81269000e9d00e798145d659f16 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 1 May 2024 10:14:44 -0600 Subject: [PATCH 25/36] Don't include Tuner in the stack --- playbooks/roles/nccl-conf/tasks/main.yml | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/playbooks/roles/nccl-conf/tasks/main.yml b/playbooks/roles/nccl-conf/tasks/main.yml index e4b6aed4..88c9dc36 100644 --- a/playbooks/roles/nccl-conf/tasks/main.yml +++ b/playbooks/roles/nccl-conf/tasks/main.yml @@ -33,24 +33,4 @@ owner: root group: root mode: '0644' - when: shape_nccl.stdout == '"BM.GPU4.8"' - -- name: copy libnccl-ocituner for OL - become: true - get_url: - url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-OL - dest: 
/home/opc/libnccl-ocituner.so.1.0.1 - owner: opc - group: privilege - mode: '0775' - when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_distribution == 'OracleLinux' - -- name: copy libnccl-ocituner for Ubuntu - become: true - get_url: - url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-ubuntu - dest: /home/ubuntu/libnccl-ocituner.so.1.0.1 - owner: ubuntu - group: privilege - mode: '0775' - when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_os_family == 'Debian' \ No newline at end of file + when: shape_nccl.stdout == '"BM.GPU4.8"' \ No newline at end of file From 03292f69cc3a65ae0b50fa868a1c46d81fa50b11 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Fri, 3 May 2024 09:32:04 -0600 Subject: [PATCH 26/36] Make warning message more explicit for BIOS changes --- schema.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index 033117fd..66946082 100755 --- a/schema.yaml +++ b/schema.yaml @@ -837,7 +837,8 @@ variables: BIOS: title: "Modify BIOS options" - description: "Make sure that the BIOS options are changeable for the specific shape selected" + description: "WARNING : Do NOT change those if you have not tested the changes on a single instance. Error will be \"Shape does not support the provided platform +configuration\" " type: boolean default: false visible: true From f08d5d9e28b49f0e4349f6a09fc301a7795d07aa Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 8 May 2024 15:20:31 -0700 Subject: [PATCH 27/36] by default, disable scratch nfs --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index 033117fd..b3f2c948 100755 --- a/schema.yaml +++ b/schema.yaml @@ -907,7 +907,7 @@ variables: visible: and: - ${use_advanced} - default: true + default: false scratch_nfs_type_cluster: type: enum From fa5bb5880939753f1bbd2dc352c544dc990c4de4 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:52:08 -0600 Subject: [PATCH 28/36] Change healthchecks default to False --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index 66946082..10504223 100755 --- a/schema.yaml +++ b/schema.yaml @@ -1201,7 +1201,7 @@ configuration\" " healthchecks: type: boolean title: "Turn on Healthchecks for GPU nodes" - default: true + default: false description: "Will run tests on GPU nodes before starting a job. 
Nodes that are showing issues will be set in drain state" visible: ${slurm} From 9ba74808c253a83bc2e30b2e2f2fa3c2ad9b256e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:53:02 -0600 Subject: [PATCH 29/36] Add list of unreachable instances in resize --- bin/resize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/resize.py b/bin/resize.py index 9525fee4..03a6f58e 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -753,6 +753,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index if len(unreachable_instances): if not remove_unreachable: print("STDOUT: At least one unreachable node is in the inventory") + print(unreachable_instances) print("STDOUT: Not doing anything") exit(1) else: From 57adf1e6c0aa92b890b3abce702ced4a13365ab5 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:53:20 -0600 Subject: [PATCH 30/36] Add Tuner example --- samples/gpu/nccl_run_allreduce_tuner.sbatch | 88 +++++++++++++++++ samples/gpu/nccl_run_allreduce_tuner.sh | 103 ++++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 samples/gpu/nccl_run_allreduce_tuner.sbatch create mode 100644 samples/gpu/nccl_run_allreduce_tuner.sh diff --git a/samples/gpu/nccl_run_allreduce_tuner.sbatch b/samples/gpu/nccl_run_allreduce_tuner.sbatch new file mode 100644 index 00000000..f6924781 --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_tuner.sbatch @@ -0,0 +1,88 @@ +#!/bin/bash +#SBATCH --job-name=nccl-allreduce-slurm +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --exclusive +export PMI_DEBUG=1 + + +cd /nfs/scratch +mkdir $SLURM_JOB_ID +cd $SLURM_JOB_ID + +MACHINEFILE="hostfile" +ORDEREDMACHINEFILE="ordered_hostfile_system_name" +ORDEREDRANKMACHINEFILE="rankfile_system_name" + +scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE +echo MACHINEFILE +cat $MACHINEFILE + +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +fi + + +echo ORDEREDMACHINEFILE +cat $ORDEREDMACHINEFILE +echo ORDEREDRANKMACHINEFILE +cat $ORDEREDRANKMACHINEFILE + +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $mpivars_path + +export NCCL_DEBUG=WARN + +#mpirun -d --mca pml ucx -x SLURM_JOB_NODELIST=$host_list --bind-to numa -x NCCL_DEBUG=WARN -x NCCL_IB_SL=0 -x NCCL_IB_TC=41 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_GID_INDEX=3 -x NCCL_ALGO=Ring -x NCCL_TOPO_FILE=/home/opc/topo-flattened-b4.xml -x NCCL_IB_HCA="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_16,mlx5_17,mlx5_18,mlx5_19" -x UCX_NET_DEVICES=mlx5_0:1 -x HCOLL_ENABLE_MCAST_ALL=0 -x coll_hcoll_enable=0 -x UCX_TLS=ud,self,sm -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile rankfile_system_name /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 +# no need to pass: -x SLURM_JOB_NODELIST=$host_list + +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ $shape == \"BM.GPU.B4.8\" ] || [ 
$shape == \"BM.GPU.A100-v2.8\" ] +then + var_UCX_NET_DEVICES=mlx5_0:1 + var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12" +elif [ $shape == \"BM.GPU4.8\" ] +then + var_UCX_NET_DEVICES=mlx5_4:1 + var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17" +fi + +NCCL_version=`sudo ldconfig -v 2>&1 | grep "libnccl.so" | tail -n1 | sed -r 's/^.*\.so\.//'` +arr_NCCL=(${NCCL_version//./ }) +if [ ${arr_NCCL[2]} > 20 ] +then + tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1 +else + tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2 +fi + + + mpirun --mca pml ucx \ + --bind-to numa \ + --mca coll ^hcoll \ + -x NCCL_DEBUG=WARN \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_QPS_PER_CONNECTION=4 \ + -x UCX_TLS=ud,self,sm \ + -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x coll_hcoll_enable=0 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_TUNER_PLUGIN=${tuner_path} \ + -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ + --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 + + diff --git a/samples/gpu/nccl_run_allreduce_tuner.sh b/samples/gpu/nccl_run_allreduce_tuner.sh new file mode 100644 index 00000000..a8da9ba0 --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_tuner.sh @@ -0,0 +1,103 @@ +#!/bin/bash +set -e + +# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from -n iterations parameter of nccl allreduce which is set below using $iter +max=$1 + +# This assume, the hostfile passed is already ordered based on their rackId +if [ -n "$2" ]; then + hostfile=$2 +else + #hostfile="/home/opc/hostfile.tcp" + #hostfile="/etc/opt/oci-hpc/hostfile.tcp" + hostfile="/tmp/ordered_hostfile_system_name" +fi + +ORDEREDMACHINEFILE="ordered_hostfile_system_name" +ORDEREDRANKMACHINEFILE="rankfile_system_name" +echo INPUTFILE +cat $hostfile + +# will generate rack-aware ordered host file +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null +fi + +hostfile=$ORDEREDMACHINEFILE +rankfile=$ORDEREDRANKMACHINEFILE + +echo ORDEREDMACHINEFILE +cat $ORDEREDMACHINEFILE +echo ORDEREDRANKMACHINEFILE +cat $ORDEREDRANKMACHINEFILE + +# The number of GPUs to use for the test. Has to be multiplier of 8. If not passed, all GPUs will be used. 
+if [ -n "$3" ]; then + np=$3 +else + np=$((`less $hostfile | wc -l` * 8 )) +fi + +logfile="nccl_run_allreduce.sh.log" + +for x in $(seq 1 1 $max) +do + + echo $x + echo $x >> $logfile + date >> $logfile + + rankfile=$rankfile; np=$np ; iter=20; + + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi + +first_node=`head $hostfile -n 1` +shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` +if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] +then + var_UCX_NET_DEVICES=mlx5_0:1 + var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12" +elif [ $shape == \"BM.GPU4.8\" ] +then + var_UCX_NET_DEVICES=mlx5_4:1 + var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17" +fi + +NCCL_version=`sudo ldconfig -v 2>&1 | grep "libnccl.so" | tail -n1 | sed -r 's/^.*\.so\.//'` +arr_NCCL=(${NCCL_version//./ }) +if [ ${arr_NCCL[2]} < 21 ] +then + tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2 +else + tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1 +fi + + # final version + mpirun --mca pml ucx \ + --bind-to numa \ + --mca coll ^hcoll \ + -x NCCL_DEBUG=WARN \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_QPS_PER_CONNECTION=4 \ + -x UCX_TLS=ud,self,sm \ + -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x coll_hcoll_enable=0 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_ALGO=Ring \ + -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ + --np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile + + tail -n 32 $logfile + + +done + + From b2f1d732dd0ec614e5bd36c081ce2bd0f7052f4e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:53:36 -0600 Subject: [PATCH 31/36] Remove old tuner examples --- ..._ncclparam_tuner_nccl_run_allreduce.sbatch | 65 -------------- .../no_ncclparam_tuner_nccl_run_allreduce.sh | 87 ------------------- 2 files changed, 152 deletions(-) delete mode 100644 samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch delete mode 100644 samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sh diff --git a/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch b/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch deleted file mode 100644 index 7de77737..00000000 --- a/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=nccl-allreduce-slurm -#SBATCH --nodes=2 -#SBATCH --gpus-per-node=8 -#SBATCH --ntasks-per-node=8 -#SBATCH --exclusive -export PMI_DEBUG=1 - - -cd /nfs/cluster -mkdir $SLURM_JOB_ID -cd $SLURM_JOB_ID - -MACHINEFILE="hostfile" -ORDEREDMACHINEFILE="ordered_hostfile_system_name" -ORDEREDRANKMACHINEFILE="rankfile_system_name" - -scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE -echo MACHINEFILE -cat $MACHINEFILE - -source /etc/os-release -if [ $ID == "ol" ] || [ $ID == "centos" ] ; then - python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null - homedirectory=/home/opc -elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then - python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null - homedirectory=/home/ubuntu -fi - - 
-echo ORDEREDMACHINEFILE -cat $ORDEREDMACHINEFILE -echo ORDEREDRANKMACHINEFILE -cat $ORDEREDRANKMACHINEFILE - -mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` - -if [[ "$mpivars_path" == "" ]]; then - mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` -fi - -if [[ "$mpivars_path" == "" ]]; then - echo "Could not find MPIPATH"; exit; fi - -source $mpivars_path - -shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` -if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] -then - var_UCX_NET_DEVICES=mlx5_0:1 -elif [ $shape == \"BM.GPU4.8\" ] -then - var_UCX_NET_DEVICES=mlx5_4:1 -fi - - mpirun --mca pml ucx \ - --bind-to numa \ - --mca coll ^hcoll \ - -x UCX_TLS=ud,self,sm \ - -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ - -x HCOLL_ENABLE_MCAST_ALL=0 \ - -x coll_hcoll_enable=0 \ - -x NCCL_TUNER_PLUGIN=$homedirectory/libnccl-ocituner.so.1.0.1 \ - --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 diff --git a/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sh b/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sh deleted file mode 100644 index 25a496e3..00000000 --- a/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash -set -e - -# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from -n iterations parameter of nccl allreduce which is set below using $iter -max=$1 - -# This assume, the hostfile passed is already ordered based on their rackId -if [ -n "$2" ]; then - hostfile=$2 -else - hostfile="/tmp/ordered_hostfile_system_name" -fi - -ORDEREDMACHINEFILE="ordered_hostfile_system_name" -ORDEREDRANKMACHINEFILE="rankfile_system_name" -echo INPUTFILE -cat $hostfile - -# will generate rack-aware ordered host file -source /etc/os-release -if [ $ID == "ol" ] || [ $ID == "centos" ] ; then - python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null - homedirectory=/home/opc -elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then - python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null - homedirectory=/home/ubuntu -fi - -hostfile=$ORDEREDMACHINEFILE -rankfile=$ORDEREDRANKMACHINEFILE - -echo ORDEREDMACHINEFILE -cat $ORDEREDMACHINEFILE -echo ORDEREDRANKMACHINEFILE -cat $ORDEREDRANKMACHINEFILE - -# The number of GPUs to use for the test. Has to be multiplier of 8. If not passed, all GPUs will be used. -if [ -n "$3" ]; then - np=$3 -else - np=$((`less $hostfile | wc -l` * 8 )) -fi - -logfile="nccl_run_allreduce.sh.log" - -for x in $(seq 1 1 $max) -do - - echo $x - echo $x >> $logfile - date >> $logfile - - rankfile=$rankfile; np=$np ; iter=20; - - mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` - source $mpivars_path - - if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi - -first_node=`head $hostfile -n 1` -shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` -if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] -then - var_UCX_NET_DEVICES=mlx5_0:1 -elif [ $shape == \"BM.GPU4.8\" ] -then - var_UCX_NET_DEVICES=mlx5_4:1 -fi - - # final version - # all NCCL parameters are at /etc/nccl.conf on each compute node. 
- mpirun --mca pml ucx \ - --bind-to numa \ - --mca coll ^hcoll \ - -x UCX_TLS=ud,self,sm \ - -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ - -x HCOLL_ENABLE_MCAST_ALL=0 \ - -x coll_hcoll_enable=0 \ - -x NCCL_TUNER_PLUGIN=$homedirectory/libnccl-ocituner.so.1.0.1 \ - --np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile - - tail -n 32 $logfile - - -done - - From 5241b3da9165cc468c3a95bb19431ce635a4f1cb Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:53:54 -0600 Subject: [PATCH 32/36] Add GPU and RDMA monitoring --- .../tf_init/cluster-network-configuration.tf | 8 +++++++- .../tf_init/instance-pool-configuration.tf | 17 ++++++++++++++++- cluster-network-configuration.tf | 8 +++++++- compute-nodes.tf | 8 +++++++- instance-pool-configuration.tf | 17 ++++++++++++++++- 5 files changed, 53 insertions(+), 5 deletions(-) diff --git a/autoscaling/tf_init/cluster-network-configuration.tf b/autoscaling/tf_init/cluster-network-configuration.tf index 6b2805f1..3b12b2f9 100755 --- a/autoscaling/tf_init/cluster-network-configuration.tf +++ b/autoscaling/tf_init/cluster-network-configuration.tf @@ -41,7 +41,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati name = "Compute HPC RDMA Auto-Configuration" desired_state = plugins_config.value } - + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } } dynamic "platform_config" { diff --git a/autoscaling/tf_init/instance-pool-configuration.tf b/autoscaling/tf_init/instance-pool-configuration.tf index 31c31ab7..16f8f32c 100755 --- a/autoscaling/tf_init/instance-pool-configuration.tf +++ b/autoscaling/tf_init/instance-pool-configuration.tf @@ -18,8 +18,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" { user_data = base64encode(data.template_file.config.rendered) } agent_config { - is_management_disabled = true + + are_all_plugins_disabled = false + is_management_disabled = true + is_monitoring_disabled = false + + plugins_config { + desired_state = "DISABLED" + name = "OS Management Service Agent" + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } + } shape = var.instance_pool_shape dynamic "shape_config" { diff --git a/cluster-network-configuration.tf b/cluster-network-configuration.tf index 1c097ca5..f2772b2a 100755 --- a/cluster-network-configuration.tf +++ b/cluster-network-configuration.tf @@ -45,7 +45,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati name = "Compute HPC RDMA Auto-Configuration" desired_state = plugins_config.value } - + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? 
["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } } diff --git a/compute-nodes.tf b/compute-nodes.tf index 1544c5ad..4149e958 100755 --- a/compute-nodes.tf +++ b/compute-nodes.tf @@ -48,7 +48,13 @@ resource "oci_core_instance" "compute_cluster_instances" { name = "Compute HPC RDMA Auto-Configuration" desired_state = plugins_config.value } - + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } } diff --git a/instance-pool-configuration.tf b/instance-pool-configuration.tf index b28dbe5c..1fffd744 100755 --- a/instance-pool-configuration.tf +++ b/instance-pool-configuration.tf @@ -22,8 +22,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" { user_data = base64encode(data.template_file.config.rendered) } agent_config { - is_management_disabled = true + + are_all_plugins_disabled = false + is_management_disabled = true + is_monitoring_disabled = false + + plugins_config { + desired_state = "DISABLED" + name = "OS Management Service Agent" + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } + } shape = var.instance_pool_shape dynamic "shape_config" { From 6c14e8152e53f7b25216836bffb4b7052a36bc22 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 15 May 2024 09:55:20 -0600 Subject: [PATCH 33/36] Update images --- schema.yaml | 30 +++++++++++++++--------------- variables.tf | 12 ++++++------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/schema.yaml b/schema.yaml index 10504223..a2bea948 100755 --- a/schema.yaml +++ b/schema.yaml @@ -418,11 +418,11 @@ variables: enum: - "HPC_OL7" - "HPC_OL8" - - "GPU_OL7_CUDA12.2" - - "GPU_OL8_CUDA12.2" - - "GPU_OL7_CUDA12.4" - - "GPU_OL8_CUDA12.4" - default: "HPC_OL7" + - "GPU_OL8_NV550" + - "GPU_OL7_NV550" + - "GPU_OL8_NV535" + - "GPU_OL7_NV535" + default: "GPU_OL8_NV550" visible: ${use_marketplace_image_controller} controller_username: @@ -762,11 +762,11 @@ variables: enum: - "HPC_OL7" - "HPC_OL8" - - "GPU_OL7_CUDA12.2" - - "GPU_OL8_CUDA12.2" - - "GPU_OL7_CUDA12.4" - - "GPU_OL8_CUDA12.4" - default: "HPC_OL7" + - "GPU_OL8_NV550" + - "GPU_OL7_NV550" + - "GPU_OL8_NV535" + - "GPU_OL7_NV535" + default: "HPC_OL8" visible: ${use_marketplace_image} use_compute_agent: @@ -1681,11 +1681,11 @@ configuration\" " enum: - "HPC_OL7" - "HPC_OL8" - - "GPU_OL7_CUDA12.2" - - "GPU_OL8_CUDA12.2" - - "GPU_OL7_CUDA12.4" - - "GPU_OL8_CUDA12.4" - default: "HPC_OL7" + - "GPU_OL8_NV550" + - "GPU_OL7_NV550" + - "GPU_OL8_NV535" + - "GPU_OL7_NV535" + default: "HPC_OL8" visible: and: - ${use_marketplace_image_login} diff --git a/variables.tf b/variables.tf index c6067f23..8cc2c70b 100755 --- a/variables.tf +++ b/variables.tf @@ -87,12 +87,12 @@ variable "marketplace_listing" { variable "marketplace_version_id" { type = map(string) default = { - "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" - "GPU_OL7_CUDA12.2" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" - "GPU_OL8_CUDA12.2" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" - 
"GPU_OL7_CUDA12.4" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.4-2024.03.15-0" - "GPU_OL8_CUDA12.4" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.4-2024.03.15-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.05.08-0" + "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.05.08-0" + "GPU_OL8_NV550" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.05.08-0" + "GPU_OL7_NV550" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.05.13-0" + "GPU_OL8_NV535" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.05.08-0" + "GPU_OL7_NV535" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.05.13-0" } } From ddf95e7ed2a1dfa0ef43293c32d418e266e42f24 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 15 May 2024 11:19:13 -0600 Subject: [PATCH 34/36] Fix default images --- schema.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schema.yaml b/schema.yaml index a2bea948..136c506a 100755 --- a/schema.yaml +++ b/schema.yaml @@ -422,7 +422,7 @@ variables: - "GPU_OL7_NV550" - "GPU_OL8_NV535" - "GPU_OL7_NV535" - default: "GPU_OL8_NV550" + default: "HPC_OL8" visible: ${use_marketplace_image_controller} controller_username: @@ -766,7 +766,7 @@ variables: - "GPU_OL7_NV550" - "GPU_OL8_NV535" - "GPU_OL7_NV535" - default: "HPC_OL8" + default: "GPU_OL8_NV550" visible: ${use_marketplace_image} use_compute_agent: From b7af925469988971dc4de05bea4efef4348d6ad6 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 15 May 2024 15:46:27 -0600 Subject: [PATCH 35/36] Make sure scratch is disabled by default --- variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/variables.tf b/variables.tf index 8cc2c70b..fcee9d94 100755 --- a/variables.tf +++ b/variables.tf @@ -41,7 +41,7 @@ variable "use_compute_agent" { default = true } variable "unsupported_controller_image" { default = "" } variable "unsupported_login_image" { default = "" } variable "use_cluster_nfs" { default = true} -variable "use_scratch_nfs" { default = true } +variable "use_scratch_nfs" { default = false } variable "cluster_nfs_path" { default = "/nfs/cluster" } variable "scratch_nfs_path" { default = "/nfs/scratch" } variable "vcn_compartment" { default = ""} From a89a1b2f00d1f915bd092245a2e1ab485382bdcd Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Fri, 17 May 2024 09:51:40 -0600 Subject: [PATCH 36/36] Update healthcheck boolean --- playbooks/roles/slurm/tasks/common.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 755bc51f..865312ca 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -210,4 +210,5 @@ dest: "{{ slurm_conf_path }}/prolog.d/healthchecks.sh" owner: root group: root - mode: '0755' \ No newline at end of file + mode: '0755' + when: healthchecks|bool \ No newline at end of file