From cbd27c8fbcc3b9efc4e79c4eb1aedde80e90af84 Mon Sep 17 00:00:00 2001
From: Dhvani Sheth
Date: Wed, 6 Mar 2024 10:26:25 -0800
Subject: [PATCH 01/36] add a prompt when removing nodes from the cluster

---
 bin/remove_nodes_prompt.txt |  4 ++++
 bin/resize.sh               | 20 +++++++++++++++++++-
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 bin/remove_nodes_prompt.txt

diff --git a/bin/remove_nodes_prompt.txt b/bin/remove_nodes_prompt.txt
new file mode 100644
index 00000000..dcc6792d
--- /dev/null
+++ b/bin/remove_nodes_prompt.txt
@@ -0,0 +1,4 @@
+Does your cluster run a file system such as Ceph or NFS on the GPU/HPC nodes themselves, using their local NVMe SSDs?
+If yes, terminating the nodes that store your data can result in permanent data loss, so before proceeding make sure any important data is copied to a persistent file system outside of the cluster, such as object storage or file storage.
+Once the data is backed up or migrated, come back and rerun the script. For now, select 2 to exit.
+Remember: once the nodes are terminated, all data on them is lost forever and cannot be recovered.
\ No newline at end of file
diff --git a/bin/resize.sh b/bin/resize.sh
index 92dea986..1080ca25 100755
--- a/bin/resize.sh
+++ b/bin/resize.sh
@@ -15,6 +15,12 @@ then
 	exit
 fi
 
+if [ $USER != "ubuntu" ] && [ $USER != "opc" ]
+then
+	echo "Run this script as opc or ubuntu"
+	exit
+fi
+
 if [ $# -eq 0 ]
 then
 	python3 $folder/resize.py --help
@@ -51,6 +57,18 @@ for (( i=1; i<=$#; i++)); do
   fi
 done
 
+if [ $resize_type == "remove" ] || [ $resize_type == "remove_unreachable" ]
+then
+	echo "$(cat $folder/remove_nodes_prompt.txt)"
+	echo "Do you confirm that you have completed all of the steps above and wish to proceed with terminating the nodes? Enter 1 for Yes or 2 for No (to exit)."
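+	# The select menu below accepts 1 (Yes) to continue with terminating the nodes, or 2 (No) to exit without removing anything.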
+ select yn in "Yes" "No"; do + case $yn in + Yes ) break;; + No ) exit;; + esac + done +fi + if [ $resize_type != "default" ] then if [ $permanent -eq 0 ] @@ -148,5 +166,5 @@ then rm currently_resizing fi else - python3 $folder/resize.py ${@} + python3 $folder/resize.py ${@} & fi From da709849b6a7e77d1558219e3f5ec95db7684b34 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 15 Mar 2024 17:03:33 -0700 Subject: [PATCH 02/36] in progress for adding silent mode --- autoscaling/crontab/autoscale_slurm.sh | 6 +++--- bin/delete_cluster.sh | 2 +- bin/resize.py | 1 + bin/resize.sh | 6 +++++- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index 9882b44c..b08e639e 100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -364,7 +364,7 @@ try: initial_nodes=[] unreachable_nodes=[] if cluster_name == "NOCLUSTERFOUND": - subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name]) + subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name],'--quiet') continue for node in nodes_to_destroy[cluster_name]: try: @@ -376,9 +376,9 @@ try: except: unreachable_nodes.append(node) if len(initial_nodes) > 0: - subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes) + subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes,'--quiet') if len(unreachable_nodes) > 0: - subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes) + subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes,'--quiet') time.sleep(1) for index,cluster in enumerate(cluster_to_build): diff --git a/bin/delete_cluster.sh b/bin/delete_cluster.sh index 7328a206..983e10a3 100755 --- a/bin/delete_cluster.sh +++ b/bin/delete_cluster.sh @@ -103,7 +103,7 @@ else for node in `scontrol show hostname $nodes 2>&1` do echo "Cleaning up node " $node - /opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node + /opt/oci-hpc/bin/resize.sh remove_unreachable --nodes $node --quiet done fi cd diff --git a/bin/resize.py b/bin/resize.py index 9525fee4..8710b61f 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -577,6 +577,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index parser.add_argument('--force', help='If present. Nodes will be removed even if the destroy playbook failed',action='store_true',default=False) parser.add_argument('--ansible_crucial', help='If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. 
Non live nodes will be removed',action='store_true',default=False) parser.add_argument('--remove_unreachable', help='If present, nodes that are not sshable will be terminated before running the action that was requested (Example Adding a node) ',action='store_true',default=False) +parser.add_argument('--quiet', help='If present, the script will not prompt for a response when removing nodes and will not give a reminder to save data from nodes that are being removed ',action='store_true',default=False) args = parser.parse_args() diff --git a/bin/resize.sh b/bin/resize.sh index 1080ca25..cbcb4232 100755 --- a/bin/resize.sh +++ b/bin/resize.sh @@ -32,6 +32,7 @@ permanent=1 controllerName=`hostname` cluster_name=${controllerName/-controller/} nodes=NULL +quietMode=False for (( i=1; i<=$#; i++)); do if [ ${!i} == "--cluster_name" ] then @@ -54,10 +55,13 @@ for (( i=1; i<=$#; i++)); do then j=$((i+1)) nodes=${@:j} + elif [ ${!i} == "--quiet" ] + then + quietMode=True fi done -if [ $resize_type == "remove" ] || [ $resize_type == "remove_unreachable" ] +if [ $resize_type == "remove" ] || [ $resize_type == "remove_unreachable" ] && [ $quietMode == "False" ] then echo "$(cat $folder/remove_nodes_prompt.txt)" echo "Do you confirm you have done all of the above steps and wish to proceed for the termination of the nodes? Enter 1 for Yes and 2 for No (to exit)." From 599bf403f9da4736658bfc8e99bf9dfa93d4e14e Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 15 Mar 2024 17:10:43 -0700 Subject: [PATCH 03/36] fix for resize.sh remove failing when no nfs is defined --- bin/resize.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/resize.py b/bin/resize.py index 9525fee4..9c9f7217 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -447,7 +447,10 @@ def getNFSnode(inventory): return '' if len(dict['nfs']) == 0: return '' - return dict['nfs'][0].split()[0] + if dict['nfs'][0] == '\n': + return '' + else: + return dict['nfs'][0].split()[0] def get_summary(comp_ocid,cluster_name): CN = "CN" From 39b1db4abded7af025ca88552c09645f4b3caf6a Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 19 Mar 2024 13:38:37 -0700 Subject: [PATCH 04/36] added the check for autoscaling value in /etc/ansible/hosts --- autoscaling/crontab/autoscale_slurm.sh | 224 +++++++++++++------------ 1 file changed, 120 insertions(+), 104 deletions(-) diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index b08e639e..8ccde4ef 100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -91,8 +91,8 @@ def getIdleTime(node): return ( datetime.datetime.now() - right_time ).total_seconds() # Get the last time a node state was changed. 
This is used to get how long a cluster has been idle for -def getQueueConf(file): - with open(queues_conf_file) as file: +def getQueueConf(queue_file): + with open(queue_file) as file: try: data = yaml.load(file,Loader=yaml.FullLoader) except: @@ -328,109 +328,125 @@ def getstatus_slurm(): cluster_destroying.append(clusterName) return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes -if os.path.isfile(lockfile): - print( "Lockfile "+lockfile + " is present, exiting" ) - exit() -open(lockfile,'w').close() -try: - path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0]))) - clusters_path = os.path.join(path,'clusters') - config = getQueueConf(queues_conf_file) - - cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm() - - print (time.strftime("%Y-%m-%d %H:%M:%S")) - print (cluster_to_build,'cluster_to_build') - print (cluster_to_destroy,'cluster_to_destroy') - print (nodes_to_destroy,'nodes_to_destroy') - print (cluster_building,'cluster_building') - print (cluster_destroying,'cluster_destroying') - print (current_nodes,'current_nodes') - print (building_nodes,'building_nodes') - - for i in cluster_building: - for j in cluster_to_build: - if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]: - cluster_to_build.remove(j) - break - for cluster in cluster_to_destroy: - cluster_name=cluster[0] - print ("Deleting cluster "+cluster_name) - subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name]) - time.sleep(5) - - for cluster_name in nodes_to_destroy.keys(): - print ("Resizing cluster "+cluster_name) - initial_nodes=[] - unreachable_nodes=[] - if cluster_name == "NOCLUSTERFOUND": - subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name],'--quiet') - continue - for node in nodes_to_destroy[cluster_name]: +def getAutoscaling(): + out = subprocess.Popen(["cat /etc/ansible/hosts | grep 'autoscaling =' | awk -F '= ' '{print $2}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + output = stdout.split("\n") + autoscaling_value=False + for i in range(0,len(output)-1): + autoscaling_value=output[i] + return autoscaling_value + +autoscaling = getAutoscaling() + +if autoscaling == "true": + + if os.path.isfile(lockfile): + print( "Lockfile "+lockfile + " is present, exiting" ) + exit() + open(lockfile,'w').close() + try: + path = os.path.dirname(os.path.dirname(os.path.realpath(sys.argv[0]))) + clusters_path = os.path.join(path,'clusters') + config = getQueueConf(queues_conf_file) + + cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes=getstatus_slurm() + + print (time.strftime("%Y-%m-%d %H:%M:%S")) + print (cluster_to_build,'cluster_to_build') + print (cluster_to_destroy,'cluster_to_destroy') + print (nodes_to_destroy,'nodes_to_destroy') + print (cluster_building,'cluster_building') + print (cluster_destroying,'cluster_destroying') + print (current_nodes,'current_nodes') + print (building_nodes,'building_nodes') + + for i in cluster_building: + for j in cluster_to_build: + if i[0]==j[0] and i[1]==j[1] and i[2]==j[2]: + cluster_to_build.remove(j) + break + for cluster in cluster_to_destroy: + cluster_name=cluster[0] + print ("Deleting cluster "+cluster_name) + subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name]) + 
time.sleep(5) + + for cluster_name in nodes_to_destroy.keys(): + print ("Resizing cluster "+cluster_name) + initial_nodes=[] + unreachable_nodes=[] + if cluster_name == "NOCLUSTERFOUND": + subprocess.Popen([script_path+'/resize.sh','remove_unreachable','--nodes']+nodes_to_destroy[cluster_name],'--quiet') + continue + for node in nodes_to_destroy[cluster_name]: + try: + alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True) + for alt_name in alt_names.split("\n")[0].split(): + if alt_name.startswith('inst-'): + initial_nodes.append(alt_name) + break + except: + unreachable_nodes.append(node) + if len(initial_nodes) > 0: + subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes,'--quiet') + if len(unreachable_nodes) > 0: + subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes,'--quiet') + time.sleep(1) + + for index,cluster in enumerate(cluster_to_build): + nodes=cluster[0] + instance_type = cluster[1] + queue=cluster[2] + jobID=str(cluster[3]) + user=str(cluster[4]) + jobconfig=getJobConfig(config,queue,instance_type) + limits=getQueueLimits(config,queue,instance_type) try: - alt_names=subprocess.check_output(["cat /etc/hosts | grep "+node],shell=True,universal_newlines=True) - for alt_name in alt_names.split("\n")[0].split(): - if alt_name.startswith('inst-'): - initial_nodes.append(alt_name) - break + clusterCount=len(used_index[queue][instance_type]) except: - unreachable_nodes.append(node) - if len(initial_nodes) > 0: - subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes,'--quiet') - if len(unreachable_nodes) > 0: - subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes,'--quiet') - time.sleep(1) - - for index,cluster in enumerate(cluster_to_build): - nodes=cluster[0] - instance_type = cluster[1] - queue=cluster[2] - jobID=str(cluster[3]) - user=str(cluster[4]) - jobconfig=getJobConfig(config,queue,instance_type) - limits=getQueueLimits(config,queue,instance_type) - try: - clusterCount=len(used_index[queue][instance_type]) - except: - clusterCount=0 - if clusterCount>=limits["max_cluster_count"]: - print ("This would go over the number of running clusters, you have reached the max number of clusters") - continue - nextIndex=None - if clusterCount==0: - if queue in used_index.keys(): - used_index[queue][instance_type]=[1] + clusterCount=0 + if clusterCount>=limits["max_cluster_count"]: + print ("This would go over the number of running clusters, you have reached the max number of clusters") + continue + nextIndex=None + if clusterCount==0: + if queue in used_index.keys(): + used_index[queue][instance_type]=[1] + else: + used_index[queue]={instance_type:[1]} + nextIndex=1 else: - used_index[queue]={instance_type:[1]} - nextIndex=1 - else: - for i in range(1,10000): - if not i in used_index[queue][instance_type]: - nextIndex=i - used_index[queue][instance_type].append(i) - break - clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"] - if not queue in current_nodes.keys(): - current_nodes[queue]={instance_type:0} - else: - if not instance_type in current_nodes[queue].keys(): - current_nodes[queue][instance_type]=0 - if not queue in building_nodes.keys(): - building_nodes[queue]={instance_type:0} - else: - if not 
instance_type in building_nodes[queue].keys(): - building_nodes[queue][instance_type]=0 - if nodes > limits["max_cluster_size"]: - print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit") - elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]: - print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit") - else: - current_nodes[queue][instance_type]+=nodes - clusterCount+=1 - print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes") - subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user]) - time.sleep(5) + for i in range(1,10000): + if not i in used_index[queue][instance_type]: + nextIndex=i + used_index[queue][instance_type].append(i) + break + clusterName=queue+'-'+str(nextIndex)+'-'+jobconfig["instance_keyword"] + if not queue in current_nodes.keys(): + current_nodes[queue]={instance_type:0} + else: + if not instance_type in current_nodes[queue].keys(): + current_nodes[queue][instance_type]=0 + if not queue in building_nodes.keys(): + building_nodes[queue]={instance_type:0} + else: + if not instance_type in building_nodes[queue].keys(): + building_nodes[queue][instance_type]=0 + if nodes > limits["max_cluster_size"]: + print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes per cluster limit") + elif current_nodes[queue][instance_type] + building_nodes[queue][instance_type] + nodes > limits["max_number_nodes"]: + print ("Cluster "+clusterName+" won't be created, it would go over the total number of nodes limit") + else: + current_nodes[queue][instance_type]+=nodes + clusterCount+=1 + print ("Creating cluster "+clusterName+" with "+str(nodes)+" nodes") + subprocess.Popen([script_path+'/create_cluster.sh',str(nodes),clusterName,instance_type,queue,jobID,user]) + time.sleep(5) -except Exception: - traceback.print_exc() -os.remove(lockfile) + except Exception: + traceback.print_exc() + os.remove(lockfile) +else: + print("Autoscaling is false") + exit() From 2d8e3bacbc922ef0a949f1e4faac1b4a1000d5d4 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 19 Mar 2024 15:06:35 -0700 Subject: [PATCH 05/36] Update Readme --- README.md | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 6320d108..3685991c 100644 --- a/README.md +++ b/README.md @@ -104,12 +104,12 @@ optional arguments: --ansible_crucial If present during reconfiguration, only crucial ansible playbooks will be executed on the live nodes. Non live nodes will be removed - --remove_unreachable If present, nodes that are not sshable will be removed - from the config. They will however not be removed from - Slurm to avoid losing track of the down nodes. If you - need to remove them from Slurm after terminating the - nodes in the console. 
Run sudo scontrol update - nodename=name state=Future + --remove_unreachable If present, nodes that are not sshable will be terminated + before running the action that was requested + (Example Adding a node) + --quiet If present, the script will not prompt for a response when + removing nodes and will not give a reminder to save data + from nodes that are being removed ``` **Add nodes** @@ -161,6 +161,13 @@ Remove 3 nodes randomly from compute-1-hpc: ``` /opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc +``` +or +Remove 3 nodes randomly from compute-1-hpc but do not prompt for a response when removing the nodes and do not give a reminder to save data +from nodes that are being removed : +``` +/opt/oci-hpc/bin/resize.sh remove 3 --cluster_name compute-1-hpc --quiet + ``` **Reconfigure nodes** @@ -208,6 +215,10 @@ Uncomment the line in `crontab -e`: ``` * * * * * /opt/oci-hpc/autoscaling/crontab/autoscale_slurm.sh >> /opt/oci-hpc/logs/crontab_slurm.log 2>&1 ``` +And in /etc/ansible/hosts, below value should be true +``` +autoscaling = true +``` # Submit How to submit jobs: From 48eb7cc2de46583e43950c1bb4f9129886400167 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Mon, 25 Mar 2024 15:27:01 -0600 Subject: [PATCH 06/36] Fix visibility of Private Zone Name --- schema.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/schema.yaml b/schema.yaml index be651072..e48577fd 100755 --- a/schema.yaml +++ b/schema.yaml @@ -1062,8 +1062,11 @@ variables: title: Private Zone Name description: "The zone needs to be private for the stack to be able to add entries" type: string - visible: ${use_existing_vcn} - required: true + visible: + and: + - ${dns_entries} + - ${use_existing_vcn} + required: ${dns_entries} vcn_subnet: type: string title: "VCN IP range" From cfb330d4ff7ed48144a50f5757661868726a030d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:06:59 -0600 Subject: [PATCH 07/36] Update terraform provider version --- autoscaling/tf_init/versions.tf | 2 +- versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 57e63004..28a169ec 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "5.30.0" + version = "5.34.0" } } } \ No newline at end of file diff --git a/versions.tf b/versions.tf index 57e63004..28a169ec 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "5.30.0" + version = "5.34.0" } } } \ No newline at end of file From eb84c37c7e42a9ef867199d0959346364e7315bb Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:08:33 -0600 Subject: [PATCH 08/36] Add healthcheck in prolog --- autoscaling/tf_init/controller_update.tf | 3 +- autoscaling/tf_init/inventory.tpl | 3 +- conf/variables.tpl | 1 + controller.tf | 6 +- inventory.tpl | 3 +- playbooks/new_nodes.yml | 2 + playbooks/resize_add.yml | 2 + .../healthchecks/files/check_h100_setup.py | 507 ++++++++++++++++++ .../roles/healthchecks/files/gpu_bw_test.py | 167 ++++++ .../healthchecks/files/rdma_link_flapping.py | 171 ++++++ .../healthchecks/files/shared_logging.py | 5 + .../roles/healthchecks/files/xid_checker.py | 200 
+++++++ playbooks/roles/healthchecks/tasks/main.yml | 22 + playbooks/roles/slurm/files/healthchecks.sh | 11 + playbooks/roles/slurm/tasks/common.yml | 20 +- playbooks/roles/slurm/tasks/common_pyxis.yml | 12 +- .../templates/{prolog.sh.j2 => pyxis.sh.j2} | 0 playbooks/roles/slurm/templates/slurm.conf.j2 | 4 +- playbooks/site.yml | 2 + schema.yaml | 8 + slurm_ha.tf | 6 +- variables.tf | 4 +- 22 files changed, 1146 insertions(+), 13 deletions(-) create mode 100644 playbooks/roles/healthchecks/files/check_h100_setup.py create mode 100644 playbooks/roles/healthchecks/files/gpu_bw_test.py create mode 100644 playbooks/roles/healthchecks/files/rdma_link_flapping.py create mode 100644 playbooks/roles/healthchecks/files/shared_logging.py create mode 100644 playbooks/roles/healthchecks/files/xid_checker.py create mode 100755 playbooks/roles/healthchecks/tasks/main.yml create mode 100644 playbooks/roles/slurm/files/healthchecks.sh rename playbooks/roles/slurm/templates/{prolog.sh.j2 => pyxis.sh.j2} (100%) diff --git a/autoscaling/tf_init/controller_update.tf b/autoscaling/tf_init/controller_update.tf index 5d58f76e..ec4ec5ac 100755 --- a/autoscaling/tf_init/controller_update.tf +++ b/autoscaling/tf_init/controller_update.tf @@ -77,7 +77,8 @@ resource "local_file" "inventory" { compute_username = var.compute_username, pam = var.pam, sacct_limits = var.sacct_limits, - use_compute_agent=var.use_compute_agent + use_compute_agent=var.use_compute_agent, + healthchecks=var.healthchecks }) filename = "${local.controller_path}/inventory" } diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 56c20cb9..24a2355d 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -73,4 +73,5 @@ pam = ${pam} sacct_limits=${sacct_limits} use_compute_agent=${use_compute_agent} zone_name=${zone_name} -dns_entries=${dns_entries} \ No newline at end of file +dns_entries=${dns_entries} +healthchecks=${healthchecks} \ No newline at end of file diff --git a/conf/variables.tpl b/conf/variables.tpl index 97fc9eb2..c8bde956 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -31,6 +31,7 @@ variable "private_subnet" {default = "##PRIVATE_SUBNET##"} variable "rdma_subnet" { default = "${rdma_subnet}" } variable "zone_name" {default = "${zone_name}"} variable "dns_entries" {default = "${dns_entries}"} +variable "healthchecks" {default = "${healthchecks}"} variable "slurm" { default = ${slurm} } variable "rack_aware" { default = ${rack_aware} } variable "pyxis" { default = ${pyxis} } diff --git a/controller.tf b/controller.tf index b215ab82..ad45196e 100644 --- a/controller.tf +++ b/controller.tf @@ -297,7 +297,8 @@ resource "null_resource" "cluster" { region = var.region, tenancy_ocid = var.tenancy_ocid, api_fingerprint = var.api_fingerprint, - api_user_ocid = var.api_user_ocid + api_user_ocid = var.api_user_ocid, + healthchecks = var.healthchecks }) destination = "/opt/oci-hpc/playbooks/inventory" @@ -445,7 +446,8 @@ resource "null_resource" "cluster" { virt_instr = var.virt_instr, access_ctrl = var.access_ctrl, numa_nodes_per_socket = var.numa_nodes_per_socket, - percentage_of_cores_enabled = var.percentage_of_cores_enabled + percentage_of_cores_enabled = var.percentage_of_cores_enabled, + healthchecks = var.healthchecks }) destination = "/opt/oci-hpc/conf/variables.tf" diff --git a/inventory.tpl b/inventory.tpl index f39e534e..c0824ecd 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -78,4 +78,5 @@ api_user_ocid = ${api_user_ocid} sacct_limits=${sacct_limits} 
use_compute_agent=${use_compute_agent} zone_name=${zone_name} -dns_entries=${dns_entries} \ No newline at end of file +dns_entries=${dns_entries} +healthchecks=${healthchecks} \ No newline at end of file diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index b160873f..39efe46f 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -54,6 +54,8 @@ when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem + - include_role: + name: healthchecks - hosts: controller,slurm_backup,login,compute become: true diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 09be3ecd..47b7071b 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -52,6 +52,8 @@ when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem + - include_role: + name: healthchecks - hosts: controller,slurm_backup,login,compute become: true diff --git a/playbooks/roles/healthchecks/files/check_h100_setup.py b/playbooks/roles/healthchecks/files/check_h100_setup.py new file mode 100644 index 00000000..99b0498d --- /dev/null +++ b/playbooks/roles/healthchecks/files/check_h100_setup.py @@ -0,0 +1,507 @@ +#!/usr/bin/env python3 + +import subprocess +import re +import argparse +from datetime import datetime +from shared_logging import logger +from gpu_bw_test import BandwidthTest +from rdma_link_flapping import LinkFlappingTest +from xid_checker import XidChecker +import platform +import os +import sys + +def is_user_root(): + # Check if the user is root + if os.geteuid() != 0: + logger.debug("User is root") + return False + return True + +def get_oca_version(): + # Run the shell command + os_name = platform.system() + + + if os_name == 'Linux': + try: + distro = platform.linux_distribution()[0] + except: + import distro + distro = distro.name() + + if 'Ubuntu' in distro: + if not is_user_root(): + result = subprocess.run(['sudo', 'snap', 'info', 'oracle-cloud-agent'], stdout=subprocess.PIPE) + else: + result = subprocess.run(['snap', 'info', 'oracle-cloud-agent'], stdout=subprocess.PIPE) + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + + # Define the regular expression pattern for the version + pattern = r'installed:\s+(\d+\.\d+\.\d+)' + match = re.search(pattern, output) + if match: + version = match.group(1) + + elif 'Oracle' in distro: + result = subprocess.run(['rpm', '-qa'], stdout=subprocess.PIPE) + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + + # Define the regular expression pattern for the version + pattern = r'oracle-cloud-agent-(\d+\.\d+\.\d+)' + match = re.search(pattern, output) + if match: + version = match.group(1) + + + if version < "1.39.0": + logger.error(f"Oracle Cloud Agent: {version} needs to be updated to 1.38.0 or higher") + else: + logger.info(f"Oracle Cloud Agent: {version}") + + # Return the version + return version + +def check_rttcc_status(): + link_status = [] + devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + status = "disabled" + status_dict = {"devices": {}} + for device in devices: + if not is_user_root(): + command = ['sudo', 'mlxreg', '-d', device, '-y', '--get', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] + else: + command = ['mlxreg', '-d', device, '-y', '--set', 'cmd_type=3', 
'--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] + result = subprocess.run(command, stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + filtered_output = [line for line in output.split('\n') if line.startswith('value')] + for line in filtered_output: + logger.debug(line) + if "0x00000001" in line: + status_dict["devices"][device] = "enabled" + + for device in status_dict["devices"]: + if status_dict["devices"][device] == "enabled": + logger.warning(f"RTTCC enabled on {device}") + status = "enabled" + link_status.append(f"RTTCC enabled on: {device}") + else: + logger.info(f"RTTCC status for {device}: disabled") + if status == "disabled": + logger.info(f"RTTCC disabled check: Passed") + else: + logger.error(f"RTTCC disabled check: Failed") + + return link_status + +def check_ecc_errors(): + ecc_issues = [] + try: + # Run the nvidia-smi -q command + result = subprocess.run(['nvidia-smi', '-q'], stdout=subprocess.PIPE) + except FileNotFoundError: + logger.warning("Skipping SRAM/DRAM ECC Test: nvidia-smi command not found") + return [] + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + + # Find the lines containing "SRAM Correctable" and "DRAM Correctable" + sram_matches = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', output) + if len(sram_matches)==0: + sram_matches = re.findall(r'SRAM Uncorrectable SEC-DED\s+:\s+(\d+)', output) + dram_matches = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', output) + gpu_matches = re.findall(r'\nGPU\s+(.*)\n', output) + vol_sram_line = sram_matches[0::2] + vol_dram_line = dram_matches[0::2] + agg_sram_line = sram_matches[1::2] + agg_dram_line = dram_matches[1::2] + + for i, gpu in enumerate(gpu_matches): + logger.debug(f"GPU: {gpu}") + if vol_sram_line[i] != "0": + logger.debug(f"Volatile SRAM Uncorrectable: {vol_sram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Volatile SRAM Uncorrectable: {vol_sram_line[i]}") + if vol_dram_line[i] != "0": + logger.debug(f"Volatile DRAM Uncorrectable: {vol_dram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Volatile DRAM Uncorrectable: {vol_dram_line[i]}") + if agg_sram_line[i] != "0": + logger.debug(f"Aggregate SRAM Uncorrectable: {agg_sram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Aggregate SRAM Uncorrectable: {agg_sram_line[i]}") + if agg_dram_line[i] != "0": + logger.debug(f"Aggregate DRAM Uncorrectable: {agg_dram_line[i]}") + ecc_issues.append(f"{gpu_matches[i]} - Aggregate DRAM Uncorrectable: {agg_dram_line[i]}") + + + # Check if there are ecc_issues + if len(ecc_issues) == 0: + logger.info("GPU ECC Test: Passed") + else: + logger.warning("GPU ECC Test: Failed") + + return ecc_issues + +def check_row_remap_errors(): + remap_issues = [] + try: + # Run the nvidia-smi -q command + result = subprocess.run(['nvidia-smi', '--query-remapped-rows=remapped_rows.pending,remapped_rows.failure,remapped_rows.uncorrectable', '--format=csv,noheader'], stdout=subprocess.PIPE) + + if result.returncode != 0: + logger.debug(f"Check row remap command exited with error code: {result.returncode}") + + except FileNotFoundError: + logger.warning("Skipping Row Remap Test: nvidia-smi command not found") + return [] + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + logger.debug("Output: {}".format(output)) + for i, line in enumerate(output.split('\n')): + if line == "": + continue + tmp_data = line.split(",") + tmp_data = [x.strip() for x in tmp_data] + if tmp_data[0] != "0": + 
logger.debug(f"GPU: {i} - Row Remap Pending: {tmp_data[0]}") + remap_issues.append(f"GPU: {i} Row Remap Pending: {tmp_data[0]}") + if tmp_data[1] != "0": + logger.debug(f"GPU: {i} - Row Remap Failure: {tmp_data[1]}") + #remap_issues.append(f"GPU: {i} Row Remap Failure: {tmp_data[1]}") + if tmp_data[2] != "0": + logger.debug(f"GPU: {i} - Row Remap Uncorrectable: {tmp_data[2]}") + if int(tmp_data[2]) > 512: + remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable >512: {tmp_data[2]}") + else: + remap_issues.append(f"GPU: {i} - Row Remap Uncorrectable <512: {tmp_data[2]}")# Check if there are ecc_issues + + if len(remap_issues) == 0: + logger.info("GPU Remap Test: Passed") + else: + logger.warning("GPU Remap Test: Failed") + + return remap_issues + +def check_rdma_link_status(): + status = True + devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + + link_issues = [] + for device in devices: + # Run the mlxlink command + if not is_user_root(): + command = ['sudo', 'mlxlink', '-d', device, '-m', '-c', '-e'] + else: + command = ['mlxlink', '-d', device, '-m', '-c', '-e'] + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Decode the output from bytes to string + output = result.stdout.decode('utf-8') + stderr = result.stderr.decode('utf-8') + + if stderr and stderr.find("-E-") != -1: + stderr = stderr.split("\n") + stderr_line = ", ".join(stderr) + logger.debug(f"{device}: {stderr_line}") + link_issues.append(f"{device}: {stderr[0]}") + status = "False" + continue + + # Find the line containing "Recommendation" + color_pattern = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])') + link_state = re.search(r'\nState.*', output).group().split(":")[1].strip() + recommendation = re.search(r'Recommendation.*', output).group().split(":")[1].strip() + vendor_serial_num = re.search(r'Vendor Serial Number.*', output).group().split(":")[1].strip() + nic_fw_version = re.search(r'Firmware Version.*', output).group().split(":")[1].strip() + cable_fw_version = re.search(r'FW Version.*', output).group().split(":")[1].strip() + + # Remove hidden characters from the output + link_state = re.sub(color_pattern, '', link_state) + nic_fw_version = re.sub(color_pattern, '', nic_fw_version) + recommendation = re.sub(color_pattern, '', recommendation) + + logger.debug(f"{device}: {vendor_serial_num} - {cable_fw_version} - {nic_fw_version} - {link_state} - {recommendation}") + + # Extract the part after the ":" and print it along with the device name + if link_state != "Active": + logger.debug(f"{device}: {link_state}") + link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {link_state}") + status = False + if recommendation != "No issue was observed": + logger.debug(f"{device}: {recommendation}") + link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}") + status = False + else: + logger.debug(f"{device}: {recommendation}") + + if status: + logger.info(f"RDMA Link Status Check: Passed") + else: + logger.warning(f"RDMA Link Status Check: Failed") + return link_issues + +def get_host_serial(): + # Run the shell command + if not is_user_root(): + result = subprocess.run(['sudo', 'dmidecode', '-s', 'system-serial-number'], stdout=subprocess.PIPE) + else: + result = subprocess.run(['dmidecode', '-s', 'system-serial-number'], stdout=subprocess.PIPE) + + # Decode the 
output from bytes to string + output = result.stdout.decode('utf-8') + + # Return the serial number + return output.strip() + +def check_bus(): + # Check to see if any devices have fallen of the bus + command = ['lspci', '-v'] + result = subprocess.run(command, stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + lines = output.split('\n') + bus_issues = [] + for line in lines: + if line.find('(rev ff)') != -1: + bus_issues.append(line) + if len(bus_issues) > 0: + logger.error(f"Devices have fallen off the bus") + else: + logger.info("No devices have fallen off the bus") + if len(bus_issues) == 0: + logger.info("Bus Check Test: Passed") + return(bus_issues) + else: + logger.warning("Bus Check Test: Failed") + return(bus_issues) + +def check_gpu_count(): + + lspci_expected_results = [ '0f:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + '2d:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + '44:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + '5b:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + '89:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + 'a8:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + 'c0:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)', + 'd8:00.0 3D controller: NVIDIA Corporation Device 2330 (rev a1)' + ] + + # Check the number of GPUs + try: + result = subprocess.run(['nvidia-smi', '--list-gpus'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + lines = output.split('\n') + tmp_results = [] + # remove empty lines + lines = [line for line in lines if line] + if len(lines) == 8: + logger.info("GPU Count Test: Passed") + else: + logger.warning("GPU Count Test: Failed") + tmp_results.append(f"Expected 8 GPUs, found {len(lines)} using nvidia-smi command") + return tmp_results + + except FileNotFoundError: + try: + # Check if lspci is available + result = subprocess.run(['lspci', '-v'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + + # Check if the expected results are in the output + lines = output.split('\n') + tmp_results = [] + missing_gpus = [] + for line in lines: + if line.find("NVIDIA") != -1 and line.find("2330") != -1: + tmp_results.append(line) + if not len(tmp_results) == 8: + logger.debug(f"Expected 8 GPUs, found {len(tmp_results)} in lspci output") + for line in lspci_expected_results: + if line not in tmp_results: + missing_gpus.append(f"Missing GPU: {line}") + if len(tmp_results) == 8: + logger.info("GPU Count Test: Passed") + else: + logger.warning("GPU Count Test: Failed") + return missing_gpus + except FileNotFoundError: + logger.warning("Skipping GPU count test: nvidia-smi and lspci commands not found") + return None + +def slurm_reason(message): + global slurm_drain_reason + global slurm_error_count + slurm_drain_reason+=(message+"\n") + slurm_error_count+=1 + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Check H100 setup') + parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level default: INFO") + parser.add_argument('--bw-test', dest='bw_test', action='store_true', default=False, help='Run GPU bandwidth test (default: False)') + parser.add_argument('--bw-test-exe', dest='bw_test_exe', help='Location to cuda-sampels bandwidthTest') + parser.add_argument('--lf-interval', dest='lf_interval', default=6, type=int, help='Link flapping interval with no flapping or link down events (default: 6 
(hours))') + parser.add_argument('-a','--all', dest='run_all', action='store_true', default=False, help='Run all checks (default: False)') + parser.add_argument('-slurm','--slurm', dest='slurm', action='store_true', default=False, help='Add a Slurm message') + args = parser.parse_args() + + logger.setLevel(args.log_level) + + datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') + logger.info(f"Started H100 setup check at: {datetime_str}") + try: + oca_version = get_oca_version() + except Exception as e: + logger.warning(f"Failed to get Oracle Cloud Agent version with error: {e}") + oca_version = "Unknown" + try: + rttcc_issues = check_rttcc_status() + except Exception as e: + logger.warning(f"Failed to check RTTCC status with error: {e}") + rttcc_issues = [] + + # Check for ECC errors + try: + ecc_issues = check_ecc_errors() + except Exception as e: + logger.warning(f"Failed to check ECC errors with error: {e}") + ecc_issues = [] + + # Check for row remap errors + try: + remap_results = check_row_remap_errors() + except Exception as e: + logger.warning(f"Failed to check row remap errors with error: {e}") + remap_results = [] + + # Check RDMA link status + try: + rdma_link_issues = check_rdma_link_status() + except Exception as e: + logger.warning(f"Failed to check RDMA link status with error: {e}") + rdma_link_issues = [] + + # Check for RDMA link flapping + try: + lft = LinkFlappingTest(time_interval=args.lf_interval) + lft.get_rdma_link_failures() + lft_issues = lft.process_rdma_link_flapping() + except Exception as e: + logger.warning(f"Failed to check RDMA link flapping with error: {e}") + lft_issues = {"failures": [], "link_down": []} + + # Check for GPU Xid errors + try: + xc = XidChecker() + xid_results = xc.check_gpu_xid() + except Exception as e: + logger.warning(f"Failed to check GPU Xid errors with error: {e}") + xid_results = {"status": "None", "results": {}} + + # Check GPU bandwidth + bwt_results = None + try: + if args.bw_test == True or args.run_all == True: + if args.bw_test_exe: + bwt = BandwidthTest(bw_test_exe=args.bw_test_exe) + else: + bwt = BandwidthTest() + bwt.measure_gpu_bw() + bwt_results = bwt.validate_results() + except Exception as e: + logger.warning(f"Failed to check GPU bandwidth with error: {e}") + bwt_results = None + + # Check the bus + try: + bus_results = check_bus() + except Exception as e: + logger.warning(f"Failed to check the bus with error: {e}") + bus_results = None + + # Check the number of GPUs + try: + gpu_results = check_gpu_count() + except Exception as e: + logger.warning(f"Failed to check the number of GPUs with error: {e}") + gpu_results = None + + # Summarize the results + try: + host_serial = get_host_serial() + except Exception as e: + logger.warning(f"Failed to get host serial number with error: {e}") + host_serial = "Unknown" + + slurm_drain_reason = "" + slurm_error_count = 0 + + logger.info(f"--------- Summary of H100 setup check for {host_serial} ---------") + if oca_version < "1.39.0": + logger.error(f"Oracle Cloud Agent: {oca_version} needs to be updated to 1.39.0 or higher") + slurm_reason("OCA version Error") + if len(rttcc_issues) > 0: + logger.error(f"RTTCC issues: {rttcc_issues}") + slurm_reason("RTTCC Error") + if len(ecc_issues) > 0: + ecc_error=False + for issue in ecc_issues: + if "Skipped" in issue: + logger.warning(f"{host_serial} - {issue}") + else: + if "Aggregate" in issue: + logger.warning(f"{host_serial} - ECC issues: {issue}") + else: + logger.error(f"{host_serial} - ECC issues: {issue}") + ecc_error=True + 
if ecc_error: + slurm_reason("ECC Error") + if len(remap_results) > 0: + remap_error=False + for issue in remap_results: + if "<512" in issue: + logger.warning(f"{host_serial} - {issue}") + else: + logger.error(f"{host_serial} - {issue}") + remap_error=True + if remap_error: + slurm_reason("Remap Error") + if xid_results["status"] == "Failed": + for xid in xid_results["results"]: + for pci in xid_results["results"][xid]["results"]: + logger.error(f"{host_serial} - GPU Xid {xid} device: {pci}, {xid_results['results'][xid]['description']}") + slurm_reason("XID Error") + if len(rdma_link_issues) > 0: + for issue in rdma_link_issues: + logger.error(f"{host_serial} - RDMA link issues: {issue}") + slurm_reason("RDMA Link Error") + if len(lft_issues["failures"]) > 0 or len(lft_issues["link_down"]) > 0: + if len(lft_issues["failures"]) > 0: + for issue in lft_issues["failures"]: + logger.error(f"{host_serial} - RDMA link flapping issues: {issue}") + slurm_reason("RDMA Link Flapping Error") + if len(lft_issues["link_down"]) > 0: + for issue in lft_issues["link_down"]: + logger.error(f"{host_serial} - RDMA link down issues: {issue}") + slurm_reason("RDMA Link Down Error") + if bwt_results != None: + if bwt_results["status"] == "Failed": + for issue in bwt_results["issues"]: + logger.error(f"{host_serial} - GPU bandwidth issues: {issue}") + slurm_reason("GPU Bwt Error") + if bus_results: + logger.error(f"{host_serial} - Bus issues: {bus_results}") + slurm_reason("GPU Bus Error") + if gpu_results: + logger.error(f"{host_serial} - Missing GPU(s): {gpu_results}") + slurm_reason("Missing GPU Error") + + datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') + logger.info(f"Finished H100 setup check at: {datetime_str}") + + if slurm_error_count > 0 and args.slurm: + print("Healthcheck:: "+slurm_drain_reason[:-1]) \ No newline at end of file diff --git a/playbooks/roles/healthchecks/files/gpu_bw_test.py b/playbooks/roles/healthchecks/files/gpu_bw_test.py new file mode 100644 index 00000000..369556c3 --- /dev/null +++ b/playbooks/roles/healthchecks/files/gpu_bw_test.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 + +import argparse +import subprocess +import os +import socket +import time +import json +from shared_logging import logger +import re + + +class BandwidthTest: + def __init__(self, iteration=1, size=32000000, bw_test_exe="/opt/oci-hpc/cuda-samples/bin/x86_64/linux/release/bandwidthTest"): + self.iteration = iteration + self.size = size + self.bw_test_exe = bw_test_exe + self.results = None + self.dtoh_threshold = 52.0 + self.htod_threshold = 52.0 + + def get_numa_nodes(self): + result = subprocess.run(['numactl', '-H'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + filtered_output = [line for line in output.split('\n') if line.startswith('available:')] + return int(filtered_output[0].split()[1].strip()) + + def get_gpus(self): + result = subprocess.run(['nvidia-smi', '-L'], stdout=subprocess.PIPE) + output = result.stdout.decode('utf-8') + filtered_output = [line for line in output.split('\n') if line.startswith('GPU')] + return len(filtered_output) + + def measure_gpu_bw(self): + numas = 2 + gpus = 8 + iterations = 1 + size = "32000000" + + gpus = self.get_gpus() + numas = self.get_numa_nodes() + gpus_per_numa = gpus // numas + + logger.debug("GPUs: {}".format(gpus)) + logger.debug("NUMAs: {}".format(numas)) + logger.debug("GPUs per NUMA: {}".format(gpus_per_numa)) + + logger.debug("Iteration: Device: DtoH : HtoD") + hostname = socket.gethostname() + results = {"gpus": {}, 
"host": hostname} + + # Check if any processes are running on the GPUs before running the test + result = subprocess.run(["nvidia-smi", "-q", "-d", "PIDS"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + # Define the regular expression pattern for the GPU ID and the Processes + pattern = r'\nGPU\s(.*)\s+Processes\s+:\s+(.*)' + + # Find all matches in the output + matches = re.findall(pattern, result.stdout) + + # For each match, extract the GPU ID and the number of processes + gpu_idle_count = 0 + for match in matches: + gpu_id, processes = match + # If processes is 'None', set it to 0 + if processes == 'None': + gpu_idle_count += 1 + else: + logger.debug("GPU {} has processes running on it".format(gpu_id)) + + + logger.debug("GPU Idle Count: {}".format(gpu_idle_count)) + if gpu_idle_count != 8: + logger.error("GPU processes are running on the host. Please make sure no processes are running on the GPU before you re-test") + self.results = None + return self.results + + for i in range(iterations): + for device in range(gpus): + os.environ["CUDA_VISIBLE_DEVICES"] = str(device) + logger.debug("ENV: {}".format(os.environ["CUDA_VISIBLE_DEVICES"])) + logger.debug("Iteration: {} Device: {} gpus_per_numa: {}".format(i, device, gpus_per_numa)) + logger.debug("CMD: {}".format(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-dtoh"])) + result = subprocess.run(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-dtoh"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + logger.debug("Output: {}".format(result.stdout)) + logger.debug("Error: {}".format(result.stderr)) + if result.stdout.find(size) != -1: + result = result.stdout.split("\n") + tmp = [x for x in result if size in x] + tmp = tmp[0].split() + dtoh = float(tmp[1]) + + result = subprocess.run(["numactl", "-N" + str(device // gpus_per_numa), "-m" + str(device // gpus_per_numa), self.bw_test_exe, "-htod"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) + result = result.stdout.split("\n") + tmp = [x for x in result if size in x] + tmp = tmp[0].split() + htod = float(tmp[1]) + else: + dtoh = -1.0 + htod = -1.0 + + if device not in results["gpus"]: + results["gpus"][device] = {"dtoh": [dtoh], "htod": [htod]} + else: + results["gpus"][device]["dtoh"].append(dtoh) + results["gpus"][device]["htod"].append(htod) + + logger.debug(str(i) + " : " +str(device) + " : " + str(dtoh) + " : " + str(htod)) + + if i > 1 and i != iterations - 1: + # Sleep for 5 seconds and rerun + time.sleep(5) + + logger.debug(json.dumps(results)) + self.results = results + + def validate_results(self): + gpu_issues = {"status": "Passed", "issues": []} + if self.results == None: + gpu_issues["issues"].append("GPU bandwidth test did not run since processes are running on the GPU") + gpu_issues["status"] = "Failed" + return gpu_issues + status = True + for device in self.results["gpus"]: + dtoh = self.results["gpus"][device]["dtoh"] + htod = self.results["gpus"][device]["htod"] + dtoh_avg = sum(dtoh) / len(dtoh) + htod_avg = sum(htod) / len(htod) + logger.debug("Device: {} DtoH: {} HtoD: {}".format(device, dtoh_avg, htod_avg)) + if dtoh_avg < self.dtoh_threshold: + logger.debug("Device: {} DtoH: {} is below threshold: {}".format(device, dtoh_avg, self.dtoh_threshold)) + gpu_issues["issues"].append("Device: {} DtoH: {} is below threshold: {}".format(device, dtoh_avg, self.dtoh_threshold)) + 
gpu_issues["status"] = "Failed" + if htod_avg < self.htod_threshold: + logger.debug("Device: {} HtoD: {} is below threshold: {}".format(device, htod_avg, self.htod_threshold)) + gpu_issues["issues"].append("Device: {} HtoD: {} is below threshold: {}".format(device, htod_avg, self.htod_threshold)) + gpu_issues["status"] = "Failed" + if gpu_issues["status"] == "Passed": + logger.info("GPU bandwidth test passed") + return gpu_issues + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Run GPU bandwidth test') + parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level default: INFO") + parser.add_argument('-i', dest='iterations', default='1', help='Number of iterations to run Ex. -i 3') + parser.add_argument('-s', dest='size', default='32000000', help='Message size to run Ex. -s 32000000') + parser.add_argument('--bw-test-exe', dest='bw_test_exe', default='/opt/oci-hpc/cuda-samples/bin/x86_64/linux/release/bandwidthTest', help='Path to the bw_test executable') + args = parser.parse_args() + + logger.setLevel(args.log_level) + if args.iterations != 'NONE': + iterations = int(args.iterations) + if args.size != 'NONE': + size = args.size + if args.bw_test_exe != 'NONE': + bw_test_exe = args.bw_test_exe + + bwt = BandwidthTest(iteration=iterations, size=size, bw_test_exe=bw_test_exe) + bwt.measure_gpu_bw() + bwt_results = bwt.validate_results() + if bwt_results["status"] == "Passed": + logger.info("GPU bandwidth test passed") + else: + logger.error("GPU bandwidth test failed") + for issue in bwt_results["issues"]: + logger.error(issue) + diff --git a/playbooks/roles/healthchecks/files/rdma_link_flapping.py b/playbooks/roles/healthchecks/files/rdma_link_flapping.py new file mode 100644 index 00000000..425ec54e --- /dev/null +++ b/playbooks/roles/healthchecks/files/rdma_link_flapping.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 + +import os +import sys +import time +import datetime +import re +import argparse +import socket +import subprocess +from shared_logging import logger + + +class LinkFlappingTest: + def __init__(self, time_interval=6): + self.results = None + self.time_interval = int(time_interval) + self.link_data = None + + + # Check if the log file exists + msg_file = "/var/log/messages" + if not os.path.exists(msg_file): + msg_file = "/var/log/syslog" + self.log_file = msg_file + + def get_rdma_link_failures(self): + + pattern = r"(\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+\S+\s+wpa_supplicant(?:\[\d+\])?: (\w+): CTRL-EVENT-EAP-FAILURE EAP authentication failed" + pattern2 = r"(\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2})\s+\S+\s+kernel: (?:\[\d+\.\d+\]\s)?mlx5_core \S+ (\w+): Link down" + + self.link_data = {} + with open(self.log_file, "r") as f: + for line in f: + match = re.search(pattern, line) + if match: + time_str = match.group(1) + interface = match.group(2) + logger.debug(f"time: {time_str}, interface: {interface}") + if interface not in self.link_data: + self.link_data[interface] = {"failures": [time_str], "link_down": []} + else: + self.link_data[interface]["failures"].append(time_str) + + + match = re.search(pattern2, line) + if match: + time_str = match.group(1) + interface = match.group(2) + logger.debug(f"time: {time_str}, interface: {interface}") + if interface not in self.link_data: + self.link_data[interface] = {"failures": [], "link_down": [time_str]} + else: + self.link_data[interface]["link_down"].append(time_str) + + logger.debug("Link Data: 
{}".format(self.link_data)) + return self.link_data + + def process_rdma_link_flapping(self): + + link_issues = {"failures": [], "link_down": []} + + # Get the time stamp when the host came up + bootup_time = subprocess.run(['uptime', '-s'], stdout=subprocess.PIPE) + bootup_time = bootup_time.stdout.decode('utf-8').strip() + bootup_time_str = datetime.datetime.strptime(bootup_time, "%Y-%m-%d %H:%M:%S") + bootup_time_sec = int(time.mktime(bootup_time_str.timetuple())) + bootup_time_grace_period = bootup_time_sec + 1800 + + status = 0 + if len(self.link_data) >= 0: + current_date = datetime.datetime.now() + current_date_str = current_date.strftime("%Y-%b-%d %H:%M:%S") + current_date_sec = int(time.mktime(datetime.datetime.strptime(current_date_str, "%Y-%b-%d %H:%M:%S").timetuple())) + + link_failures = False + for interface in self.link_data: + if len(self.link_data[interface]["failures"]) > 0: + link_failures = True + logger.debug(f"{interface}: {len(self.link_data[interface]['failures'])} RDMA link failure entries in {self.log_file}") + logger.debug(f"{interface}: {self.link_data[interface]['failures']}") + last_date_failure_str = None + + if len(self.link_data[interface]["failures"]) > 0: + last_date_failure_str = self.link_data[interface]["failures"][-1] + last_date_failure = datetime.datetime.strptime(last_date_failure_str, "%b %d %H:%M:%S") + + # Compare the month of the last failure date with the current month + if last_date_failure.month > current_date.month: + # If the last failure month is greater than the current month, subtract one from the current year + last_date_failure = last_date_failure.replace(year=current_date.year - 1) + else: + # Otherwise, set the year of the last failure date to the current year + last_date_failure = last_date_failure.replace(year=current_date.year) + + # Convert the last failure date to seconds since the epoch + last_date_failure_sec = int(time.mktime(last_date_failure.timetuple())) + + if last_date_failure_str != None and last_date_failure_str != current_date_str: + diff_secs = current_date_sec - last_date_failure_sec + diff_hours = diff_secs // (60 * 60) + logger.debug(f"RDMA link ({interface}) failed {diff_hours} hours ago") + + logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") + if diff_hours < self.time_interval and current_date_sec > bootup_time_grace_period: + logger.debug(f"{interface}: one or more RDMA link flapping events within {self.time_interval} hours. 
Last flapping event: {last_date_failure_str})") + link_issues["failures"].append(f"{interface}: {len(self.link_data[interface]['failures'])}") + status = -1 + + for interface in self.link_data: + if len(self.link_data[interface]["link_down"]) > 0: + logger.debug(f"{interface}: {len(self.link_data[interface]['link_down'])} RDMA link down entries in {self.log_file}") + logger.debug(f"{interface}: {self.link_data[interface]['link_down']}") + last_date_down_str = None + + if len(self.link_data[interface]["link_down"]) > 0: + last_date_down_str = self.link_data[interface]["link_down"][-1] + last_date_down = datetime.datetime.strptime(last_date_down_str, "%b %d %H:%M:%S") + + # Compare the month of the last failure date with the current month + if last_date_down.month > current_date.month: + # If the last failure month is greater than the current month, subtract one from the current year + last_date_down = last_date_down.replace(year=current_date.year - 1) + else: + # Otherwise, set the year of the last failure date to the current year + last_date_down = last_date_down.replace(year=current_date.year) + + # Convert the last failure date to seconds since the epoch + last_date_down_sec = int(time.mktime(last_date_down.timetuple())) + + + if last_date_down_str != None and last_date_down_str != current_date_str: + diff_secs = current_date_sec - last_date_down_sec + diff_hours = diff_secs // (60 * 60) + logger.debug(f"RDMA link ({interface}) down {diff_hours} hours ago") + + logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") + if diff_hours < self.time_interval and current_date_sec > bootup_time_grace_period: + logger.debug(f"{interface}, one or more RDMA link down events within {self.time_interval} hours. 
Last link down event: {last_date_down_str}") + link_issues["link_down"].append(f"{interface}: {len(self.link_data[interface]['link_down'])}") + status = -2 + if status == -1: + logger.debug(f"One or more RDMA link flapping events within the past {self.time_interval} hours") + if status == -2: + logger.debug(f"One or more RDMA link down events within the past {self.time_interval} hours") + + else: + logger.info("No RDMA link failures entry in /var/log/messages") + if status == 0: + logger.info("RDMA link flapping/down test: Passed") + else: + logger.warning("RDMA link flapping/down test: Failed") + return link_issues + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Process RDMA link flapping data") + parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level") + args = parser.parse_args() + + logger.setLevel(args.log_level) + + auth_failure_file = "/tmp/last_auth_failure_date" + msg_file = "/var/log/messages" + if not os.path.exists(msg_file): + msg_file = "/var/log/syslog" + time_interval_hours = 6 + lft = LinkFlappingTest(time_interval=time_interval_hours) + link_data = lft.get_rdma_link_failures() + lft.process_rdma_link_flapping() diff --git a/playbooks/roles/healthchecks/files/shared_logging.py b/playbooks/roles/healthchecks/files/shared_logging.py new file mode 100644 index 00000000..af87bc2d --- /dev/null +++ b/playbooks/roles/healthchecks/files/shared_logging.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python3 + +import logging +logging.basicConfig(level="INFO", format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger('nhc') diff --git a/playbooks/roles/healthchecks/files/xid_checker.py b/playbooks/roles/healthchecks/files/xid_checker.py new file mode 100644 index 00000000..eaa6360b --- /dev/null +++ b/playbooks/roles/healthchecks/files/xid_checker.py @@ -0,0 +1,200 @@ +#!/usr/bin/env python3 + +import argparse +from shared_logging import logger +import subprocess +import sys +import re + +class XidChecker: + def __init__(self, dmesg_cmd="dmesg", time_interval=60): + self.dmesg_cmd = dmesg_cmd + self.results = {} + + + # Check for the following GPU Xid errors in dmesg + self.XID_EC = { + "1": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "2": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "3": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "4": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "5": {"description": "Unused", "severity": "Critical"}, + "6": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "7": {"description": "Invalid or corrupted push buffer address", "severity": "Critical"}, + "8": {"description": "GPU stopped processing", "severity": "Critical"}, + "9": {"description": "Driver error programming GPU", "severity": "Critical"}, + "10": {"description": "Unused", "severity": "Critical"}, + "11": {"description": "Invalid or corrupted push buffer stream", "severity": "Critical"}, + "12": {"description": "Driver error handling GPU exception", "severity": "Critical"}, + "13": {"description": "Graphics Engine Exception", "severity": "Critical"}, + "14": {"description": "Unused", "severity": "Warn"}, + "15": {"description": "Unused", "severity": "Warn"}, + "16": {"description": "Display engine hung", "severity": "Warn"}, + "17": {"description": "Unused", 
"severity": "Warn"}, + "18": {"description": "Bus mastering disabled in PCI Config Space", "severity": "Warn"}, + "19": {"description": "Display Engine error", "severity": "Warn"}, + "20": {"description": "Invalid or corrupted Mpeg push buffer", "severity": "Warn"}, + "21": {"description": "Invalid or corrupted Motion Estimation push buffer", "severity": "Warn"}, + "22": {"description": "Invalid or corrupted Video Processor push buffer", "severity": "Warn"}, + "23": {"description": "Unused", "severity": "Warn"}, + "24": {"description": "GPU semaphore timeout", "severity": "Warn"}, + "25": {"description": "Invalid or illegal push buffer stream", "severity": "Warn"}, + "26": {"description": "Framebuffer timeout", "severity": "Warn"}, + "27": {"description": "Video processor exception", "severity": "Warn"}, + "28": {"description": "Video processor exception", "severity": "Warn"}, + "29": {"description": "Video processor exception", "severity": "Warn"}, + "30": {"description": "GPU semaphore access error", "severity": "Warn"}, + "31": {"description": "GPU memory page fault", "severity": "Critical"}, + "32": {"description": "Invalid or corrupted push buffer stream", "severity": "Warn"}, + "33": {"description": "Internal micro-controller error", "severity": "Warn"}, + "34": {"description": "Video processor exception", "severity": "Warn"}, + "35": {"description": "Video processor exception", "severity": "Warn"}, + "36": {"description": "Video processor exception", "severity": "Warn"}, + "37": {"description": "Driver firmware error", "severity": "Warn"}, + "38": {"description": "Driver firmware error", "severity": "Warn"}, + "39": {"description": "Unused", "severity": "Warn"}, + "40": {"description": "Unused", "severity": "Warn"}, + "41": {"description": "Unused", "severity": "Warn"}, + "42": {"description": "Video processor exception", "severity": "Warn"}, + "43": {"description": "GPU stopped processing", "severity": "Warn"}, + "44": {"description": "Graphics Engine fault during context switch", "severity": "Warn"}, + "45": {"description": "Preemptive cleanup, due to previous errors -- Most likely to see when running multiple cuda applications and hitting a DBE", "severity": "Warn"}, + "46": {"description": "GPU stopped processing", "severity": "Warn"}, + "47": {"description": "Video processor exception", "severity": "Warn"}, + "48": {"description": "Double Bit ECC Error", "severity": "Critical"}, + "49": {"description": "Unused", "severity": "Warn"}, + "50": {"description": "Unused", "severity": "Warn"}, + "51": {"description": "Unused", "severity": "Warn"}, + "52": {"description": "Unused", "severity": "Warn"}, + "53": {"description": "Unused", "severity": "Warn"}, + "54": {"description": "Auxiliary power is not connected to the GPU board", "severity": "Warn"}, + "55": {"description": "Unused", "severity": "Warn"}, + "56": {"description": "Display Engine error", "severity": "Critical"}, + "57": {"description": "Error programming video memory interface", "severity": "Critical"}, + "58": {"description": "Unstable video memory interface detected", "severity": "Critical"}, + "59": {"description": "Internal micro-controller error (older drivers)", "severity": "Warn"}, + "60": {"description": "Video processor exception", "severity": "Warn"}, + "61": {"description": "Internal micro-controller breakpoint/warning (newer drivers)", "severity": "Warn"}, + "62": {"description": "Internal micro-controller halt", "severity": "Critical"}, + "63": {"description": "ECC page retirement or row remapping recording 
event", "severity": "Critical"}, + "64": {"description": "ECC page retirement or row remapper recording failure", "severity": "Critical"}, + "65": {"description": "Video processor exception", "severity": "Critical"}, + "66": {"description": "Illegal access by driver", "severity": "Warn"}, + "67": {"description": "Illegal access by driver", "severity": "Warn"}, + "68": {"description": "NVDEC0 Exception", "severity": "Critical"}, + "69": {"description": "Graphics Engine class error", "severity": "Critical"}, + "70": {"description": "CE3: Unknown Error", "severity": "Warn"}, + "71": {"description": "CE4: Unknown Error", "severity": "Warn"}, + "72": {"description": "CE5: Unknown Error", "severity": "Warn"}, + "73": {"description": "NVENC2 Error", "severity": "Critical"}, + "74": {"description": "NVLINK Error", "severity": "Critical"}, + "75": {"description": "CE6: Unknown Error", "severity": "Warn"}, + "76": {"description": "CE7: Unknown Error", "severity": "Warn"}, + "77": {"description": "CE8: Unknown Error", "severity": "Warn"}, + "78": {"description": "vGPU Start Error", "severity": "Warn"}, + "79": {"description": "GPU has fallen off the bus", "severity": "Critical"}, + "80": {"description": "Corrupted data sent to GPU", "severity": "Critical"}, + "81": {"description": "VGA Subsystem Error", "severity": "Critical"}, + "82": {"description": "NVJPGO Error", "severity": "Warn"}, + "83": {"description": "NVDEC1 Error", "severity": "Warn"}, + "84": {"description": "NVDEC2 Error", "severity": "Warn"}, + "85": {"description": "CE9: Unknown Error", "severity": "Warn"}, + "86": {"description": "OFA Exception", "severity": "Warn"}, + "87": {"description": "Reserved", "severity": "Warn"}, + "88": {"description": "NVDEC3 Error", "severity": "Warn"}, + "89": {"description": "NVDEC4 Error", "severity": "Warn"}, + "90": {"description": "Reserved", "severity": "Warn"}, + "91": {"description": "Reserved", "severity": "Warn"}, + "92": {"description": "High single-bit ECC error rate", "severity": "Critical"}, + "93": {"description": "Non-fatal violation of provisioned InfoROM wear limit", "severity": "Warn"}, + "94": {"description": "Contained ECC error", "severity": "Critical"}, + "95": {"description": "Uncontained ECC error", "severity": "Critical"}, + "96": {"description": "NVDEC5 Error", "severity": "Warn"}, + "97": {"description": "NVDEC6 Error", "severity": "Warn"}, + "98": {"description": "NVDEC7 Error", "severity": "Warn"}, + "99": {"description": "NVJPG1 Error", "severity": "Warn"}, + "100": {"description": "NVJPG2 Error", "severity": "Warn"}, + "101": {"description": "NVJPG3 Error", "severity": "Warn"}, + "102": {"description": "NVJPG4 Error", "severity": "Warn"}, + "103": {"description": "NVJPG5 Error", "severity": "Warn"}, + "104": {"description": "NVJPG6 Error", "severity": "Warn"}, + "105": {"description": "NVJPG7 Error", "severity": "Warn"}, + "106": {"description": "SMBPBI Test Message", "severity": "Warn"}, + "107": {"description": "SMBPBI Test Message Silent", "severity": "Warn"}, + "108": {"description": "Reserved", "severity": "Warn"}, + "109": {"description": "Context Switch Timeout Error", "severity": "Critical"}, + "110": {"description": "Security Fault Error", "severity": "Warn"}, + "111": {"description": "Display Bundle Error Event", "severity": "Warn"}, + "112": {"description": "Display Supervisor Error", "severity": "Warn"}, + "113": {"description": "DP Link Training Error", "severity": "Warn"}, + "114": {"description": "Display Pipeline Underflow Error", "severity": "Warn"}, + 
"115": {"description": "Display Core Channel Error", "severity": "Warn"}, + "116": {"description": "Display Window Channel Error", "severity": "Warn"}, + "117": {"description": "Display Cursor Channel Error", "severity": "Warn"}, + "118": {"description": "Display Pixel Pipeline Error", "severity": "Warn"}, + "119": {"description": "GSP RPC Timeout", "severity": "Critical"}, + "120": {"description": "GSP Error", "severity": "Critical"}, + "121": {"description": "C2C Link Error", "severity": "Critical"}, + "122": {"description": "SPI PMU RPC Read Failure", "severity": "Warn"}, + "123": {"description": "SPI PMU RPC Write Failure", "severity": "Warn"}, + "124": {"description": "SPI PMU RPC Erase Failure", "severity": "Warn"}, + "125": {"description": "Inforom FS Failure", "severity": "Warn"}, + "126": {"description": "Reserved", "severity": "Warn"}, + "127": {"description": "Reserved", "severity": "Warn"}, + "128": {"description": "Reserved", "severity": "Warn"}, + "129": {"description": "Reserved", "severity": "Warn"}, + "130": {"description": "Reserved", "severity": "Warn"}, + "131": {"description": "Reserved", "severity": "Warn"}, + "132": {"description": "Reserved", "severity": "Warn"}, + "133": {"description": "Reserved", "severity": "Warn"}, + "134": {"description": "Reserved", "severity": "Warn"}, + "135": {"description": "Reserved", "severity": "Warn"}, + "136": {"description": "Reserved", "severity": "Warn"}, + "137": {"description": "Reserved", "severity": "Warn"}, + "138": {"description": "Reserved", "severity": "Warn"}, + "139": {"description": "Reserved", "severity": "Warn"}, + "140": {"description": "Unrecovered ECC Error", "severity": "Warn"}, + "141": {"description": "Reserved", "severity": "Warn"}, + "142": {"description": "Reserved", "severity": "Warn"}, + "143": {"description": "GPU Initialization Failure", "severity": "Warn"} + } + + def check_gpu_xid(self): + status = "Pass" + dmesg_output = subprocess.check_output([self.dmesg_cmd]).decode("utf-8") + if "NVRM: Xid" in dmesg_output: + for XID in self.XID_EC.keys(): + logger.debug(f"Checking for GPU Xid {XID} error in dmesg") + + matches = re.findall(f"NVRM: Xid \(PCI:(.*?): {XID},", dmesg_output) + tmp_dict = {} + for match in matches: + if match not in tmp_dict: + tmp_dict[match] = 1 + else: + tmp_dict[match] = tmp_dict[match] + 1 + for x in tmp_dict.keys(): + logger.info(f"{XID} : count: {tmp_dict[x]}, {self.XID_EC[XID]['description']} - PCI: {x}") + if not matches: + logger.debug(f"No GPU Xid {XID} error found in dmesg") + if tmp_dict != {}: + if self.XID_EC[XID]['severity'] == "Critical": + status = "Failed" + self.results[XID] = {"results": tmp_dict, "description": self.XID_EC[XID]['description']} + else: + logger.info("Xid Check: Passed") + return {"status": status, "results": self.results} + + +if __name__ == '__main__': + # Argument parsing + parser = argparse.ArgumentParser(description='Check for GPU Xid errors.') + parser.add_argument('--dmesg_cmd', default='dmesg', help='Dmesg file to check. 
Default is dmesg.') + args = parser.parse_args() + + + logger.debug(f"Using dmesg command: {args.dmesg_cmd}") + + xc = XidChecker(dmesg_cmd=args.dmesg_cmd) + results = xc.check_gpu_xid() + logger.debug("Status: {}, Results: {}".format(results["status"], results["results"])) diff --git a/playbooks/roles/healthchecks/tasks/main.yml b/playbooks/roles/healthchecks/tasks/main.yml new file mode 100755 index 00000000..7ed13754 --- /dev/null +++ b/playbooks/roles/healthchecks/tasks/main.yml @@ -0,0 +1,22 @@ +- name: Create systemd unit dirs + become: true + file: + name: '/opt/oci-hpc/healthchecks' + state: directory + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' + +- name: Copy files + become: true + copy: + src: '{{ item }}' + dest: '/opt/oci-hpc/healthchecks/{{ item }}' + force: no + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' + with_items: + - check_h100_setup.py + - gpu_bw_test.py + - rdma_link_flapping.py + - xid_checker.py + - shared_logging.py \ No newline at end of file diff --git a/playbooks/roles/slurm/files/healthchecks.sh b/playbooks/roles/slurm/files/healthchecks.sh new file mode 100644 index 00000000..a2a9cec0 --- /dev/null +++ b/playbooks/roles/slurm/files/healthchecks.sh @@ -0,0 +1,11 @@ +#!/bin/sh +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ "${shape}" = \"BM.GPU.H100.8\" ] +then + sudo python3 /opt/oci-hpc/healthchecks/check_h100_setup.py --slurm > /tmp/latest_healthcheck.log 2>&1 + DRAIN_MSG=`cat /tmp/latest_healthcheck.log | grep "Healthcheck::"` + if [ "$DRAIN_MSG" != "" ] + then + scontrol update nodename=`hostname` state=drain reason="${DRAIN_MSG}" + fi +fi diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 9f0d0729..755bc51f 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -192,4 +192,22 @@ - name: Include pyxis prolog files include_tasks: common_pmix.yml - when: ansible_os_family == 'RedHat' \ No newline at end of file + when: ansible_os_family == 'RedHat' + +- name: Ensure prolog directory exists + become: true + file: + path: "{{ slurm_conf_path }}/prolog.d" + state: directory + owner: root + group: root + when: healthchecks|bool + +- name: copy healthchecks + become: true + copy: + src: healthchecks.sh + dest: "{{ slurm_conf_path }}/prolog.d/healthchecks.sh" + owner: root + group: root + mode: '0755' \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/common_pyxis.yml b/playbooks/roles/slurm/tasks/common_pyxis.yml index a200ad54..dbad6c54 100644 --- a/playbooks/roles/slurm/tasks/common_pyxis.yml +++ b/playbooks/roles/slurm/tasks/common_pyxis.yml @@ -7,13 +7,21 @@ - set_fact: enroot_top_path_checked: "{{enroot_top_path}}" when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + +- name: Ensure prolog directory exists + become: true + file: + path: "{{ slurm_conf_path }}/prolog.d" + state: directory + owner: root + group: root - name: copy files become: true become_method: sudo template: - src: prolog.sh.j2 - dest: "{{ slurm_conf_path }}/prolog.sh" + src: pyxis.sh.j2 + dest: "{{ slurm_conf_path }}/prolog.d/pyxis.sh" owner: root group: root mode: 0755 diff --git a/playbooks/roles/slurm/templates/prolog.sh.j2 b/playbooks/roles/slurm/templates/pyxis.sh.j2 similarity index 100% rename from playbooks/roles/slurm/templates/prolog.sh.j2 rename to playbooks/roles/slurm/templates/pyxis.sh.j2 diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 
b/playbooks/roles/slurm/templates/slurm.conf.j2 index 3bb57372..19bf5502 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -12,8 +12,8 @@ SlurmdPidFile=/var/run/slurmd.pid SlurmdPort=6818 SlurmdSpoolDir=/var/spool/slurmd SlurmUser=slurm -{% if pyxis|bool %} -Prolog=/etc/slurm/prolog.sh +{% if pyxis|bool or healthchecks|bool%} +Prolog=/etc/slurm/prolog.d/* {% endif %} SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdLogFile=/var/log/slurm/slurmd.log diff --git a/playbooks/site.yml b/playbooks/site.yml index e869fd36..72cfcca0 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -64,6 +64,8 @@ when: cluster_network|bool and not use_compute_agent|default(false)|bool - include_role: name: nvidia_peermem + - include_role: + name: healthchecks - hosts: controller become: true diff --git a/schema.yaml b/schema.yaml index e48577fd..033117fd 100755 --- a/schema.yaml +++ b/schema.yaml @@ -181,6 +181,7 @@ variableGroups: - ${pyxis} - ${pam} - ${sacct_limits} + - ${healthchecks} - title: "Hidden" variables: @@ -1196,6 +1197,13 @@ variables: description: "Enable Limits for the Slurm cluster When enabled, users will not be able to submit jobs if the right limits are not set" visible: ${slurm} + healthchecks: + type: boolean + title: "Turn on Healthchecks for GPU nodes" + default: true + description: "Will run tests on GPU nodes before starting a job. Nodes that are showing issues will be set in drain state" + visible: ${slurm} + monitoring: type: boolean title: "Install HPC Cluster Monitoring Tools" diff --git a/slurm_ha.tf b/slurm_ha.tf index 36dc60db..b6d9f72a 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -251,7 +251,8 @@ resource "null_resource" "cluster_backup" { region = var.region, tenancy_ocid = var.tenancy_ocid, api_fingerprint = var.api_fingerprint, - api_user_ocid = var.api_user_ocid + api_user_ocid = var.api_user_ocid, + healthchecks = var.healthchecks }) destination = "/opt/oci-hpc/playbooks/inventory" @@ -399,7 +400,8 @@ resource "null_resource" "cluster_backup" { virt_instr = var.virt_instr, access_ctrl = var.access_ctrl, numa_nodes_per_socket = var.numa_nodes_per_socket, - percentage_of_cores_enabled = var.percentage_of_cores_enabled + percentage_of_cores_enabled = var.percentage_of_cores_enabled, + healthchecks = var.healthchecks }) destination = "/opt/oci-hpc/conf/variables.tf" diff --git a/variables.tf b/variables.tf index 0cc7e5df..2dd46e16 100755 --- a/variables.tf +++ b/variables.tf @@ -261,7 +261,9 @@ variable "zone_name" { variable "dns_entries" { default = true } - +variable "healthchecks" { + default = true +} variable "BIOS" { default = false } From fef6533274904b494c8f4842c0fbab3171ed7fae Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:30:43 -0600 Subject: [PATCH 09/36] Add healthcheck every 5 minutes on all idle nodes --- playbooks/roles/slurm/templates/slurm.conf.j2 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 19bf5502..0ea9259f 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -46,7 +46,11 @@ TopologyPlugin=topology/tree TopologyParam=SwitchAsNodeRank TreeWidth=2048 SlurmctldParameters=enable_configless - +{% if healthchecks|bool %} +HealthCheckProgram=/etc/slurm/prolog.d/healthchecks.sh +HealthCheckInterval=300 
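For reference, the HealthCheckProgram wired up here is the healthchecks.sh prolog added earlier in this series: every 300 seconds Slurm runs it on eligible nodes, the script invokes the GPU setup checker (check_h100_setup.py at this point, renamed to check_gpu_setup.py later in the series) with --slurm, and the node is drained whenever the checker prints a "Healthcheck::" line. A minimal Python sketch of that run-and-drain flow, assuming the /opt/oci-hpc/healthchecks install path and the /tmp/latest_healthcheck.log file used by the shell script; the shape guard from healthchecks.sh is omitted, and this is an illustration, not part of the patch:

    #!/usr/bin/env python3
    # Sketch of the periodic healthcheck flow configured above; not part of the patch.
    import socket
    import subprocess

    CHECKER = "/opt/oci-hpc/healthchecks/check_gpu_setup.py"   # path after the later rename
    LOG = "/tmp/latest_healthcheck.log"                        # same log file as healthchecks.sh

    def run_healthcheck_and_maybe_drain():
        # Run the GPU setup check and keep its full output for later inspection.
        with open(LOG, "w") as log:
            subprocess.run(["sudo", "python3", CHECKER, "--slurm"],
                           stdout=log, stderr=subprocess.STDOUT)
        # The checker prints "Healthcheck:: <reasons>" only when the node should be drained.
        with open(LOG) as log:
            reasons = [line.strip() for line in log if line.startswith("Healthcheck::")]
        if reasons:
            subprocess.run(["scontrol", "update", "nodename=" + socket.gethostname(),
                            "state=drain", "reason=" + reasons[0]])

    if __name__ == "__main__":
        run_healthcheck_and_maybe_drain()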
+HealthCheckNodeState=NONDRAINED_IDLE,CYCLE +{% endif %} {% if sacct_limits|bool %} AccountingStorageTRES=gres/gpu AccountingStorageEnforce=limits,associations,qos,safe From e3418121958c48b16ce3dcd7ac4bd1e7dc620fe2 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:01:26 -0600 Subject: [PATCH 10/36] Fix SRAM errors on drivers 535.161.07 and above --- playbooks/roles/healthchecks/files/check_h100_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/healthchecks/files/check_h100_setup.py b/playbooks/roles/healthchecks/files/check_h100_setup.py index 99b0498d..408e9f8a 100644 --- a/playbooks/roles/healthchecks/files/check_h100_setup.py +++ b/playbooks/roles/healthchecks/files/check_h100_setup.py @@ -114,7 +114,7 @@ def check_ecc_errors(): # Find the lines containing "SRAM Correctable" and "DRAM Correctable" sram_matches = re.findall(r'SRAM Uncorrectable\s+:\s+(\d+)', output) if len(sram_matches)==0: - sram_matches = re.findall(r'SRAM Uncorrectable SEC-DED\s+:\s+(\d+)', output) + sram_matches = re.findall(r'SRAM Uncorrectable Parity\s+:\s+(\d+)', output) dram_matches = re.findall(r'DRAM Uncorrectable\s+:\s+(\d+)', output) gpu_matches = re.findall(r'\nGPU\s+(.*)\n', output) vol_sram_line = sram_matches[0::2] From 90a1fb063f83721796324f444678009749f8bd0e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:42:40 -0600 Subject: [PATCH 11/36] Remove old images versions --- conf/variables.tpl | 8 +------- variables.tf | 6 +----- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/conf/variables.tpl b/conf/variables.tpl index c8bde956..d8b0047f 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -53,10 +53,6 @@ variable "marketplace_listing" { variable "marketplace_version_id" { type = map(string) default = { - "1" = "OL7.9-OFED5.3-1.0.0.1-RHCK-20210607" - "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" - "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" - "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" "GPU_OL7_CUDA12.2" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" @@ -69,9 +65,7 @@ variable "marketplace_version_id" { # To find the Appcatalog OCID, run # oci compute pic listing list --display-name "Oracle Linux 7 - HPC Cluster Networking Image" -variable "old_marketplace_listing_id" { - default = "ocid1.appcataloglisting.oc1..aaaaaaaahzcnanlki5vonyaeoiajjisejikzczygqqwheifymjqx3ft4iowa" -} + variable "marketplace_listing_id_HPC" { default = "ocid1.appcataloglisting.oc1..aaaaaaaahz2xiwfcsbebmqg7sp6lhdt6r2vsjro5jfukkl5cntlqvfhkbzaq" } diff --git a/variables.tf b/variables.tf index 2dd46e16..c6067f23 100755 --- a/variables.tf +++ b/variables.tf @@ -86,11 +86,7 @@ variable "marketplace_listing" { } variable "marketplace_version_id" { type = map(string) - default = { - "1" = "OL7.9-OFED5.3-1.0.0.1-RHCK-20210607" - "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" - "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" - "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" + default = { "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" "GPU_OL7_CUDA12.2" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" From 936e11e36b71fa10e2d2011eaa411443c46259c3 Mon Sep 17 
00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:44:21 -0600 Subject: [PATCH 12/36] Fix RTTCC Check on A100 Blocks --- playbooks/roles/healthchecks/files/check_h100_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/healthchecks/files/check_h100_setup.py b/playbooks/roles/healthchecks/files/check_h100_setup.py index 408e9f8a..d61e8b6e 100644 --- a/playbooks/roles/healthchecks/files/check_h100_setup.py +++ b/playbooks/roles/healthchecks/files/check_h100_setup.py @@ -77,7 +77,7 @@ def check_rttcc_status(): command = ['sudo', 'mlxreg', '-d', device, '-y', '--get', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] else: command = ['mlxreg', '-d', device, '-y', '--set', 'cmd_type=3', '--reg_name=PPCC', '--indexes=local_port=1,pnat=0,lp_msb=0,algo_slot=0,algo_param_index=0'] - result = subprocess.run(command, stdout=subprocess.PIPE) + result = subprocess.run(command, stdout=subprocess.PIPE,stderr=subprocess.PIPE) output = result.stdout.decode('utf-8') filtered_output = [line for line in output.split('\n') if line.startswith('value')] for line in filtered_output: From 42fc627cd72ae82c3aefca5de2f98dabf7dea163 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 11:49:14 -0600 Subject: [PATCH 13/36] Make healthchecks check valid for A100 --- .../files/{check_h100_setup.py => check_gpu_setup.py} | 6 +++--- playbooks/roles/healthchecks/tasks/main.yml | 2 +- playbooks/roles/slurm/files/healthchecks.sh | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) rename playbooks/roles/healthchecks/files/{check_h100_setup.py => check_gpu_setup.py} (99%) diff --git a/playbooks/roles/healthchecks/files/check_h100_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py similarity index 99% rename from playbooks/roles/healthchecks/files/check_h100_setup.py rename to playbooks/roles/healthchecks/files/check_gpu_setup.py index d61e8b6e..eb904a9f 100644 --- a/playbooks/roles/healthchecks/files/check_h100_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -340,7 +340,7 @@ def slurm_reason(message): slurm_error_count+=1 if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Check H100 setup') + parser = argparse.ArgumentParser(description='Check Host setup') parser.add_argument("-l", "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], default="INFO", help="Set the logging level default: INFO") parser.add_argument('--bw-test', dest='bw_test', action='store_true', default=False, help='Run GPU bandwidth test (default: False)') parser.add_argument('--bw-test-exe', dest='bw_test_exe', help='Location to cuda-sampels bandwidthTest') @@ -352,7 +352,7 @@ def slurm_reason(message): logger.setLevel(args.log_level) datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') - logger.info(f"Started H100 setup check at: {datetime_str}") + logger.info(f"Started GPU host setup check at: {datetime_str}") try: oca_version = get_oca_version() except Exception as e: @@ -440,7 +440,7 @@ def slurm_reason(message): slurm_drain_reason = "" slurm_error_count = 0 - logger.info(f"--------- Summary of H100 setup check for {host_serial} ---------") + logger.info(f"--------- Summary of Host setup check for {host_serial} ---------") if oca_version < "1.39.0": logger.error(f"Oracle Cloud Agent: {oca_version} needs to be updated to 1.39.0 or higher") 
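The summary section shown here is what feeds that prolog: each failed check calls slurm_reason(), which records a short reason and increments slurm_error_count, and a single "Healthcheck:: ..." line is printed at the end when --slurm is passed. A compact sketch of that accumulation pattern; the separator character is an assumption, since the script only shows that the final character is stripped with [:-1]:

    # Sketch of the drain-reason accumulation used by the summary section; not part of the patch.
    slurm_drain_reason = ""
    slurm_error_count = 0

    def slurm_reason(message):
        # Each failed check records a short reason and bumps the error counter.
        global slurm_drain_reason, slurm_error_count
        slurm_drain_reason += message + ","   # separator assumed; the script strips the last char
        slurm_error_count += 1

    # Hypothetical failures, mirroring the reason strings used in this file.
    slurm_reason("OCA version Error")
    slurm_reason("Missing GPU Error")

    if slurm_error_count > 0:
        # This is the line healthchecks.sh greps for before draining the node.
        print("Healthcheck:: " + slurm_drain_reason[:-1])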
slurm_reason("OCA version Error") diff --git a/playbooks/roles/healthchecks/tasks/main.yml b/playbooks/roles/healthchecks/tasks/main.yml index 7ed13754..d42e3997 100755 --- a/playbooks/roles/healthchecks/tasks/main.yml +++ b/playbooks/roles/healthchecks/tasks/main.yml @@ -15,7 +15,7 @@ owner: '{{ ansible_user }}' group: '{{ ansible_user }}' with_items: - - check_h100_setup.py + - check_gpu_setup.py - gpu_bw_test.py - rdma_link_flapping.py - xid_checker.py diff --git a/playbooks/roles/slurm/files/healthchecks.sh b/playbooks/roles/slurm/files/healthchecks.sh index a2a9cec0..d54dd837 100644 --- a/playbooks/roles/slurm/files/healthchecks.sh +++ b/playbooks/roles/slurm/files/healthchecks.sh @@ -1,8 +1,8 @@ #!/bin/sh shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` -if [ "${shape}" = \"BM.GPU.H100.8\" ] +if [ "${shape}" = \"BM.GPU.H100.8\" ] || [ "${shape}" == \"BM.GPU.A100-v2.8\" ] || [ "${shape}" == \"BM.GPU4.8\" ] || [ "${shape}" == \"BM.GPU.B4.8\" ] then - sudo python3 /opt/oci-hpc/healthchecks/check_h100_setup.py --slurm > /tmp/latest_healthcheck.log 2>&1 + sudo python3 /opt/oci-hpc/healthchecks/check_gpu_setup.py --slurm > /tmp/latest_healthcheck.log 2>&1 DRAIN_MSG=`cat /tmp/latest_healthcheck.log | grep "Healthcheck::"` if [ "$DRAIN_MSG" != "" ] then From c32e9064c3688b974b70edbc7c641758e36c7edb Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:33:21 -0600 Subject: [PATCH 14/36] Check the right device per shape --- .../healthchecks/files/check_gpu_setup.py | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index eb904a9f..1aac93a3 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -10,7 +10,15 @@ from xid_checker import XidChecker import platform import os -import sys +import requests + +def get_metadata(): + """ Make a request to metadata endpoint """ + headers = { 'Authorization' : 'Bearer Oracle' } + metadata_url = "http://169.254.169.254/opc/" + metadata_ver = "2" + request_url = metadata_url + "v" + metadata_ver + "/instance/" + return requests.get(request_url, headers=headers).json() def is_user_root(): # Check if the user is root @@ -189,8 +197,14 @@ def check_row_remap_errors(): def check_rdma_link_status(): status = True - devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] - + metadata=get_metadata() + shape=metadata['shape'] + if shape == "BM.GPU.H100.8": + devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + elif shape == "BM.GPU.B4.8" or shape == "BM.GPU.A100-v2.8": + devices = ["mlx5_1", "mlx5_2", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] + elif shape == "BM.GPU.4.8": + devices = ["mlx5_0", "mlx5_1", "mlx5_2", "mlx5_3", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] link_issues = [] for device in devices: # Run the mlxlink command @@ -501,7 +515,7 @@ def slurm_reason(message): 
slurm_reason("Missing GPU Error") datetime_str = datetime.now().strftime('%Y-%m-%d-%H%M%S') - logger.info(f"Finished H100 setup check at: {datetime_str}") + logger.info(f"Finished GPU host setup check at: {datetime_str}") if slurm_error_count > 0 and args.slurm: print("Healthcheck:: "+slurm_drain_reason[:-1]) \ No newline at end of file From b682f68789b9b9435b1ef264c43fa464a7962b5f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:21:57 -0600 Subject: [PATCH 15/36] Correct GPU4.8 Shape in Healthcheck --- playbooks/roles/healthchecks/files/check_gpu_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index 1aac93a3..a57703e4 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -203,7 +203,7 @@ def check_rdma_link_status(): devices = ["mlx5_0", "mlx5_1", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] elif shape == "BM.GPU.B4.8" or shape == "BM.GPU.A100-v2.8": devices = ["mlx5_1", "mlx5_2", "mlx5_3", "mlx5_4", "mlx5_5", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] - elif shape == "BM.GPU.4.8": + elif shape == "BM.GPU4.8": devices = ["mlx5_0", "mlx5_1", "mlx5_2", "mlx5_3", "mlx5_6", "mlx5_7", "mlx5_8", "mlx5_9", "mlx5_10", "mlx5_11", "mlx5_12", "mlx5_13", "mlx5_14", "mlx5_15", "mlx5_16", "mlx5_17"] link_issues = [] for device in devices: From e01daae32b3c14823a3c368ca381f46c6b04d428 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:55:19 -0600 Subject: [PATCH 16/36] Fix link flapping if the flapping happened close to boot --- playbooks/roles/healthchecks/files/rdma_link_flapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/healthchecks/files/rdma_link_flapping.py b/playbooks/roles/healthchecks/files/rdma_link_flapping.py index 425ec54e..2b4b8e8a 100644 --- a/playbooks/roles/healthchecks/files/rdma_link_flapping.py +++ b/playbooks/roles/healthchecks/files/rdma_link_flapping.py @@ -102,7 +102,7 @@ def process_rdma_link_flapping(self): logger.debug(f"RDMA link ({interface}) failed {diff_hours} hours ago") logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") - if diff_hours < self.time_interval and current_date_sec > bootup_time_grace_period: + if diff_hours < self.time_interval and last_date_failure_sec > bootup_time_grace_period: logger.debug(f"{interface}: one or more RDMA link flapping events within {self.time_interval} hours. 
Last flapping event: {last_date_failure_str})") link_issues["failures"].append(f"{interface}: {len(self.link_data[interface]['failures'])}") status = -1 @@ -135,7 +135,7 @@ def process_rdma_link_flapping(self): logger.debug(f"RDMA link ({interface}) down {diff_hours} hours ago") logger.debug(f"bootup_time_sec: {bootup_time_sec}, boot_time_grace_period: {bootup_time_grace_period}, current_date_sec: {current_date_sec}, diff_secs: {diff_secs}, diff_hours: {diff_hours}") - if diff_hours < self.time_interval and current_date_sec > bootup_time_grace_period: + if diff_hours < self.time_interval and last_date_down_sec > bootup_time_grace_period: logger.debug(f"{interface}, one or more RDMA link down events within {self.time_interval} hours. Last link down event: {last_date_down_str}") link_issues["link_down"].append(f"{interface}: {len(self.link_data[interface]['link_down'])}") status = -2 From 4fd28977bee116fb6f0bfc6e3a7aba001272e901 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Sat, 13 Apr 2024 00:27:40 -0600 Subject: [PATCH 17/36] Update NCCL tuning parameters --- samples/gpu/nccl_run_allreduce_H100.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/gpu/nccl_run_allreduce_H100.sh b/samples/gpu/nccl_run_allreduce_H100.sh index 2fd714ea..1113978b 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sh +++ b/samples/gpu/nccl_run_allreduce_H100.sh @@ -57,7 +57,7 @@ do --bind-to numa \ -npernode 8 \ --mca coll ^hcoll \ - -x NCCL_CROSS_NIC=1 \ + -x NCCL_CROSS_NIC=2 \ -x NCCL_DEBUG=WARN \ -x NCCL_CUMEM_ENABLE=0 \ -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ @@ -78,6 +78,7 @@ do -x NCCL_IGNORE_CPU_AFFINITY=1 \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ -x NCCL_TOPO_FILE=~/H100-topology.xml \ + -x NCCL_MIN_NCHANNELS=32 \ --np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile tail -n 32 $logfile From ec221f416469020b7201147c11b6e896c5ba72f1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 16 Apr 2024 12:04:20 -0600 Subject: [PATCH 18/36] Add root check --- playbooks/roles/healthchecks/files/xid_checker.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/playbooks/roles/healthchecks/files/xid_checker.py b/playbooks/roles/healthchecks/files/xid_checker.py index eaa6360b..3c46f36a 100644 --- a/playbooks/roles/healthchecks/files/xid_checker.py +++ b/playbooks/roles/healthchecks/files/xid_checker.py @@ -5,9 +5,14 @@ import subprocess import sys import re +import os class XidChecker: def __init__(self, dmesg_cmd="dmesg", time_interval=60): + # if user is root + if not os.geteuid() == 0: + logger.info("The XidChecker script did not run since it must be run as root") + sys.exit(1) self.dmesg_cmd = dmesg_cmd self.results = {} From bf9af6e40a9fb8665046a5c51e82adaf73bf5f71 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 16 Apr 2024 14:45:40 -0600 Subject: [PATCH 19/36] Update to latest version --- autoscaling/tf_init/versions.tf | 2 +- versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 28a169ec..ec66572c 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "5.34.0" + version = "5.37.0" } } } \ No newline at end of file diff --git a/versions.tf 
b/versions.tf index 28a169ec..ec66572c 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "5.34.0" + version = "5.37.0" } } } \ No newline at end of file From 6bdf4d404ef365ec3b5470be133cf703cddd8031 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:50:17 -0600 Subject: [PATCH 20/36] Check Physical Error in case of bad signal integrity --- playbooks/roles/healthchecks/files/check_gpu_setup.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index a57703e4..35663e3d 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -233,7 +233,7 @@ def check_rdma_link_status(): vendor_serial_num = re.search(r'Vendor Serial Number.*', output).group().split(":")[1].strip() nic_fw_version = re.search(r'Firmware Version.*', output).group().split(":")[1].strip() cable_fw_version = re.search(r'FW Version.*', output).group().split(":")[1].strip() - + physical_BER = re.search(r'Raw Physical BER.*', output).group().split(":")[1].strip() # Remove hidden characters from the output link_state = re.sub(color_pattern, '', link_state) nic_fw_version = re.sub(color_pattern, '', nic_fw_version) @@ -248,8 +248,12 @@ def check_rdma_link_status(): status = False if recommendation != "No issue was observed": logger.debug(f"{device}: {recommendation}") - link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}") - status = False + if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-09: + logger.debug(f"Recommandation is {recommendation} but the Physical error are low enough that it can be ignored") + else : + logger.debug(f"Recommandation is {recommendation} and the Physical error count is too high to be ignored: {physical_BER}") + link_issues.append(f"{device} - {vendor_serial_num} - {cable_fw_version} - {nic_fw_version}: {recommendation}") + status = False else: logger.debug(f"{device}: {recommendation}") From c929c9b47b466be13af8ca51c2fa0efdbe085988 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Mon, 22 Apr 2024 17:58:43 -0600 Subject: [PATCH 21/36] Change default H100 values --- samples/gpu/nccl_run_allreduce_H100.sbatch | 16 +++++++++++----- samples/gpu/nccl_run_allreduce_H100.sh | 14 ++++++++++---- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/samples/gpu/nccl_run_allreduce_H100.sbatch b/samples/gpu/nccl_run_allreduce_H100.sbatch index efef481e..333da972 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sbatch +++ b/samples/gpu/nccl_run_allreduce_H100.sbatch @@ -43,11 +43,11 @@ fi --bind-to numa \ -npernode 8 \ --mca coll ^hcoll \ - -x NCCL_CROSS_NIC=1 \ + -x NCCL_CROSS_NIC=2 \ -x NCCL_DEBUG=WARN \ -x NCCL_CUMEM_ENABLE=0 \ -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ - -x NCCL_IB_QPS_PER_CONNECTION=16 \ + -x NCCL_IB_QPS_PER_CONNECTION=1 \ -x NCCL_IB_GID_INDEX=3 \ -x NCCL_IB_TC=41 \ -x NCCL_IB_SL=0 \ @@ -59,11 +59,17 @@ fi -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ -x RX_QUEUE_LEN=8192 \ -x IB_RX_QUEUE_LEN=8192 \ - -x NCCL_BUFFSIZE=16777216 \ - -x NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_SOCKET_IFNAME=${var_UCX_NET_DEVICES} \ -x NCCL_IGNORE_CPU_AFFINITY=1 \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ --np 
$((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --hostfile $MACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 # If NCCL version is lower than 2.20.3, it is recommended to use the topology filefor optimal performances - # -x NCCL_TOPO_FILE=~/H100-topology.xml \ \ No newline at end of file + # -x NCCL_TOPO_FILE=~/H100-topology.xml \ + + # If NCCL version is lower than 2.20.3, it is recommended to use + # -x NCCL_CROSS_NIC=0 for multiple subnets and large scale jobs (>16 nodes) + # -x NCCL_CROSS_NIC=1 for single subnets and small scale jobs (<16 nodes) + + # If NCCL version is higher than 2.20.3, the absolute max NCCL throughput at large message size will be obtained with + # -x NCCL_MIN_NCHANNELS=32 \ But it does take some processing power away from the GPU for networking gains and is not recommended while running jobs. \ No newline at end of file diff --git a/samples/gpu/nccl_run_allreduce_H100.sh b/samples/gpu/nccl_run_allreduce_H100.sh index 1113978b..56207fc7 100644 --- a/samples/gpu/nccl_run_allreduce_H100.sh +++ b/samples/gpu/nccl_run_allreduce_H100.sh @@ -61,7 +61,7 @@ do -x NCCL_DEBUG=WARN \ -x NCCL_CUMEM_ENABLE=0 \ -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ - -x NCCL_IB_QPS_PER_CONNECTION=16 \ + -x NCCL_IB_QPS_PER_CONNECTION=1 \ -x NCCL_IB_GID_INDEX=3 \ -x NCCL_IB_TC=41 \ -x NCCL_IB_SL=0 \ @@ -74,11 +74,10 @@ do -x RX_QUEUE_LEN=8192 \ -x IB_RX_QUEUE_LEN=8192 \ -x NCCL_BUFFSIZE=16777216 \ - -x NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_SOCKET_IFNAME=${var_UCX_NET_DEVICES} \ -x NCCL_IGNORE_CPU_AFFINITY=1 \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ -x NCCL_TOPO_FILE=~/H100-topology.xml \ - -x NCCL_MIN_NCHANNELS=32 \ --np $np --hostfile $hostfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b 1G -e 16G -f 2 -g 1 >> $logfile tail -n 32 $logfile @@ -87,4 +86,11 @@ done # If NCCL version is lower than 2.20.3, it is recommended to use the topology filefor optimal performances - # -x NCCL_TOPO_FILE=~/H100-topology.xml \ \ No newline at end of file + # -x NCCL_TOPO_FILE=~/H100-topology.xml \ + + # If NCCL version is lower than 2.20.3, it is recommended to use + # -x NCCL_CROSS_NIC=0 for multiple subnets and large scale jobs (>16 nodes) + # -x NCCL_CROSS_NIC=1 for single subnets and small scale jobs (<16 nodes) + + # If NCCL version is higher than 2.20.3, the absolute max NCCL throughput at large message size will be obtained with + # -x NCCL_MIN_NCHANNELS=32 \ But it does take some processing power away from the GPU for networking gains and is not recommended while running jobs. 
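A small sketch of how the version-dependent recommendations in the comments above can be applied programmatically, reusing the ldconfig-based NCCL version detection that the tuner samples later in this series rely on. The exact threshold handling and the single-subnet default are illustrative assumptions, not part of the patch:

    # Sketch only: pick NCCL_CROSS_NIC from the installed NCCL version, following the
    # guidance in the comments above. Not part of the patch.
    import re
    import subprocess

    def nccl_version():
        # Same idea as the ldconfig / grep libnccl.so / sed pipeline in the tuner scripts.
        proc = subprocess.run(["ldconfig", "-v"], stdout=subprocess.PIPE,
                              stderr=subprocess.DEVNULL, text=True)
        found = re.findall(r"libnccl\.so\.(\d+(?:\.\d+)+)", proc.stdout)
        return tuple(int(x) for x in found[-1].split(".")) if found else None

    def pick_cross_nic(version, nodes, single_subnet=True):
        if version is not None and version >= (2, 20, 3):
            return "2"   # newer NCCL: matches the NCCL_CROSS_NIC=2 default set in these samples
        if single_subnet and nodes < 16:
            return "1"   # older NCCL, single subnet, small job
        return "0"       # older NCCL, multiple subnets or large scale

    ver = nccl_version()
    print("NCCL:", ver, "-> NCCL_CROSS_NIC=" + pick_cross_nic(ver, nodes=8))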
\ No newline at end of file From b5cd23bc05543bdfbe26ce4e3230184ccc88c721 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Fri, 26 Apr 2024 13:10:18 -0600 Subject: [PATCH 22/36] Remove duplicate spack --- playbooks/resize_add.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 47b7071b..8a599590 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -113,9 +113,6 @@ - include_role: name: sssd when: ldap|default(true)|bool - - include_role: - name: spack - when: spack|default(false)|bool - hosts: compute_to_add become: true From 3b3624b46144f16af57112c6a7e75325489e1531 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 1 May 2024 10:13:41 -0600 Subject: [PATCH 23/36] Change the BER limits to 10E-7 --- playbooks/roles/healthchecks/files/check_gpu_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/healthchecks/files/check_gpu_setup.py b/playbooks/roles/healthchecks/files/check_gpu_setup.py index 35663e3d..ed9132c0 100644 --- a/playbooks/roles/healthchecks/files/check_gpu_setup.py +++ b/playbooks/roles/healthchecks/files/check_gpu_setup.py @@ -248,7 +248,7 @@ def check_rdma_link_status(): status = False if recommendation != "No issue was observed": logger.debug(f"{device}: {recommendation}") - if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-09: + if "Bad signal integrity" in recommendation and float(physical_BER) < 1e-07: logger.debug(f"Recommandation is {recommendation} but the Physical error are low enough that it can be ignored") else : logger.debug(f"Recommandation is {recommendation} and the Physical error count is too high to be ignored: {physical_BER}") From 9d8050a6e59288b02b467bdac9e7ca5ac174bbb8 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 1 May 2024 10:14:16 -0600 Subject: [PATCH 24/36] Change default NCCL.conf for H100 --- playbooks/roles/nccl-conf/files/h100 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/nccl-conf/files/h100 b/playbooks/roles/nccl-conf/files/h100 index d199d0fb..9fcc8296 100644 --- a/playbooks/roles/nccl-conf/files/h100 +++ b/playbooks/roles/nccl-conf/files/h100 @@ -1,8 +1,8 @@ -NCCL_CROSS_NIC=1 +NCCL_CROSS_NIC=2 NCCL_DEBUG=WARN NCCL_CUMEM_ENABLE=0 NCCL_IB_SPLIT_DATA_ON_QPS=0 -NCCL_IB_QPS_PER_CONNECTION=16 +NCCL_IB_QPS_PER_CONNECTION=1 NCCL_IB_GID_INDEX=3 NCCL_IB_TC=41 NCCL_IB_SL=0 From 7f499aeca614c81269000e9d00e798145d659f16 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 1 May 2024 10:14:44 -0600 Subject: [PATCH 25/36] Don't include Tuner in the stack --- playbooks/roles/nccl-conf/tasks/main.yml | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/playbooks/roles/nccl-conf/tasks/main.yml b/playbooks/roles/nccl-conf/tasks/main.yml index e4b6aed4..88c9dc36 100644 --- a/playbooks/roles/nccl-conf/tasks/main.yml +++ b/playbooks/roles/nccl-conf/tasks/main.yml @@ -33,24 +33,4 @@ owner: root group: root mode: '0644' - when: shape_nccl.stdout == '"BM.GPU4.8"' - -- name: copy libnccl-ocituner for OL - become: true - get_url: - url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-OL - dest: 
/home/opc/libnccl-ocituner.so.1.0.1 - owner: opc - group: privilege - mode: '0775' - when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_distribution == 'OracleLinux' - -- name: copy libnccl-ocituner for Ubuntu - become: true - get_url: - url: https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/m1Gdcbiguqst6n_aVwRZIFpRZxUG-wGMvqWS5QJeJbIvNZnqTTA3N1_DDRuYpvJx/n/hpc/b/source/o/tuner/libnccl-ocituner.so.1.0.1-ubuntu - dest: /home/ubuntu/libnccl-ocituner.so.1.0.1 - owner: ubuntu - group: privilege - mode: '0775' - when: ( shape_nccl.stdout == '"BM.GPU.B4.8"' or shape_nccl.stdout == '"BM.GPU.A100-v2.8"' or shape_nccl.stdout == '"BM.GPU4.8"' ) and ansible_os_family == 'Debian' \ No newline at end of file + when: shape_nccl.stdout == '"BM.GPU4.8"' \ No newline at end of file From 03292f69cc3a65ae0b50fa868a1c46d81fa50b11 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Fri, 3 May 2024 09:32:04 -0600 Subject: [PATCH 26/36] Make warning message more explicit for BIOS changes --- schema.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index 033117fd..66946082 100755 --- a/schema.yaml +++ b/schema.yaml @@ -837,7 +837,8 @@ variables: BIOS: title: "Modify BIOS options" - description: "Make sure that the BIOS options are changeable for the specific shape selected" + description: "WARNING : Do NOT change those if you have not tested the changes on a single instance. Error will be \"Shape does not support the provided platform +configuration\" " type: boolean default: false visible: true From f08d5d9e28b49f0e4349f6a09fc301a7795d07aa Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 8 May 2024 15:20:31 -0700 Subject: [PATCH 27/36] by default, disable scratch nfs --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index 033117fd..b3f2c948 100755 --- a/schema.yaml +++ b/schema.yaml @@ -907,7 +907,7 @@ variables: visible: and: - ${use_advanced} - default: true + default: false scratch_nfs_type_cluster: type: enum From fa5bb5880939753f1bbd2dc352c544dc990c4de4 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:52:08 -0600 Subject: [PATCH 28/36] Change healthchecks default to False --- schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.yaml b/schema.yaml index 66946082..10504223 100755 --- a/schema.yaml +++ b/schema.yaml @@ -1201,7 +1201,7 @@ configuration\" " healthchecks: type: boolean title: "Turn on Healthchecks for GPU nodes" - default: true + default: false description: "Will run tests on GPU nodes before starting a job. 
Nodes that are showing issues will be set in drain state" visible: ${slurm} From 9ba74808c253a83bc2e30b2e2f2fa3c2ad9b256e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:53:02 -0600 Subject: [PATCH 29/36] Add list of unreachable instances in resize --- bin/resize.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/resize.py b/bin/resize.py index 9525fee4..03a6f58e 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -753,6 +753,7 @@ def getLaunchInstanceDetails(instance,comp_ocid,cn_ocid,max_previous_index,index if len(unreachable_instances): if not remove_unreachable: print("STDOUT: At least one unreachable node is in the inventory") + print(unreachable_instances) print("STDOUT: Not doing anything") exit(1) else: From 57adf1e6c0aa92b890b3abce702ced4a13365ab5 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:53:20 -0600 Subject: [PATCH 30/36] Add Tuner example --- samples/gpu/nccl_run_allreduce_tuner.sbatch | 88 +++++++++++++++++ samples/gpu/nccl_run_allreduce_tuner.sh | 103 ++++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 samples/gpu/nccl_run_allreduce_tuner.sbatch create mode 100644 samples/gpu/nccl_run_allreduce_tuner.sh diff --git a/samples/gpu/nccl_run_allreduce_tuner.sbatch b/samples/gpu/nccl_run_allreduce_tuner.sbatch new file mode 100644 index 00000000..f6924781 --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_tuner.sbatch @@ -0,0 +1,88 @@ +#!/bin/bash +#SBATCH --job-name=nccl-allreduce-slurm +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --exclusive +export PMI_DEBUG=1 + + +cd /nfs/scratch +mkdir $SLURM_JOB_ID +cd $SLURM_JOB_ID + +MACHINEFILE="hostfile" +ORDEREDMACHINEFILE="ordered_hostfile_system_name" +ORDEREDRANKMACHINEFILE="rankfile_system_name" + +scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE +echo MACHINEFILE +cat $MACHINEFILE + +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +fi + + +echo ORDEREDMACHINEFILE +cat $ORDEREDMACHINEFILE +echo ORDEREDRANKMACHINEFILE +cat $ORDEREDRANKMACHINEFILE + +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $mpivars_path + +export NCCL_DEBUG=WARN + +#mpirun -d --mca pml ucx -x SLURM_JOB_NODELIST=$host_list --bind-to numa -x NCCL_DEBUG=WARN -x NCCL_IB_SL=0 -x NCCL_IB_TC=41 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_GID_INDEX=3 -x NCCL_ALGO=Ring -x NCCL_TOPO_FILE=/home/opc/topo-flattened-b4.xml -x NCCL_IB_HCA="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_16,mlx5_17,mlx5_18,mlx5_19" -x UCX_NET_DEVICES=mlx5_0:1 -x HCOLL_ENABLE_MCAST_ALL=0 -x coll_hcoll_enable=0 -x UCX_TLS=ud,self,sm -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile rankfile_system_name /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 +# no need to pass: -x SLURM_JOB_NODELIST=$host_list + +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ $shape == \"BM.GPU.B4.8\" ] || [ 
$shape == \"BM.GPU.A100-v2.8\" ] +then + var_UCX_NET_DEVICES=mlx5_0:1 + var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12" +elif [ $shape == \"BM.GPU4.8\" ] +then + var_UCX_NET_DEVICES=mlx5_4:1 + var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17" +fi + +NCCL_version=`sudo ldconfig -v 2>&1 | grep "libnccl.so" | tail -n1 | sed -r 's/^.*\.so\.//'` +arr_NCCL=(${NCCL_version//./ }) +if [ ${arr_NCCL[2]} > 20 ] +then + tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1 +else + tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2 +fi + + + mpirun --mca pml ucx \ + --bind-to numa \ + --mca coll ^hcoll \ + -x NCCL_DEBUG=WARN \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_QPS_PER_CONNECTION=4 \ + -x UCX_TLS=ud,self,sm \ + -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x coll_hcoll_enable=0 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_TUNER_PLUGIN=${tuner_path} \ + -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ + --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 + + diff --git a/samples/gpu/nccl_run_allreduce_tuner.sh b/samples/gpu/nccl_run_allreduce_tuner.sh new file mode 100644 index 00000000..a8da9ba0 --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_tuner.sh @@ -0,0 +1,103 @@ +#!/bin/bash +set -e + +# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from -n iterations parameter of nccl allreduce which is set below using $iter +max=$1 + +# This assume, the hostfile passed is already ordered based on their rackId +if [ -n "$2" ]; then + hostfile=$2 +else + #hostfile="/home/opc/hostfile.tcp" + #hostfile="/etc/opt/oci-hpc/hostfile.tcp" + hostfile="/tmp/ordered_hostfile_system_name" +fi + +ORDEREDMACHINEFILE="ordered_hostfile_system_name" +ORDEREDRANKMACHINEFILE="rankfile_system_name" +echo INPUTFILE +cat $hostfile + +# will generate rack-aware ordered host file +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null +fi + +hostfile=$ORDEREDMACHINEFILE +rankfile=$ORDEREDRANKMACHINEFILE + +echo ORDEREDMACHINEFILE +cat $ORDEREDMACHINEFILE +echo ORDEREDRANKMACHINEFILE +cat $ORDEREDRANKMACHINEFILE + +# The number of GPUs to use for the test. Has to be multiplier of 8. If not passed, all GPUs will be used. 
+if [ -n "$3" ]; then + np=$3 +else + np=$((`less $hostfile | wc -l` * 8 )) +fi + +logfile="nccl_run_allreduce.sh.log" + +for x in $(seq 1 1 $max) +do + + echo $x + echo $x >> $logfile + date >> $logfile + + rankfile=$rankfile; np=$np ; iter=20; + + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi + +first_node=`head $hostfile -n 1` +shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` +if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] +then + var_UCX_NET_DEVICES=mlx5_0:1 + var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12" +elif [ $shape == \"BM.GPU4.8\" ] +then + var_UCX_NET_DEVICES=mlx5_4:1 + var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17" +fi + +NCCL_version=`sudo ldconfig -v 2>&1 | grep "libnccl.so" | tail -n1 | sed -r 's/^.*\.so\.//'` +arr_NCCL=(${NCCL_version//./ }) +if [ ${arr_NCCL[2]} < 21 ] +then + tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.1.0.2 +else + tuner_path=/opt/oci-hpc/oci-tuner/libnccl-ocituner-A100.so.2.0.1 +fi + + # final version + mpirun --mca pml ucx \ + --bind-to numa \ + --mca coll ^hcoll \ + -x NCCL_DEBUG=WARN \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_QPS_PER_CONNECTION=4 \ + -x UCX_TLS=ud,self,sm \ + -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x coll_hcoll_enable=0 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_ALGO=Ring \ + -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ + --np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile + + tail -n 32 $logfile + + +done + + From b2f1d732dd0ec614e5bd36c081ce2bd0f7052f4e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:53:36 -0600 Subject: [PATCH 31/36] Remove old tuner examples --- ..._ncclparam_tuner_nccl_run_allreduce.sbatch | 65 -------------- .../no_ncclparam_tuner_nccl_run_allreduce.sh | 87 ------------------- 2 files changed, 152 deletions(-) delete mode 100644 samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch delete mode 100644 samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sh diff --git a/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch b/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch deleted file mode 100644 index 7de77737..00000000 --- a/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sbatch +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash -#SBATCH --job-name=nccl-allreduce-slurm -#SBATCH --nodes=2 -#SBATCH --gpus-per-node=8 -#SBATCH --ntasks-per-node=8 -#SBATCH --exclusive -export PMI_DEBUG=1 - - -cd /nfs/cluster -mkdir $SLURM_JOB_ID -cd $SLURM_JOB_ID - -MACHINEFILE="hostfile" -ORDEREDMACHINEFILE="ordered_hostfile_system_name" -ORDEREDRANKMACHINEFILE="rankfile_system_name" - -scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE -echo MACHINEFILE -cat $MACHINEFILE - -source /etc/os-release -if [ $ID == "ol" ] || [ $ID == "centos" ] ; then - python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null - homedirectory=/home/opc -elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then - python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null - homedirectory=/home/ubuntu -fi - - 
-echo ORDEREDMACHINEFILE -cat $ORDEREDMACHINEFILE -echo ORDEREDRANKMACHINEFILE -cat $ORDEREDRANKMACHINEFILE - -mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` - -if [[ "$mpivars_path" == "" ]]; then - mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` -fi - -if [[ "$mpivars_path" == "" ]]; then - echo "Could not find MPIPATH"; exit; fi - -source $mpivars_path - -shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` -if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] -then - var_UCX_NET_DEVICES=mlx5_0:1 -elif [ $shape == \"BM.GPU4.8\" ] -then - var_UCX_NET_DEVICES=mlx5_4:1 -fi - - mpirun --mca pml ucx \ - --bind-to numa \ - --mca coll ^hcoll \ - -x UCX_TLS=ud,self,sm \ - -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ - -x HCOLL_ENABLE_MCAST_ALL=0 \ - -x coll_hcoll_enable=0 \ - -x NCCL_TUNER_PLUGIN=$homedirectory/libnccl-ocituner.so.1.0.1 \ - --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 diff --git a/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sh b/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sh deleted file mode 100644 index 25a496e3..00000000 --- a/samples/gpu/no_ncclparam_tuner_nccl_run_allreduce.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash -set -e - -# number of times to run the nccl test to stress the GPUs and RDMA network. This is different from -n iterations parameter of nccl allreduce which is set below using $iter -max=$1 - -# This assume, the hostfile passed is already ordered based on their rackId -if [ -n "$2" ]; then - hostfile=$2 -else - hostfile="/tmp/ordered_hostfile_system_name" -fi - -ORDEREDMACHINEFILE="ordered_hostfile_system_name" -ORDEREDRANKMACHINEFILE="rankfile_system_name" -echo INPUTFILE -cat $hostfile - -# will generate rack-aware ordered host file -source /etc/os-release -if [ $ID == "ol" ] || [ $ID == "centos" ] ; then - python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null - homedirectory=/home/opc -elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then - python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null - homedirectory=/home/ubuntu -fi - -hostfile=$ORDEREDMACHINEFILE -rankfile=$ORDEREDRANKMACHINEFILE - -echo ORDEREDMACHINEFILE -cat $ORDEREDMACHINEFILE -echo ORDEREDRANKMACHINEFILE -cat $ORDEREDRANKMACHINEFILE - -# The number of GPUs to use for the test. Has to be multiplier of 8. If not passed, all GPUs will be used. -if [ -n "$3" ]; then - np=$3 -else - np=$((`less $hostfile | wc -l` * 8 )) -fi - -logfile="nccl_run_allreduce.sh.log" - -for x in $(seq 1 1 $max) -do - - echo $x - echo $x >> $logfile - date >> $logfile - - rankfile=$rankfile; np=$np ; iter=20; - - mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` - source $mpivars_path - - if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi - -first_node=`head $hostfile -n 1` -shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` -if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] -then - var_UCX_NET_DEVICES=mlx5_0:1 -elif [ $shape == \"BM.GPU4.8\" ] -then - var_UCX_NET_DEVICES=mlx5_4:1 -fi - - # final version - # all NCCL parameters are at /etc/nccl.conf on each compute node. 
- mpirun --mca pml ucx \ - --bind-to numa \ - --mca coll ^hcoll \ - -x UCX_TLS=ud,self,sm \ - -x UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ - -x HCOLL_ENABLE_MCAST_ALL=0 \ - -x coll_hcoll_enable=0 \ - -x NCCL_TUNER_PLUGIN=$homedirectory/libnccl-ocituner.so.1.0.1 \ - --np $np --rankfile $rankfile /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile - - tail -n 32 $logfile - - -done - - From 5241b3da9165cc468c3a95bb19431ce635a4f1cb Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Tue, 14 May 2024 13:53:54 -0600 Subject: [PATCH 32/36] Add GPU and RDMA monitoring --- .../tf_init/cluster-network-configuration.tf | 8 +++++++- .../tf_init/instance-pool-configuration.tf | 17 ++++++++++++++++- cluster-network-configuration.tf | 8 +++++++- compute-nodes.tf | 8 +++++++- instance-pool-configuration.tf | 17 ++++++++++++++++- 5 files changed, 53 insertions(+), 5 deletions(-) diff --git a/autoscaling/tf_init/cluster-network-configuration.tf b/autoscaling/tf_init/cluster-network-configuration.tf index 6b2805f1..3b12b2f9 100755 --- a/autoscaling/tf_init/cluster-network-configuration.tf +++ b/autoscaling/tf_init/cluster-network-configuration.tf @@ -41,7 +41,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati name = "Compute HPC RDMA Auto-Configuration" desired_state = plugins_config.value } - + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } } dynamic "platform_config" { diff --git a/autoscaling/tf_init/instance-pool-configuration.tf b/autoscaling/tf_init/instance-pool-configuration.tf index 31c31ab7..16f8f32c 100755 --- a/autoscaling/tf_init/instance-pool-configuration.tf +++ b/autoscaling/tf_init/instance-pool-configuration.tf @@ -18,8 +18,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" { user_data = base64encode(data.template_file.config.rendered) } agent_config { - is_management_disabled = true + + are_all_plugins_disabled = false + is_management_disabled = true + is_monitoring_disabled = false + + plugins_config { + desired_state = "DISABLED" + name = "OS Management Service Agent" + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } + } shape = var.instance_pool_shape dynamic "shape_config" { diff --git a/cluster-network-configuration.tf b/cluster-network-configuration.tf index 1c097ca5..f2772b2a 100755 --- a/cluster-network-configuration.tf +++ b/cluster-network-configuration.tf @@ -45,7 +45,13 @@ resource "oci_core_instance_configuration" "cluster-network-instance_configurati name = "Compute HPC RDMA Auto-Configuration" desired_state = plugins_config.value } - + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? 
["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } } diff --git a/compute-nodes.tf b/compute-nodes.tf index 1544c5ad..4149e958 100755 --- a/compute-nodes.tf +++ b/compute-nodes.tf @@ -48,7 +48,13 @@ resource "oci_core_instance" "compute_cluster_instances" { name = "Compute HPC RDMA Auto-Configuration" desired_state = plugins_config.value } - + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.cluster_network_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } } diff --git a/instance-pool-configuration.tf b/instance-pool-configuration.tf index b28dbe5c..1fffd744 100755 --- a/instance-pool-configuration.tf +++ b/instance-pool-configuration.tf @@ -22,8 +22,23 @@ resource "oci_core_instance_configuration" "instance_pool_configuration" { user_data = base64encode(data.template_file.config.rendered) } agent_config { - is_management_disabled = true + + are_all_plugins_disabled = false + is_management_disabled = true + is_monitoring_disabled = false + + plugins_config { + desired_state = "DISABLED" + name = "OS Management Service Agent" + } + dynamic plugins_config { + for_each = length(regexall(".*GPU.*", var.instance_pool_shape)) > 0 ? ["ENABLED"] : ["DISABLED"] + content { + name = "Compute RDMA GPU Monitoring" + desired_state = plugins_config.value + } } + } shape = var.instance_pool_shape dynamic "shape_config" { From 6c14e8152e53f7b25216836bffb4b7052a36bc22 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 15 May 2024 09:55:20 -0600 Subject: [PATCH 33/36] Update images --- schema.yaml | 30 +++++++++++++++--------------- variables.tf | 12 ++++++------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/schema.yaml b/schema.yaml index 10504223..a2bea948 100755 --- a/schema.yaml +++ b/schema.yaml @@ -418,11 +418,11 @@ variables: enum: - "HPC_OL7" - "HPC_OL8" - - "GPU_OL7_CUDA12.2" - - "GPU_OL8_CUDA12.2" - - "GPU_OL7_CUDA12.4" - - "GPU_OL8_CUDA12.4" - default: "HPC_OL7" + - "GPU_OL8_NV550" + - "GPU_OL7_NV550" + - "GPU_OL8_NV535" + - "GPU_OL7_NV535" + default: "GPU_OL8_NV550" visible: ${use_marketplace_image_controller} controller_username: @@ -762,11 +762,11 @@ variables: enum: - "HPC_OL7" - "HPC_OL8" - - "GPU_OL7_CUDA12.2" - - "GPU_OL8_CUDA12.2" - - "GPU_OL7_CUDA12.4" - - "GPU_OL8_CUDA12.4" - default: "HPC_OL7" + - "GPU_OL8_NV550" + - "GPU_OL7_NV550" + - "GPU_OL8_NV535" + - "GPU_OL7_NV535" + default: "HPC_OL8" visible: ${use_marketplace_image} use_compute_agent: @@ -1681,11 +1681,11 @@ configuration\" " enum: - "HPC_OL7" - "HPC_OL8" - - "GPU_OL7_CUDA12.2" - - "GPU_OL8_CUDA12.2" - - "GPU_OL7_CUDA12.4" - - "GPU_OL8_CUDA12.4" - default: "HPC_OL7" + - "GPU_OL8_NV550" + - "GPU_OL7_NV550" + - "GPU_OL8_NV535" + - "GPU_OL7_NV535" + default: "HPC_OL8" visible: and: - ${use_marketplace_image_login} diff --git a/variables.tf b/variables.tf index c6067f23..8cc2c70b 100755 --- a/variables.tf +++ b/variables.tf @@ -87,12 +87,12 @@ variable "marketplace_listing" { variable "marketplace_version_id" { type = map(string) default = { - "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" - "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.03.15-0" - "GPU_OL7_CUDA12.2" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" - "GPU_OL8_CUDA12.2" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0" - 
"GPU_OL7_CUDA12.4" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.4-2024.03.15-0" - "GPU_OL8_CUDA12.4" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.4-2024.03.15-0" + "HPC_OL8" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-2024.05.08-0" + "HPC_OL7" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-2024.05.08-0" + "GPU_OL8_NV550" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.05.08-0" + "GPU_OL7_NV550" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-550-CUDA-12.4-2024.05.13-0" + "GPU_OL8_NV535" = "OracleLinux-8-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.05.08-0" + "GPU_OL7_NV535" = "OracleLinux-7-OCA-RHCK-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.05.13-0" } } From ddf95e7ed2a1dfa0ef43293c32d418e266e42f24 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 15 May 2024 11:19:13 -0600 Subject: [PATCH 34/36] Fix default images --- schema.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/schema.yaml b/schema.yaml index a2bea948..136c506a 100755 --- a/schema.yaml +++ b/schema.yaml @@ -422,7 +422,7 @@ variables: - "GPU_OL7_NV550" - "GPU_OL8_NV535" - "GPU_OL7_NV535" - default: "GPU_OL8_NV550" + default: "HPC_OL8" visible: ${use_marketplace_image_controller} controller_username: @@ -766,7 +766,7 @@ variables: - "GPU_OL7_NV550" - "GPU_OL8_NV535" - "GPU_OL7_NV535" - default: "HPC_OL8" + default: "GPU_OL8_NV550" visible: ${use_marketplace_image} use_compute_agent: From b7af925469988971dc4de05bea4efef4348d6ad6 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Wed, 15 May 2024 15:46:27 -0600 Subject: [PATCH 35/36] Make sure scratch is disabled by default --- variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/variables.tf b/variables.tf index 8cc2c70b..fcee9d94 100755 --- a/variables.tf +++ b/variables.tf @@ -41,7 +41,7 @@ variable "use_compute_agent" { default = true } variable "unsupported_controller_image" { default = "" } variable "unsupported_login_image" { default = "" } variable "use_cluster_nfs" { default = true} -variable "use_scratch_nfs" { default = true } +variable "use_scratch_nfs" { default = false } variable "cluster_nfs_path" { default = "/nfs/cluster" } variable "scratch_nfs_path" { default = "/nfs/scratch" } variable "vcn_compartment" { default = ""} From a89a1b2f00d1f915bd092245a2e1ab485382bdcd Mon Sep 17 00:00:00 2001 From: arnaudfroidmont <49765904+arnaudfroidmont@users.noreply.github.com> Date: Fri, 17 May 2024 09:51:40 -0600 Subject: [PATCH 36/36] Update healthcheck boolean --- playbooks/roles/slurm/tasks/common.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 755bc51f..865312ca 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -210,4 +210,5 @@ dest: "{{ slurm_conf_path }}/prolog.d/healthchecks.sh" owner: root group: root - mode: '0755' \ No newline at end of file + mode: '0755' + when: healthchecks|bool \ No newline at end of file