From f9b16e65dd03f18c2727df72709d6fbeed88ccff Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 23 Mar 2023 17:40:10 -0700 Subject: [PATCH 01/41] add ordered file for NCCL when using srun with containers --- .../roles/rack-aware/files/node_ordering_by_rack.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py index f874595f..252b1c05 100644 --- a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py +++ b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py @@ -4,13 +4,17 @@ import argparse import subprocess -def write_ordered_hostfile(ordered_hosts=[],hostfile=None): +def write_ordered_hostfile(ordered_hosts=[],hostfile=None,srun=False): #ordered_hostfile="ordered_hostfile" if os.path.isfile(hostfile): os.remove(hostfile) fhandler = open(hostfile,"w") for h in ordered_hosts: - fhandler.write(h+"\n") + if srun: + for x in range(8): + fhandler.write(h+"\n") + else: + fhandler.write(h+"\n") fhandler.close() def write_ordered_rankfile(ordered_hosts=[],hostfile=None): @@ -104,6 +108,8 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None): write_ordered_hostfile(ordered_hosts,hostfile) hostfile="ordered_hostfile_system_name" write_ordered_hostfile(ordered_hosts_friendly_name,hostfile) +hostfile="ordered_hostfile_system_name_srun" +write_ordered_hostfile(ordered_hosts_friendly_name,hostfile,True) rankfile="rankfile_system_name" write_ordered_rankfile(ordered_hosts_friendly_name,rankfile) From ae45741f4570b2bababb5d84ae4ac0ffea698b03 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 24 Mar 2023 11:29:11 -0700 Subject: [PATCH 02/41] run nccl using containers with node ordering --- .../gpu/nccl_run_allreduce_containers.sbatch | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 samples/gpu/nccl_run_allreduce_containers.sbatch diff --git 
a/samples/gpu/nccl_run_allreduce_containers.sbatch b/samples/gpu/nccl_run_allreduce_containers.sbatch new file mode 100644 index 00000000..eff071f0 --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_containers.sbatch @@ -0,0 +1,95 @@ +#!/bin/bash +#SBATCH --job-name=nccl-allreduce-slurm-containers +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --exclusive +export PMI_DEBUG=1 + + +cd /nfs/scratch +mkdir $SLURM_JOB_ID +cd $SLURM_JOB_ID + +MACHINEFILE="hostfile" +ORDEREDMACHINEFILE="ordered_hostfile_system_name" +ORDEREDRANKMACHINEFILE="rankfile_system_name" +ORDEREDSRUNMACHINEFILE="ordered_hostfile_system_name_srun" + +scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE +echo MACHINEFILE +cat $MACHINEFILE + +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null + USER=opc +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null + USER=ubuntu +fi + +echo ORDEREDMACHINEFILE +cat $ORDEREDMACHINEFILE +echo ORDEREDSRUNMACHINEFILE +cat $ORDEREDSRUNMACHINEFILE + +export SLURM_HOSTFILE=$ORDEREDSRUNMACHINEFILE + +MPIVARS_PATH=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + +if [[ "$MPIVARS_PATH" == "" ]]; then + MPIVARS_PATH=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$MPIVARS_PATH" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $MPIVARS_PATH +LOCAL_MPI=${MPIVARS_PATH%%/bin*} + +#mpirun -d --mca pml ucx -x SLURM_JOB_NODELIST=$host_list --bind-to numa -x NCCL_DEBUG=WARN -x NCCL_IB_SL=0 -x NCCL_IB_TC=41 -x NCCL_IB_QPS_PER_CONNECTION=4 -x NCCL_IB_GID_INDEX=3 -x NCCL_ALGO=Ring -x NCCL_TOPO_FILE=/home/opc/topo-flattened-b4.xml -x NCCL_IB_HCA="mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_11,mlx5_12,mlx5_13,mlx5_16,mlx5_17,mlx5_18,mlx5_19" -x UCX_NET_DEVICES=mlx5_0:1 -x HCOLL_ENABLE_MCAST_ALL=0 -x 
coll_hcoll_enable=0 -x UCX_TLS=ud,self,sm -np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile rankfile_system_name /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 +# no need to pass: -x SLURM_JOB_NODELIST=$host_list + +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] +then + var_UCX_NET_DEVICES=mlx5_0:1 + var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12" +elif [ $shape == \"BM.GPU4.8\" ] +then + var_UCX_NET_DEVICES=mlx5_4:1 + var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17" +fi + +export RX_QUEUE_LEN=8192 \ + IB_RX_QUEUE_LEN=8192 \ + UCX_TLS=ud,self,sm \ + HCOLL_ENABLE_MCAST_ALL=0 \ + coll_hcoll_enable=0 \ + UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + NCCL_DEBUG=WARN \ + NCCL_IB_TIMEOUT=16 \ + NCCL_IB_SL=0 \ + NCCL_IB_TC=41 \ + NCCL_IGNORE_CPU_AFFINITY=1 \ + NCCL_IB_GID_INDEX=3 \ + NCCL_ALGO=Ring \ + NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ + OMPI_MCA_coll=^hcoll \ + NCCL_IB_QPS_PER_CONNECTION=4 + +env | grep "SLURMD_NODENAME=" + +CONTAINER_IMAGE="/nfs/scratch/nvcr.io+nvidia+pytorch+22.12-py3.sqsh" +CONTAINER_MOUNTS="/home/$USER/nccl-tests:/nccl,$LOCAL_MPI:$LOCAL_MPI" + +srun --mpi=pmi2 --gpus-per-node=$SBATCH_GPUS_PER_NODE \ + --ntasks-per-node=$SLURM_NTASKS_PER_NODE \ + --distribution=arbitrary \ + --container-image=$CONTAINER_IMAGE \ + --container-mounts=$CONTAINER_MOUNTS \ + bash -c " + source $MPIVARS_PATH && + /nccl/build/all_reduce_perf -b 1G -e 10G -i$((1024*1024*1024*9)) -n 100 + " \ No newline at end of file From 8a25e1cf66bb968b99178d88233243dce7cec07d Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 28 Mar 2023 17:44:55 -0600 Subject: [PATCH 03/41] Switch to Slurm 23.02 (Only compiled for Ubuntu) --- 
playbooks/roles/slurm/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/defaults/main.yml b/playbooks/roles/slurm/defaults/main.yml index b83f4841..10fbd3f6 100755 --- a/playbooks/roles/slurm/defaults/main.yml +++ b/playbooks/roles/slurm/defaults/main.yml @@ -9,4 +9,4 @@ slurm_uid: 1501 munge_gid: 1500 munge_uid: 1500 rack_aware_playbook_suffix: "{% if rack_aware|bool %}-rack-aware{% endif%}" -slurm_version: "22.05.4-1" \ No newline at end of file +slurm_version: "23.02.0_1.0" \ No newline at end of file From f6d4a31cdbcd2404f00956325d2d6a33013012b1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 28 Mar 2023 17:45:05 -0600 Subject: [PATCH 04/41] Fix the Ubuntu Linux install to match version --- playbooks/roles/slurm/tasks/common.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 24287a59..c5714504 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -94,7 +94,7 @@ - name: Install .deb vars: deb_name: - - "{{ download_path }}/slurm_rpms/slurm-22.05.4-1_amd64.deb" + - "{{ download_path }}/slurm_rpms/slurm-{{slurm_version}}_amd64.deb" package_state: present include_role: name: safe_yum From 48bfc2cf282e26e4c67e24563661e65937bda76c Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 28 Mar 2023 17:45:23 -0600 Subject: [PATCH 05/41] Add parameter to order the hostfile based on topo --- playbooks/roles/slurm/templates/slurm.conf.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 7ad0c47a..de07cc18 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -43,6 +43,7 @@ EnforcePartLimits=NO PropagateResourceLimitsExcept=MEMLOCK CommunicationParameters=NoAddrCache TopologyPlugin=topology/tree 
+TopologyParam=SwitchAsNodeRank TreeWidth=2048 SlurmctldParameters=enable_configless From 78954b73e10a0b6be39c31cc0802c2d1aa383902 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Wed, 29 Mar 2023 15:35:54 -0700 Subject: [PATCH 06/41] remove nccl-tests folder --- samples/nccl_compile/compile.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/samples/nccl_compile/compile.sh b/samples/nccl_compile/compile.sh index dbf37e8a..2e675f9c 100644 --- a/samples/nccl_compile/compile.sh +++ b/samples/nccl_compile/compile.sh @@ -16,12 +16,13 @@ MPI_HOME=${mpivars_path%%/bin*} source /etc/os-release if [ $ID == "ol" ] || [ $ID == "centos" ] ; then - cd /home/opc + USER=opc elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then - cd /home/ubuntu + USER=ubuntu fi - +cd /home/$USER +rm -rf nccl-tests git clone https://github.com/NVIDIA/nccl-tests.git cd nccl-tests/ make MPI=1 MPI_HOME=$MPI_HOME CUDA_HOME=/usr/local/cuda From 53b46ced86d53de1d38ffcb4dd26203bdaf005cb Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 3 Apr 2023 14:18:27 -0700 Subject: [PATCH 07/41] gres.conf update to properly detect GPUs in Slurm and corrected the CPU affinity cores for BM.GPU4.8 --- playbooks/roles/slurm/templates/gres.conf.j2 | 50 ++++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/playbooks/roles/slurm/templates/gres.conf.j2 b/playbooks/roles/slurm/templates/gres.conf.j2 index f241cdd9..8854db24 100644 --- a/playbooks/roles/slurm/templates/gres.conf.j2 +++ b/playbooks/roles/slurm/templates/gres.conf.j2 @@ -2,43 +2,43 @@ {% for partition in queues %} {% for instance in partition.instance_types %} {% if instance.shape == "BM.GPU2.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] AutoDetect=nvml
+NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] {% elif instance.shape == "VM.GPU2.1"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-11] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-11] {% elif instance.shape == "VM.GPU3.1"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=V100 Cores=[0-5] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=V100 Cores=[0-5] {% elif instance.shape == "VM.GPU3.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=V100 Cores=[0-11] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=V100 Cores=[0-11] {% elif instance.shape == "VM.GPU3.4"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-23] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-23] {% elif instance.shape == "BM.GPU3.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-25] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=V100 Cores=[26-51] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-25] 
+NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=V100 Cores=[26-51] {% elif instance.shape == "BM.GPU4.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[24-31] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[8-15] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[56-63] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[40-47] {% elif instance.shape == "BM.GPU.B4.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml 
+NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] {% elif instance.shape == "BM.GPU.A100-v2.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] {% elif instance.shape == "BM.GPU.T1.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] 
Type=A10 Cores=[0-31] {% elif instance.shape == "BM.GPU.A10.4" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] AutoDetect=nvml -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] {% elif instance.shape == "VM.GPU.A10.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-29] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-29] {% elif instance.shape == "VM.GPU.A10.1" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=A10 Cores=[0-14] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=A10 Cores=[0-14] {% endif %} {% endfor %} {% endfor %} \ No newline at end of file From e216d3da704f2723669a035276805c6bf9e5be2e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 4 Apr 2023 12:46:31 -0600 Subject: [PATCH 08/41] Shorten the RackIDs to avoid scheduling errors --- playbooks/roles/slurm/tasks/compute-rack-aware.yml | 2 +- playbooks/roles/slurm/tasks/destroy-rack-aware.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 6da70b8f..ac62ccea 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -89,7 +89,7 @@ - name: Set RackID fact 
set_fact: - rackID: "{{ rackID_script.stdout[1:-1]}}" + rackID: "{{ rackID_script.stdout[1:-41]}}" - name: Get nodes from Inactive Switch vars: diff --git a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml index fb4604d3..dc36daf7 100755 --- a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml @@ -82,7 +82,7 @@ - name: Get RackID set_fact: - rackID: "{{ rackID_script.stdout[1:-1]}}" + rackID: "{{ rackID_script.stdout[1:-41]}}" - name: Get rackIDs set_fact: From e5ee197b3e665f12bf8f9b3abab7e6529f2a7ee5 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 4 Apr 2023 12:47:22 -0600 Subject: [PATCH 09/41] Run the bastion.sh later to avoid package issues --- bastion.tf | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/bastion.tf b/bastion.tf index 172d630f..94c0a687 100644 --- a/bastion.tf +++ b/bastion.tf @@ -173,23 +173,6 @@ resource "null_resource" "bastion" { private_key = tls_private_key.ssh.private_key_pem } } - - - provisioner "remote-exec" { - inline = [ - "#!/bin/bash", - "chmod 600 /home/${var.bastion_username}/.ssh/cluster.key", - "cp /home/${var.bastion_username}/.ssh/cluster.key /home/${var.bastion_username}/.ssh/id_rsa", - "chmod a+x /opt/oci-hpc/bin/*.sh", - "timeout --foreground 60m /opt/oci-hpc/bin/bastion.sh" - ] - connection { - host = local.host - type = "ssh" - user = var.bastion_username - private_key = tls_private_key.ssh.private_key_pem - } - } } resource "null_resource" "cluster" { depends_on = [null_resource.bastion, null_resource.backup, oci_core_cluster_network.cluster_network, oci_core_instance.bastion, oci_core_volume_attachment.bastion_volume_attachment ] @@ -438,6 +421,10 @@ provisioner "file" { provisioner "remote-exec" { inline = [ "#!/bin/bash", + "chmod 600 /home/${var.bastion_username}/.ssh/cluster.key", + "cp /home/${var.bastion_username}/.ssh/cluster.key 
/home/${var.bastion_username}/.ssh/id_rsa", + "chmod a+x /opt/oci-hpc/bin/*.sh", + "timeout --foreground 60m /opt/oci-hpc/bin/bastion.sh", "chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh", "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem", "echo ${var.configure} > /tmp/configure.conf", From 405b571e83df45128887bf188213c02705041d19 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 5 Apr 2023 16:34:26 -0600 Subject: [PATCH 10/41] Fix work request issue in case of inst-pool fail --- bin/create_cluster.sh | 111 ++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 52 deletions(-) diff --git a/bin/create_cluster.sh b/bin/create_cluster.sh index 0cdba575..486dcdb8 100755 --- a/bin/create_cluster.sh +++ b/bin/create_cluster.sh @@ -96,63 +96,70 @@ do end_timestamp=`date -u +'%F %T'` runtime=$((end-start)) if [ $status -eq 0 ] + then + echo "Successfully created $2 in $runtime seconds" + rm currently_building + if [ -f $monitoring_folder/activated ] then - echo "Successfully created $2 in $runtime seconds" - rm currently_building - if [ -f $monitoring_folder/activated ] - then - ocid=`tail $logs_folder/create_$2_${date}.log | grep "cluster_ocid =" | awk '{print $3}'` - ips=`tail $logs_folder/create_$2_${date}.log | grep "private_ips =" | awk '{print $3}'` - hostnames=`tail $logs_folder/create_$2_${date}.log | grep "hostnames =" | awk '{print $3}'` - ocids=`tail $logs_folder/create_$2_${date}.log | grep "ocids =" | awk '{print $3}'` - mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET cluster_OCID='${ocid:1:-1}',created='$end_timestamp',state='running',creation_time=SEC_TO_TIME($runtime) WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1 - export IFS="," - for ip in ${ips:1:-5}; do - ip_array+=( $ip ) - done - for ocid in ${ocids:1:-5}; do - ocid_array+=( $ocid ) - done - for hostname in ${hostnames:1:-1}; do - hostname_array+=( $hostname ) - done - for index in 
"${!ip_array[@]}"; do - mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE nodes SET created='$end_timestamp',state='running',hostname='${hostname_array[$index]}',ip='${ip_array[$index]}',node_OCID='${ocid_array[$index]}' WHERE cluster_id='$2_${date}' AND cluster_index=$(($index+1));" >> $logs_folder/create_$2_${date}.log 2>&1 - done - fi - break - else - ERROR_MSG=`cat $logs_folder/create_$2_${date}.log | grep Error: | grep -o 'Output.*'` - if [ "$ERROR_MSG" == "" ] - then - ERROR_MSG=`cat $logs_folder/create_$2_${date}.log | grep Error:` - fi - comp_tmp=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .compartmentId` - compartment_ocid=${comp_tmp:1:-1} + ocid=`tail $logs_folder/create_$2_${date}.log | grep "cluster_ocid =" | awk '{print $3}'` + ips=`tail $logs_folder/create_$2_${date}.log | grep "private_ips =" | awk '{print $3}'` + hostnames=`tail $logs_folder/create_$2_${date}.log | grep "hostnames =" | awk '{print $3}'` + ocids=`tail $logs_folder/create_$2_${date}.log | grep "ocids =" | awk '{print $3}'` + mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET cluster_OCID='${ocid:1:-1}',created='$end_timestamp',state='running',creation_time=SEC_TO_TIME($runtime) WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1 + export IFS="," + for ip in ${ips:1:-5}; do + ip_array+=( $ip ) + done + for ocid in ${ocids:1:-5}; do + ocid_array+=( $ocid ) + done + for hostname in ${hostnames:1:-1}; do + hostname_array+=( $hostname ) + done + for index in "${!ip_array[@]}"; do + mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE nodes SET created='$end_timestamp',state='running',hostname='${hostname_array[$index]}',ip='${ip_array[$index]}',node_OCID='${ocid_array[$index]}' WHERE cluster_id='$2_${date}' AND cluster_index=$(($index+1));" >> $logs_folder/create_$2_${date}.log 2>&1 + done + fi + break + else 
+ ERROR_MSG=`cat $logs_folder/create_$2_${date}.log | grep Error: | grep -o 'Output.*'` + if [ "$ERROR_MSG" == "" ] + then + ERROR_MSG=`cat $logs_folder/create_$2_${date}.log | grep Error:` + fi + comp_tmp=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .compartmentId` + compartment_ocid=${comp_tmp:1:-1} - inst_pool_ocid=`oci compute-management instance-pool list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --display-name $2 | jq '.data | sort_by(."time-created" | split(".") | .[0] | strptime("%Y-%m-%dT%H:%M:%S")) |.[-1] .id'` >> $logs_folder/create_$2_${date}.log 2>&1 - if [ "$inst_pool_ocid" == "" ] - then - inst_pool_work_request_error_messages="" - else - requestID=`oci work-requests work-request list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --resource-id ${inst_pool_ocid:1:-1} | jq '.data | .[] | select(."operation-type"=="LaunchInstancesInPool") | .id'` >> $logs_folder/create_$2_${date}.log 2>&1 - inst_pool_work_request_error_messages=`oci work-requests work-request-error list --work-request-id ${requestID:1:-1} --auth instance_principal --region $region --all | jq '.data | .[] | .message '` >> $logs_folder/create_$2_${date}.log 2>&1 - fi - if [ "$inst_pool_work_request_error_messages" == "" ] - then - cn_ocid=`oci compute-management cluster-network list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --display-name $2 | jq '.data | sort_by(."time-created" | split(".") | .[0] | strptime("%Y-%m-%dT%H:%M:%S")) |.[-1] .id'` >> $logs_folder/create_$2_${date}.log 2>&1 + inst_pool_ocid=`oci compute-management instance-pool list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --display-name $2 | jq '.data | sort_by(."time-created" | split(".") | .[0] | strptime("%Y-%m-%dT%H:%M:%S")) |.[-1] .id'` >> $logs_folder/create_$2_${date}.log 2>&1 + if [ "$inst_pool_ocid" == "" ] + 
then + inst_pool_work_request_error_messages="" + else + requestID=`oci work-requests work-request list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --resource-id ${inst_pool_ocid:1:-1} | jq '.data | .[] | select(."operation-type"=="LaunchInstancesInPool") | .id'` >> $logs_folder/create_$2_${date}.log 2>&1 + inst_pool_work_request_error_messages=`oci work-requests work-request-error list --work-request-id ${requestID:1:-1} --auth instance_principal --region $region --all | jq '.data | .[] | .message '` >> $logs_folder/create_$2_${date}.log 2>&1 + fi + if [ "$inst_pool_work_request_error_messages" == "" ] && [ "$cluster_network" == "true" ] + then + cn_ocid=`oci compute-management cluster-network list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --display-name $2 | jq '.data | sort_by(."time-created" | split(".") | .[0] | strptime("%Y-%m-%dT%H:%M:%S")) |.[-1] .id'` >> $logs_folder/create_$2_${date}.log 2>&1 + if [ "$cn_ocid" == "" ] + then + cn_work_request_error_messages="" + else requestID=`oci work-requests work-request list --compartment-id $compartment_ocid --auth instance_principal --region $region --all --resource-id ${cn_ocid:1:-1} | jq '.data | .[] | select(."operation-type"=="CreateClusterNetworkReservation") | .id'` >> $logs_folder/create_$2_${date}.log 2>&1 cn_work_request_error_messages=`oci work-requests work-request-log-entry list --work-request-id ${requestID:1:-1} --auth instance_principal --region $region --all | jq '.data | .[] | .message '` >> $logs_folder/create_$2_${date}.log 2>&1 - fi - echo "Could not create $2 with $1 nodes in $runtime seconds" - echo "$ERROR_MSG $inst_pool_work_request_error_messages $cn_work_request_error_messages" | tee -a $logs_folder/create_$2_${date}.log 2>&1 + fi + else + cn_work_request_error_messages="" + fi + echo "Could not create $2 with $1 nodes in $runtime seconds" + echo "$ERROR_MSG $inst_pool_work_request_error_messages 
$cn_work_request_error_messages" | tee -a $logs_folder/create_$2_${date}.log 2>&1 - if [ -f $monitoring_folder/activated ] - then - mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; INSERT INTO cluster_log.errors_timeserie (cluster_id,state,error_log,error_type,nodes,created_on_m,class_name) VALUES ('$2_${date}','creation','$logs_folder/create_$2_${date}.log','$ERROR_MSG $inst_pool_work_request_error_messages $cn_work_request_error_messages','$1','$end_timestamp','$4');" >> $logs_folder/create_$2_${date}.log 2>&1 - mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET state='deleting',creation_error='`tail $logs_folder/create_$2_${date}.log | grep Error`' WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1 - fi - rm currently_building + if [ -f $monitoring_folder/activated ] + then + mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; INSERT INTO cluster_log.errors_timeserie (cluster_id,state,error_log,error_type,nodes,created_on_m,class_name) VALUES ('$2_${date}','creation','$logs_folder/create_$2_${date}.log','$ERROR_MSG $inst_pool_work_request_error_messages $cn_work_request_error_messages','$1','$end_timestamp','$4');" >> $logs_folder/create_$2_${date}.log 2>&1 + mysql -u $ENV_MYSQL_USER -p$ENV_MYSQL_PASS -e "use $ENV_MYSQL_DATABASE_NAME; UPDATE cluster_log.clusters SET state='deleting',creation_error='`tail $logs_folder/create_$2_${date}.log | grep Error`' WHERE id='$2_${date}';" >> $logs_folder/create_$2_${date}.log 2>&1 + fi + rm currently_building fi done From 360f31375841a995a3349bce0966a96f8f704406 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 5 Apr 2023 16:34:49 -0600 Subject: [PATCH 11/41] Formatting change --- autoscaling/crontab/autoscale_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh index 7e2a0aa7..bc5ce204 
100755 --- a/autoscaling/crontab/autoscale_slurm.sh +++ b/autoscaling/crontab/autoscale_slurm.sh @@ -329,7 +329,7 @@ def getstatus_slurm(): return cluster_to_build,cluster_to_destroy,nodes_to_destroy,cluster_building,cluster_destroying,used_index,current_nodes,building_nodes if os.path.isfile(lockfile): - print("Lockfile "+lockfile + " is present, exiting") + print( "Lockfile "+lockfile + " is present, exiting" ) exit() open(lockfile,'w').close() try: From af7210a1d27bc841608593dbb8a1418c661c8d0b Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 7 Apr 2023 13:29:51 -0600 Subject: [PATCH 12/41] Fix Slurm version for 23.02.1 --- playbooks/roles/slurm/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/slurm/defaults/main.yml b/playbooks/roles/slurm/defaults/main.yml index 10fbd3f6..8b0f3f40 100755 --- a/playbooks/roles/slurm/defaults/main.yml +++ b/playbooks/roles/slurm/defaults/main.yml @@ -9,4 +9,4 @@ slurm_uid: 1501 munge_gid: 1500 munge_uid: 1500 rack_aware_playbook_suffix: "{% if rack_aware|bool %}-rack-aware{% endif%}" -slurm_version: "23.02.0_1.0" \ No newline at end of file +slurm_version: "23.02.1-1" \ No newline at end of file From 2a663a70285166d367f17ec6121b1c80ce750f25 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 7 Apr 2023 13:39:46 -0600 Subject: [PATCH 13/41] Update terraform version to 4.115 --- autoscaling/tf_init/versions.tf | 2 +- versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 458fd9db..577f3255 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.112.0" + version = "4.115.0" } } } \ No newline at end of file diff --git a/versions.tf b/versions.tf index 458fd9db..577f3255 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { 
required_providers { oci = { source = "oracle/oci" - version = "4.112.0" + version = "4.115.0" } } } \ No newline at end of file From 1621d6942a081616a9a9943a167cf0deaf8a256f Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 14 Apr 2023 16:58:35 -0600 Subject: [PATCH 14/41] Make Enroot directory writable to fix pyxis error --- playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml | 2 +- playbooks/roles/nvidia-enroot/tasks/ubuntu.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml index 41e0e56b..6d9ec324 100644 --- a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml +++ b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml @@ -110,7 +110,7 @@ file: path: "{{enroot_top_path_checked}}/{{item}}" state: directory - mode: '0775' + mode: '0777' owner: opc group: "{{privilege_group_name}}" recurse: no diff --git a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml index cdbcaa00..65a700ad 100644 --- a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml @@ -106,7 +106,7 @@ file: path: "{{enroot_top_path_checked}}/{{item}}" state: directory - mode: '0775' + mode: '0777' owner: "{{ ansible_user }}" group: "{{privilege_group_name}}" recurse: no From 86b06aeb73dcdc8b4b041fa6bee64348e10384b5 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 14 Apr 2023 16:58:54 -0600 Subject: [PATCH 15/41] Make LDAP user part of privilege group by default --- playbooks/roles/cluster-cli/files/cluster | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/playbooks/roles/cluster-cli/files/cluster b/playbooks/roles/cluster-cli/files/cluster index 2bf352bf..a91c2ebf 100755 --- a/playbooks/roles/cluster-cli/files/cluster +++ b/playbooks/roles/cluster-cli/files/cluster @@ -161,7 +161,7 @@ def list(): confirmation_prompt=True) @click.option('-n',
'--name', prompt='Full Name', required=True) @click.option('-i', '--uid', default=None, help='Select the userID') -@click.option('-g', '--gid', default=None, help='Add to this groupID') +@click.option('-g', '--gid', default="9876", help='Add to this groupID') @click.option('-nossh', '--nossh', is_flag=True, default=False, help='Flag to not generate a user-specific ssh-key pair for passwordless ssh.') def add(user, password, uid, gid, name, nossh): """ add user """ From 7078f0ea6a692ddcc8ab7e98f5c03afb0277871f Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 14 Apr 2023 17:07:37 -0700 Subject: [PATCH 16/41] added NCCL with srun examples and running NCCL with containers example --- ...allreduce_containers_with_ordering.sbatch} | 0 samples/gpu/nccl_run_allreduce_srun.sbatch | 60 ++++++++++++++ samples/gpu/nccl_run_allreduce_srun.sh | 82 +++++++++++++++++++ 3 files changed, 142 insertions(+) rename samples/gpu/{nccl_run_allreduce_containers.sbatch => nccl_run_allreduce_containers_with_ordering.sbatch} (100%) create mode 100644 samples/gpu/nccl_run_allreduce_srun.sbatch create mode 100644 samples/gpu/nccl_run_allreduce_srun.sh diff --git a/samples/gpu/nccl_run_allreduce_containers.sbatch b/samples/gpu/nccl_run_allreduce_containers_with_ordering.sbatch similarity index 100% rename from samples/gpu/nccl_run_allreduce_containers.sbatch rename to samples/gpu/nccl_run_allreduce_containers_with_ordering.sbatch diff --git a/samples/gpu/nccl_run_allreduce_srun.sbatch b/samples/gpu/nccl_run_allreduce_srun.sbatch new file mode 100644 index 00000000..6f98f949 --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_srun.sbatch @@ -0,0 +1,60 @@ +#!/bin/bash +#SBATCH --job-name=nccl-allreduce-srun +#SBATCH --nodes=2 +#SBATCH --gpus-per-node=8 +#SBATCH --ntasks-per-node=8 +#SBATCH --exclusive +export PMI_DEBUG=1 + + +cd /nfs/scratch +mkdir $SLURM_JOB_ID +cd $SLURM_JOB_ID + +MACHINEFILE="hostfile" + +scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE +echo INPUTFILE +cat 
$MACHINEFILE + +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $mpivars_path +echo $mpivars_path + +USER=`whoami` + +shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` +if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] +then + var_UCX_NET_DEVICES=mlx5_0:1 + var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12" +elif [ $shape == \"BM.GPU4.8\" ] +then + var_UCX_NET_DEVICES=mlx5_4:1 + var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17" +fi + +export NCCL_DEBUG=WARN \ + OMPI_MCA_coll=^hcoll \ + RX_QUEUE_LEN=8192 \ + IB_RX_QUEUE_LEN=8192 \ + NCCL_IGNORE_CPU_AFFINITY=1 \ + NCCL_IB_SL=0 \ + NCCL_IB_TC=41 \ + NCCL_IB_QPS_PER_CONNECTION=4 \ + UCX_TLS=ud,self,sm \ + UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + HCOLL_ENABLE_MCAST_ALL=0 \ + coll_hcoll_enable=0 \ + NCCL_IB_GID_INDEX=3 \ + NCCL_ALGO=Ring \ + NCCL_IB_HCA="${var_NCCL_IB_HCA}" + srun --mpi=pmix_v3 --gpus-per-node=$SLURM_GPUS_PER_NODE --ntasks-per-node=$SLURM_NTASKS_PER_NODE /home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 diff --git a/samples/gpu/nccl_run_allreduce_srun.sh b/samples/gpu/nccl_run_allreduce_srun.sh new file mode 100644 index 00000000..01cfb047 --- /dev/null +++ b/samples/gpu/nccl_run_allreduce_srun.sh @@ -0,0 +1,82 @@ +#!/bin/bash +set -e + +# number of times to run the nccl test to stress the GPUs and RDMA network. 
This is different from -n iterations parameter of nccl allreduce which is set below using $iter +max=$1 + +# This assumes that the hostfile passed is already ordered based on their rackId or slurm 23.02 and higher will order it based on topology +if [ -n "$2" ]; then + hostfile=$2 +else + hostfile="/tmp/ordered_hostfile_system_name" +fi + +echo INPUTFILE +cat $hostfile + +if [ -n "$3" ]; then + logfile=$3 +else + logfile="nccl_run_allreduce_srun.sh.log" +fi + +echo $logfile + +for x in $(seq 1 1 $max) +do + + echo $x + echo $x >> $logfile + date >> $logfile + + hostfile=$hostfile + + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + + if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` + fi + + if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + + source $mpivars_path + echo $mpivars_path + + USER=`whoami` + + first_node=`head $hostfile -n 1` + shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` + if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] + then + var_UCX_NET_DEVICES=mlx5_0:1 + var_NCCL_IB_HCA="=mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12" + elif [ $shape == \"BM.GPU4.8\" ] + then + var_UCX_NET_DEVICES=mlx5_4:1 + var_NCCL_IB_HCA="=mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17" + fi + + export NCCL_DEBUG=WARN \ + OMPI_MCA_coll=^hcoll \ + RX_QUEUE_LEN=8192 \ + IB_RX_QUEUE_LEN=8192 \ + NCCL_IGNORE_CPU_AFFINITY=1 \ + NCCL_IB_SL=0 \ + NCCL_IB_TC=41 \ + NCCL_IB_QPS_PER_CONNECTION=4 \ + UCX_TLS=ud,self,sm \ + UCX_NET_DEVICES=${var_UCX_NET_DEVICES} \ + HCOLL_ENABLE_MCAST_ALL=0 \ + coll_hcoll_enable=0 \ + NCCL_IB_GID_INDEX=3 \ + NCCL_ALGO=Ring \ + NCCL_IB_HCA="${var_NCCL_IB_HCA}" + srun --mpi=pmix_v3 --nodefile=$hostfile --gpus-per-node=8 --ntasks-per-node=8 
/home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> $logfile + + + + tail -n 32 $logfile + + +done \ No newline at end of file From 7ce26bf0a32150fde3ab9f2d48509f7d9a8bd6b5 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 14 Apr 2023 17:15:37 -0700 Subject: [PATCH 17/41] removed few commented lines from node_ordering python file --- playbooks/roles/rack-aware/files/node_ordering_by_rack.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py index 252b1c05..d86b2d07 100644 --- a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py +++ b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py @@ -51,18 +51,14 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None): from pssh.clients import ParallelSSHClient client = ParallelSSHClient(hosts) output = client.run_command('curl http://169.254.169.254/opc/v1/host/') - #print(output) for host_out in output: j = json.loads(bytearray(''.join(list(host_out.stdout)).encode())) - #print(j) if j['rackId'] in r: r[j['rackId']].append( host_out.host ) else: r[j['rackId']] = [ host_out.host ] hostname_output = client.run_command('/usr/bin/hostname') - #print(hostname_output) for host_out in hostname_output: - #j = bytearray(''.join(list(host_out.stdout)).encode()) j = bytearray(''.join(list(host_out.stdout)).encode()) friendly_name_to_system_hostname[host_out.host] = j.decode(encoding='ascii') #print(j.decode(encoding='ascii')+" "+host_out.host) From 73c7f368b027e0352b5e27de58ae3538a988f580 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Fri, 14 Apr 2023 23:01:11 -0600 Subject: [PATCH 18/41] Add PMIx to OL7 --- playbooks/roles/slurm/tasks/common.yml | 6 +++- playbooks/roles/slurm/tasks/common_pmix.yml | 38 +++++++++++++++++++++ playbooks/roles/slurm/vars/el_vars.yml | 11 +++++- 3 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 
playbooks/roles/slurm/tasks/common_pmix.yml diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index c5714504..4b86b7f3 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -188,4 +188,8 @@ - name: Include pyxis prolog files include: common_pyxis.yml - when: pyxis|bool \ No newline at end of file + when: pyxis|bool + +- name: Include pyxis prolog files + include: common_pmix.yml + when: ansible_os_family == 'RedHat' \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/common_pmix.yml b/playbooks/roles/slurm/tasks/common_pmix.yml new file mode 100644 index 00000000..8d59fe2c --- /dev/null +++ b/playbooks/roles/slurm/tasks/common_pmix.yml @@ -0,0 +1,38 @@ +--- + +- name: install required packages + vars: + package_name: + - libev + - libev-devel + - libevent-devel + - hwloc + - hwloc-devel + include_role: + name: safe_yum + when: ansible_os_family == 'RedHat' + +- name: Download slurm .rpm + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/tgnPgvPv68JpWqLklTNY86rBsJ0z7Ebp3zs7Ud4X2_R8TZFgpm26kh08QHKI3dXU/n/hpc/b/source/o/pmix/{{item}}" + dest: "{{ download_path }}/slurm_rpms" + when: ansible_os_family == 'RedHat' and download_path == '/tmp' + with_items: "{{pmix_download_packages}}" + + +- name: Download slurm .rpm + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/tgnPgvPv68JpWqLklTNY86rBsJ0z7Ebp3zs7Ud4X2_R8TZFgpm26kh08QHKI3dXU/n/hpc/b/source/o/pmix/{{item}}" + dest: "{{ download_path }}/slurm_rpms" + when: ansible_os_family == 'RedHat' and download_path != '/tmp' + delegate_to: 127.0.0.1 + run_once: true + with_items: "{{ pmix_download_packages }}" + +- name: install PMIx packages RedHat + vars: + package_name: '{{ pmix_packages }}' + disable_gpg_check_var: True + include_role: + name: safe_yum + when: ansible_os_family == 'RedHat' \ No newline at end of file diff --git a/playbooks/roles/slurm/vars/el_vars.yml 
b/playbooks/roles/slurm/vars/el_vars.yml index cabe9ef0..7296db1c 100644 --- a/playbooks/roles/slurm/vars/el_vars.yml +++ b/playbooks/roles/slurm/vars/el_vars.yml @@ -50,4 +50,13 @@ slurm_backup_server_packages: slurm_login_packages: - "{{ download_path }}/slurm_rpms/slurm-pam_slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-libpmi-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - - "{{ download_path }}/slurm_rpms/slurm-slurmd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" \ No newline at end of file + - "{{ download_path }}/slurm_rpms/slurm-slurmd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + + +pmix_download_packages: + - "pmix-3.2.4-1.el7.x86_64.rpm" + - "pmix-devel-3.2.4-1.el7.x86_64.rpm" + +pmix_packages: + - "{{ download_path }}/slurm_rpms/pmix-3.2.4-1.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/pmix-devel-3.2.4-1.el7.x86_64.rpm" From 37a0e93c9e863481f880440dcdc9ab0611028f0f Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 17 Apr 2023 16:18:36 -0700 Subject: [PATCH 19/41] updated user in compile.sh --- samples/nccl_compile/compile.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/samples/nccl_compile/compile.sh b/samples/nccl_compile/compile.sh index 2e675f9c..f04b4cd2 100644 --- a/samples/nccl_compile/compile.sh +++ b/samples/nccl_compile/compile.sh @@ -14,12 +14,7 @@ if [[ "$mpivars_path" == "" ]]; then source $mpivars_path MPI_HOME=${mpivars_path%%/bin*} -source /etc/os-release -if [ $ID == "ol" ] || [ $ID == "centos" ] ; then - USER=opc -elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then - USER=ubuntu -fi +USER=`whoami` cd /home/$USER rm -rf nccl-tests From b7a2b2fd1633b42475127b9b792238ba4d7763e0 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 17 Apr 2023 17:22:51 -0600 Subject: [PATCH 20/41] Add PMIX to OL8 and CentOS slurm install --- 
playbooks/roles/slurm/vars/centos_vars.yml | 11 ++++++++++- playbooks/roles/slurm/vars/el_vars.yml | 8 ++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/playbooks/roles/slurm/vars/centos_vars.yml b/playbooks/roles/slurm/vars/centos_vars.yml index 7498933c..ed17b849 100644 --- a/playbooks/roles/slurm/vars/centos_vars.yml +++ b/playbooks/roles/slurm/vars/centos_vars.yml @@ -51,4 +51,13 @@ slurm_backup_server_packages: slurm_login_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-centos-libpmi-{{slurm_version}}.el7.x86_64.rpm" - - "{{ download_path }}/slurm_rpms/slurm-centos-slurmd-{{slurm_version}}.el7.x86_64.rpm" \ No newline at end of file + - "{{ download_path }}/slurm_rpms/slurm-centos-slurmd-{{slurm_version}}.el7.x86_64.rpm" + + +pmix_download_packages: + - "pmix-centos-3.2.4-1.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "pmix-centos-devel-3.2.4-1.el{{ansible_distribution_major_version}}.x86_64.rpm" + +pmix_packages: + - "{{ download_path }}/slurm_rpms/pmix-centos-3.2.4-1.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/pmix-centos-devel-3.2.4-1.el{{ansible_distribution_major_version}}.x86_64.rpm" \ No newline at end of file diff --git a/playbooks/roles/slurm/vars/el_vars.yml b/playbooks/roles/slurm/vars/el_vars.yml index 7296db1c..d7149d6e 100644 --- a/playbooks/roles/slurm/vars/el_vars.yml +++ b/playbooks/roles/slurm/vars/el_vars.yml @@ -54,9 +54,9 @@ slurm_login_packages: pmix_download_packages: - - "pmix-3.2.4-1.el7.x86_64.rpm" - - "pmix-devel-3.2.4-1.el7.x86_64.rpm" + - "pmix-3.2.4-1.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "pmix-devel-3.2.4-1.el{{ansible_distribution_major_version}}.x86_64.rpm" pmix_packages: - - "{{ download_path }}/slurm_rpms/pmix-3.2.4-1.el7.x86_64.rpm" - - "{{ download_path }}/slurm_rpms/pmix-devel-3.2.4-1.el7.x86_64.rpm" + - "{{ download_path 
}}/slurm_rpms/pmix-3.2.4-1.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/pmix-devel-3.2.4-1.el{{ansible_distribution_major_version}}.x86_64.rpm" From 1da996907bb277e6695d53626902b87159d6f0f2 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Wed, 19 Apr 2023 09:07:18 -0600 Subject: [PATCH 21/41] RM hwloc-devel. Not needed, not always available --- playbooks/roles/slurm/tasks/common_pmix.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/playbooks/roles/slurm/tasks/common_pmix.yml b/playbooks/roles/slurm/tasks/common_pmix.yml index 8d59fe2c..630e2530 100644 --- a/playbooks/roles/slurm/tasks/common_pmix.yml +++ b/playbooks/roles/slurm/tasks/common_pmix.yml @@ -7,7 +7,6 @@ - libev-devel - libevent-devel - hwloc - - hwloc-devel include_role: name: safe_yum when: ansible_os_family == 'RedHat' From 93bee09baf2adbcad0b1356c20ed52a299b78894 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 25 Apr 2023 11:35:25 -0700 Subject: [PATCH 22/41] enable sssd and restart sshd --- playbooks/roles/sssd/tasks/debian.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/playbooks/roles/sssd/tasks/debian.yml b/playbooks/roles/sssd/tasks/debian.yml index 9e0c1d71..afac72c7 100644 --- a/playbooks/roles/sssd/tasks/debian.yml +++ b/playbooks/roles/sssd/tasks/debian.yml @@ -15,6 +15,7 @@ owner: 'root' group: 'root' mode: '0600' + notify: restart sssd - name: Copy CA certificate copy: @@ -53,11 +54,22 @@ include_role: name: safe_yum +- name: Enable sssd service + systemd: + name: sssd + enabled: "yes" + +- name: Start sssd service + systemd: + name: sssd + state: started + - name: Update sshd configuration lineinfile: path: /etc/ssh/sshd_config regexp: '^PasswordAuthentication' line: PasswordAuthentication no notify: + - restart sshd - restart ns daemons - restart login services From 86d14893e036dee0e21432388410c4b07ba74b16 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 25 Apr 2023 14:24:10 -0700 Subject: [PATCH 23/41] 
add mpivars.sh file for ubuntu --- playbooks/new_nodes.yml | 1 + playbooks/resize_add.yml | 1 + playbooks/roles/mpivars/.travis.yml | 29 +++++++++++ playbooks/roles/mpivars/README.md | 38 ++++++++++++++ playbooks/roles/mpivars/defaults/main.yml | 2 + playbooks/roles/mpivars/handlers/main.yml | 2 + playbooks/roles/mpivars/meta/main.yml | 52 ++++++++++++++++++++ playbooks/roles/mpivars/tasks/main.yml | 4 ++ playbooks/roles/mpivars/tasks/ubuntu.yml | 22 +++++++++ playbooks/roles/mpivars/templates/mpivars.j2 | 25 ++++++++++ playbooks/roles/mpivars/tests/inventory | 2 + playbooks/roles/mpivars/tests/test.yml | 5 ++ playbooks/roles/mpivars/vars/main.yml | 3 ++ playbooks/site.yml | 1 + 14 files changed, 187 insertions(+) create mode 100644 playbooks/roles/mpivars/.travis.yml create mode 100644 playbooks/roles/mpivars/README.md create mode 100644 playbooks/roles/mpivars/defaults/main.yml create mode 100644 playbooks/roles/mpivars/handlers/main.yml create mode 100644 playbooks/roles/mpivars/meta/main.yml create mode 100644 playbooks/roles/mpivars/tasks/main.yml create mode 100644 playbooks/roles/mpivars/tasks/ubuntu.yml create mode 100644 playbooks/roles/mpivars/templates/mpivars.j2 create mode 100644 playbooks/roles/mpivars/tests/inventory create mode 100644 playbooks/roles/mpivars/tests/test.yml create mode 100644 playbooks/roles/mpivars/vars/main.yml diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index c54b519f..c6f468fe 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -33,6 +33,7 @@ - limits - mpi-hostfiles - boot-volume + - mpivars - hosts: compute become: true diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 11ed903e..697f20ee 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -31,6 +31,7 @@ - ssh - limits - boot-volume + - mpivars - hosts: compute_to_add become: true diff --git a/playbooks/roles/mpivars/.travis.yml b/playbooks/roles/mpivars/.travis.yml new file mode 100644 index 
00000000..36bbf620 --- /dev/null +++ b/playbooks/roles/mpivars/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/playbooks/roles/mpivars/README.md b/playbooks/roles/mpivars/README.md new file mode 100644 index 00000000..225dd44b --- /dev/null +++ b/playbooks/roles/mpivars/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 
+ +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/playbooks/roles/mpivars/defaults/main.yml b/playbooks/roles/mpivars/defaults/main.yml new file mode 100644 index 00000000..03c7ceb8 --- /dev/null +++ b/playbooks/roles/mpivars/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for mpivars diff --git a/playbooks/roles/mpivars/handlers/main.yml b/playbooks/roles/mpivars/handlers/main.yml new file mode 100644 index 00000000..b8d13e5e --- /dev/null +++ b/playbooks/roles/mpivars/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for mpivars diff --git a/playbooks/roles/mpivars/meta/main.yml b/playbooks/roles/mpivars/meta/main.yml new file mode 100644 index 00000000..c572acc9 --- /dev/null +++ b/playbooks/roles/mpivars/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. 
+ # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. diff --git a/playbooks/roles/mpivars/tasks/main.yml b/playbooks/roles/mpivars/tasks/main.yml new file mode 100644 index 00000000..77964dc2 --- /dev/null +++ b/playbooks/roles/mpivars/tasks/main.yml @@ -0,0 +1,4 @@ +--- +# tasks file for mpivars +- include: ubuntu.yml + when: ansible_distribution == 'Ubuntu' diff --git a/playbooks/roles/mpivars/tasks/ubuntu.yml b/playbooks/roles/mpivars/tasks/ubuntu.yml new file mode 100644 index 00000000..9e4d01af --- /dev/null +++ b/playbooks/roles/mpivars/tasks/ubuntu.yml @@ -0,0 +1,22 @@ +- name: Check if mpi folder exists + stat: + path: /usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin/ + register: directory + + +- name: Check if mpivars.sh exists + stat: + path: /usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin/mpivars.sh + register: mpivars + when: directory.stat.exists + + +- name: Create mpivars.sh + become: true + template: + src: mpivars.j2 + dest: /usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin/mpivars.sh + force: yes + owner: root + group: root + when: directory.stat.exists and not mpivars.stat.exists \ No newline at end of file diff --git a/playbooks/roles/mpivars/templates/mpivars.j2 b/playbooks/roles/mpivars/templates/mpivars.j2 new file mode 100644 index 
00000000..29e22d05 --- /dev/null +++ b/playbooks/roles/mpivars/templates/mpivars.j2 @@ -0,0 +1,25 @@ +# NOTE: This is an automatically-generated file! (generated by the +# Open MPI/SHMEM RPM). Any changes made here will be lost if the RPM is +# uninstalled or upgraded. + +# PATH +if test -z "`echo $PATH | grep /usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin`"; then + PATH=/usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin:${PATH} + export PATH +fi + +# LD_LIBRARY_PATH +if test -z "`echo $LD_LIBRARY_PATH | grep /usr/mpi/gcc/openmpi-{{ openmpi_version }}/lib`"; then + LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-{{ openmpi_version }}/lib${LD_LIBRARY_PATH:+:}${LD_LIBRARY_PATH} + export LD_LIBRARY_PATH +fi + +# MANPATH +if test -z "`echo $MANPATH | grep /usr/mpi/gcc/openmpi-{{ openmpi_version }}/share/man`"; then + MANPATH=/usr/mpi/gcc/openmpi-{{ openmpi_version }}/share/man:${MANPATH} + export MANPATH +fi + +# MPI_ROOT +MPI_ROOT=/usr/mpi/gcc/openmpi-{{ openmpi_version }} +export MPI_ROOT \ No newline at end of file diff --git a/playbooks/roles/mpivars/tests/inventory b/playbooks/roles/mpivars/tests/inventory new file mode 100644 index 00000000..878877b0 --- /dev/null +++ b/playbooks/roles/mpivars/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/playbooks/roles/mpivars/tests/test.yml b/playbooks/roles/mpivars/tests/test.yml new file mode 100644 index 00000000..e933cd78 --- /dev/null +++ b/playbooks/roles/mpivars/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - mpivars diff --git a/playbooks/roles/mpivars/vars/main.yml b/playbooks/roles/mpivars/vars/main.yml new file mode 100644 index 00000000..b585780c --- /dev/null +++ b/playbooks/roles/mpivars/vars/main.yml @@ -0,0 +1,3 @@ +--- +# vars file for mpivars +openmpi_version: 4.1.2a1 \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index abee7284..f05af732 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -35,6 +35,7 @@ - mpi-hostfiles - 
etc-hosts - boot-volume + - mpivars - hosts: all become: true From a5cd3e0934670fc202d4daff6fd170fa39948b97 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 25 Apr 2023 15:28:45 -0700 Subject: [PATCH 24/41] initial commit for loading nvidia_peermem module --- playbooks/new_nodes.yml | 3 ++ playbooks/resize_add.yml | 3 ++ playbooks/roles/nvidia_peermem/.travis.yml | 29 +++++++++++ playbooks/roles/nvidia_peermem/README.md | 38 ++++++++++++++ .../roles/nvidia_peermem/defaults/main.yml | 2 + .../roles/nvidia_peermem/handlers/main.yml | 2 + playbooks/roles/nvidia_peermem/meta/main.yml | 52 +++++++++++++++++++ .../roles/nvidia_peermem/tasks/common.yml | 15 ++++++ playbooks/roles/nvidia_peermem/tasks/main.yml | 3 ++ .../roles/nvidia_peermem/tests/inventory | 2 + playbooks/roles/nvidia_peermem/tests/test.yml | 5 ++ playbooks/roles/nvidia_peermem/vars/main.yml | 2 + playbooks/site.yml | 3 ++ 13 files changed, 159 insertions(+) create mode 100644 playbooks/roles/nvidia_peermem/.travis.yml create mode 100644 playbooks/roles/nvidia_peermem/README.md create mode 100644 playbooks/roles/nvidia_peermem/defaults/main.yml create mode 100644 playbooks/roles/nvidia_peermem/handlers/main.yml create mode 100644 playbooks/roles/nvidia_peermem/meta/main.yml create mode 100644 playbooks/roles/nvidia_peermem/tasks/common.yml create mode 100644 playbooks/roles/nvidia_peermem/tasks/main.yml create mode 100644 playbooks/roles/nvidia_peermem/tests/inventory create mode 100644 playbooks/roles/nvidia_peermem/tests/test.yml create mode 100644 playbooks/roles/nvidia_peermem/vars/main.yml diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index c54b519f..690abb1f 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -48,6 +48,9 @@ - include_role: name: rdma-interface when: cluster_network|bool + - include_role: + name: nvidia_peermem + when: cluster_network|bool - hosts: bastion,slurm_backup,login,compute become: true diff --git a/playbooks/resize_add.yml 
b/playbooks/resize_add.yml index 11ed903e..07762aa1 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -46,6 +46,9 @@ - include_role: name: rdma-interface when: cluster_network|bool + - include_role: + name: nvidia_peermem + when: cluster_network|bool - hosts: bastion,slurm_backup,login,compute become: true diff --git a/playbooks/roles/nvidia_peermem/.travis.yml b/playbooks/roles/nvidia_peermem/.travis.yml new file mode 100644 index 00000000..36bbf620 --- /dev/null +++ b/playbooks/roles/nvidia_peermem/.travis.yml @@ -0,0 +1,29 @@ +--- +language: python +python: "2.7" + +# Use the new container infrastructure +sudo: false + +# Install ansible +addons: + apt: + packages: + - python-pip + +install: + # Install ansible + - pip install ansible + + # Check ansible version + - ansible --version + + # Create ansible.cfg with correct roles_path + - printf '[defaults]\nroles_path=../' >ansible.cfg + +script: + # Basic role syntax check + - ansible-playbook tests/test.yml -i tests/inventory --syntax-check + +notifications: + webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/playbooks/roles/nvidia_peermem/README.md b/playbooks/roles/nvidia_peermem/README.md new file mode 100644 index 00000000..225dd44b --- /dev/null +++ b/playbooks/roles/nvidia_peermem/README.md @@ -0,0 +1,38 @@ +Role Name +========= + +A brief description of the role goes here. + +Requirements +------------ + +Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. + +Role Variables +-------------- + +A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. 
Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. + +Dependencies +------------ + +A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. + +Example Playbook +---------------- + +Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: + + - hosts: servers + roles: + - { role: username.rolename, x: 42 } + +License +------- + +BSD + +Author Information +------------------ + +An optional section for the role authors to include contact information, or a website (HTML is not allowed). diff --git a/playbooks/roles/nvidia_peermem/defaults/main.yml b/playbooks/roles/nvidia_peermem/defaults/main.yml new file mode 100644 index 00000000..8abfb007 --- /dev/null +++ b/playbooks/roles/nvidia_peermem/defaults/main.yml @@ -0,0 +1,2 @@ +--- +# defaults file for nvidia_peermem diff --git a/playbooks/roles/nvidia_peermem/handlers/main.yml b/playbooks/roles/nvidia_peermem/handlers/main.yml new file mode 100644 index 00000000..d2bbc816 --- /dev/null +++ b/playbooks/roles/nvidia_peermem/handlers/main.yml @@ -0,0 +1,2 @@ +--- +# handlers file for nvidia_peermem diff --git a/playbooks/roles/nvidia_peermem/meta/main.yml b/playbooks/roles/nvidia_peermem/meta/main.yml new file mode 100644 index 00000000..c572acc9 --- /dev/null +++ b/playbooks/roles/nvidia_peermem/meta/main.yml @@ -0,0 +1,52 @@ +galaxy_info: + author: your name + description: your role description + company: your company (optional) + + # If the issue tracker for your role is not on github, uncomment the + # next line and provide a value + # issue_tracker_url: http://example.com/issue/tracker + + # Choose a valid license ID from https://spdx.org - some suggested licenses: + # - BSD-3-Clause (default) + # - MIT + # - GPL-2.0-or-later + # - 
GPL-3.0-only + # - Apache-2.0 + # - CC-BY-4.0 + license: license (GPL-2.0-or-later, MIT, etc) + + min_ansible_version: 2.1 + + # If this a Container Enabled role, provide the minimum Ansible Container version. + # min_ansible_container_version: + + # + # Provide a list of supported platforms, and for each platform a list of versions. + # If you don't wish to enumerate all versions for a particular platform, use 'all'. + # To view available platforms and versions (or releases), visit: + # https://galaxy.ansible.com/api/v1/platforms/ + # + # platforms: + # - name: Fedora + # versions: + # - all + # - 25 + # - name: SomePlatform + # versions: + # - all + # - 1.0 + # - 7 + # - 99.99 + + galaxy_tags: [] + # List tags for your role here, one per line. A tag is a keyword that describes + # and categorizes the role. Users find roles by searching for tags. Be sure to + # remove the '[]' above, if you add tags to this list. + # + # NOTE: A tag is limited to a single word comprised of alphanumeric characters. + # Maximum 20 tags per role. + +dependencies: [] + # List your role dependencies here, one per line. Be sure to remove the '[]' above, + # if you add dependencies to this list. 
diff --git a/playbooks/roles/nvidia_peermem/tasks/common.yml b/playbooks/roles/nvidia_peermem/tasks/common.yml new file mode 100644 index 00000000..819f0f3c --- /dev/null +++ b/playbooks/roles/nvidia_peermem/tasks/common.yml @@ -0,0 +1,15 @@ +--- +- name: Check if nvidia drivers are installed + shell: lsmod | grep nvidia_peermem | wc -l + register: result + + +- name: Check if nvidia_peermem module is loaded + shell: lsmod | grep nvidia_peermem | wc -l + register: result + + +- name: Load nvidia_peermem module + become: true + shell: modprobe nvidia_peermem + when: register.stdout != 3 \ No newline at end of file diff --git a/playbooks/roles/nvidia_peermem/tasks/main.yml b/playbooks/roles/nvidia_peermem/tasks/main.yml new file mode 100644 index 00000000..b5245a5e --- /dev/null +++ b/playbooks/roles/nvidia_peermem/tasks/main.yml @@ -0,0 +1,3 @@ +--- +# tasks file for nvidia_peermem +- include: common.yml \ No newline at end of file diff --git a/playbooks/roles/nvidia_peermem/tests/inventory b/playbooks/roles/nvidia_peermem/tests/inventory new file mode 100644 index 00000000..878877b0 --- /dev/null +++ b/playbooks/roles/nvidia_peermem/tests/inventory @@ -0,0 +1,2 @@ +localhost + diff --git a/playbooks/roles/nvidia_peermem/tests/test.yml b/playbooks/roles/nvidia_peermem/tests/test.yml new file mode 100644 index 00000000..fd3fd3ed --- /dev/null +++ b/playbooks/roles/nvidia_peermem/tests/test.yml @@ -0,0 +1,5 @@ +--- +- hosts: localhost + remote_user: root + roles: + - nvidia_peermem diff --git a/playbooks/roles/nvidia_peermem/vars/main.yml b/playbooks/roles/nvidia_peermem/vars/main.yml new file mode 100644 index 00000000..355e4cb1 --- /dev/null +++ b/playbooks/roles/nvidia_peermem/vars/main.yml @@ -0,0 +1,2 @@ +--- +# vars file for nvidia_peermem diff --git a/playbooks/site.yml b/playbooks/site.yml index abee7284..d4be8745 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -50,6 +50,9 @@ - include_role: name: rdma-interface when: cluster_network|bool + - 
include_role: + name: nvidia_peermem + when: cluster_network|bool - hosts: bastion become: true From fd2a954e06a4944bc7286989f845a85ea52566dd Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 25 Apr 2023 15:33:33 -0700 Subject: [PATCH 25/41] removing unnecessary files --- playbooks/roles/mpivars/.travis.yml | 29 ------------- playbooks/roles/mpivars/README.md | 38 ----------------- playbooks/roles/mpivars/handlers/main.yml | 2 - playbooks/roles/mpivars/meta/main.yml | 52 ----------------------- playbooks/roles/mpivars/tests/inventory | 2 - playbooks/roles/mpivars/tests/test.yml | 5 --- 6 files changed, 128 deletions(-) delete mode 100644 playbooks/roles/mpivars/.travis.yml delete mode 100644 playbooks/roles/mpivars/README.md delete mode 100644 playbooks/roles/mpivars/handlers/main.yml delete mode 100644 playbooks/roles/mpivars/meta/main.yml delete mode 100644 playbooks/roles/mpivars/tests/inventory delete mode 100644 playbooks/roles/mpivars/tests/test.yml diff --git a/playbooks/roles/mpivars/.travis.yml b/playbooks/roles/mpivars/.travis.yml deleted file mode 100644 index 36bbf620..00000000 --- a/playbooks/roles/mpivars/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ ---- -language: python -python: "2.7" - -# Use the new container infrastructure -sudo: false - -# Install ansible -addons: - apt: - packages: - - python-pip - -install: - # Install ansible - - pip install ansible - - # Check ansible version - - ansible --version - - # Create ansible.cfg with correct roles_path - - printf '[defaults]\nroles_path=../' >ansible.cfg - -script: - # Basic role syntax check - - ansible-playbook tests/test.yml -i tests/inventory --syntax-check - -notifications: - webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/playbooks/roles/mpivars/README.md b/playbooks/roles/mpivars/README.md deleted file mode 100644 index 225dd44b..00000000 --- a/playbooks/roles/mpivars/README.md +++ /dev/null @@ -1,38 +0,0 @@ -Role Name -========= - -A 
brief description of the role goes here. - -Requirements ------------- - -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. - -Role Variables --------------- - -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. - -Dependencies ------------- - -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. - -Example Playbook ----------------- - -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } - -License -------- - -BSD - -Author Information ------------------- - -An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/playbooks/roles/mpivars/handlers/main.yml b/playbooks/roles/mpivars/handlers/main.yml deleted file mode 100644 index b8d13e5e..00000000 --- a/playbooks/roles/mpivars/handlers/main.yml +++ /dev/null @@ -1,2 +0,0 @@ ---- -# handlers file for mpivars diff --git a/playbooks/roles/mpivars/meta/main.yml b/playbooks/roles/mpivars/meta/main.yml deleted file mode 100644 index c572acc9..00000000 --- a/playbooks/roles/mpivars/meta/main.yml +++ /dev/null @@ -1,52 +0,0 @@ -galaxy_info: - author: your name - description: your role description - company: your company (optional) - - # If the issue tracker for your role is not on github, uncomment the - # next line and provide a value - # issue_tracker_url: http://example.com/issue/tracker - - # Choose a valid license ID from https://spdx.org - some suggested licenses: - # - BSD-3-Clause (default) - # - MIT - # - GPL-2.0-or-later - # - GPL-3.0-only - # - Apache-2.0 - # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) - - min_ansible_version: 2.1 - - # If this a Container Enabled role, provide the minimum Ansible Container version. - # min_ansible_container_version: - - # - # Provide a list of supported platforms, and for each platform a list of versions. - # If you don't wish to enumerate all versions for a particular platform, use 'all'. - # To view available platforms and versions (or releases), visit: - # https://galaxy.ansible.com/api/v1/platforms/ - # - # platforms: - # - name: Fedora - # versions: - # - all - # - 25 - # - name: SomePlatform - # versions: - # - all - # - 1.0 - # - 7 - # - 99.99 - - galaxy_tags: [] - # List tags for your role here, one per line. A tag is a keyword that describes - # and categorizes the role. Users find roles by searching for tags. Be sure to - # remove the '[]' above, if you add tags to this list. - # - # NOTE: A tag is limited to a single word comprised of alphanumeric characters. - # Maximum 20 tags per role. 
- -dependencies: [] - # List your role dependencies here, one per line. Be sure to remove the '[]' above, - # if you add dependencies to this list. diff --git a/playbooks/roles/mpivars/tests/inventory b/playbooks/roles/mpivars/tests/inventory deleted file mode 100644 index 878877b0..00000000 --- a/playbooks/roles/mpivars/tests/inventory +++ /dev/null @@ -1,2 +0,0 @@ -localhost - diff --git a/playbooks/roles/mpivars/tests/test.yml b/playbooks/roles/mpivars/tests/test.yml deleted file mode 100644 index e933cd78..00000000 --- a/playbooks/roles/mpivars/tests/test.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- -- hosts: localhost - remote_user: root - roles: - - mpivars From 3e74d904a9323200c78ca107598617fa952545b8 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 25 Apr 2023 16:21:32 -0700 Subject: [PATCH 26/41] check if nvidia driver installed and then if peermem is loaded. if not load the module nvidia_peermem --- playbooks/new_nodes.yml | 1 - playbooks/resize_add.yml | 1 - playbooks/roles/nvidia_peermem/.travis.yml | 29 ----------- playbooks/roles/nvidia_peermem/README.md | 38 -------------- .../roles/nvidia_peermem/handlers/main.yml | 2 - playbooks/roles/nvidia_peermem/meta/main.yml | 52 ------------------- .../roles/nvidia_peermem/tasks/common.yml | 7 +-- .../roles/nvidia_peermem/tests/inventory | 2 - playbooks/roles/nvidia_peermem/tests/test.yml | 5 -- playbooks/site.yml | 1 - 10 files changed, 4 insertions(+), 134 deletions(-) delete mode 100644 playbooks/roles/nvidia_peermem/.travis.yml delete mode 100644 playbooks/roles/nvidia_peermem/README.md delete mode 100644 playbooks/roles/nvidia_peermem/handlers/main.yml delete mode 100644 playbooks/roles/nvidia_peermem/meta/main.yml delete mode 100644 playbooks/roles/nvidia_peermem/tests/inventory delete mode 100644 playbooks/roles/nvidia_peermem/tests/test.yml diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 690abb1f..5d3e4108 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -50,7 
+50,6 @@ when: cluster_network|bool - include_role: name: nvidia_peermem - when: cluster_network|bool - hosts: bastion,slurm_backup,login,compute become: true diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 07762aa1..644e9e5f 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -48,7 +48,6 @@ when: cluster_network|bool - include_role: name: nvidia_peermem - when: cluster_network|bool - hosts: bastion,slurm_backup,login,compute become: true diff --git a/playbooks/roles/nvidia_peermem/.travis.yml b/playbooks/roles/nvidia_peermem/.travis.yml deleted file mode 100644 index 36bbf620..00000000 --- a/playbooks/roles/nvidia_peermem/.travis.yml +++ /dev/null @@ -1,29 +0,0 @@ ---- -language: python -python: "2.7" - -# Use the new container infrastructure -sudo: false - -# Install ansible -addons: - apt: - packages: - - python-pip - -install: - # Install ansible - - pip install ansible - - # Check ansible version - - ansible --version - - # Create ansible.cfg with correct roles_path - - printf '[defaults]\nroles_path=../' >ansible.cfg - -script: - # Basic role syntax check - - ansible-playbook tests/test.yml -i tests/inventory --syntax-check - -notifications: - webhooks: https://galaxy.ansible.com/api/v1/notifications/ \ No newline at end of file diff --git a/playbooks/roles/nvidia_peermem/README.md b/playbooks/roles/nvidia_peermem/README.md deleted file mode 100644 index 225dd44b..00000000 --- a/playbooks/roles/nvidia_peermem/README.md +++ /dev/null @@ -1,38 +0,0 @@ -Role Name -========= - -A brief description of the role goes here. - -Requirements ------------- - -Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 
- -Role Variables --------------- - -A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. - -Dependencies ------------- - -A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. - -Example Playbook ----------------- - -Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: - - - hosts: servers - roles: - - { role: username.rolename, x: 42 } - -License -------- - -BSD - -Author Information ------------------- - -An optional section for the role authors to include contact information, or a website (HTML is not allowed). 
diff --git a/playbooks/roles/nvidia_peermem/handlers/main.yml b/playbooks/roles/nvidia_peermem/handlers/main.yml deleted file mode 100644 index d2bbc816..00000000 --- a/playbooks/roles/nvidia_peermem/handlers/main.yml +++ /dev/null @@ -1,2 +0,0 @@ ---- -# handlers file for nvidia_peermem diff --git a/playbooks/roles/nvidia_peermem/meta/main.yml b/playbooks/roles/nvidia_peermem/meta/main.yml deleted file mode 100644 index c572acc9..00000000 --- a/playbooks/roles/nvidia_peermem/meta/main.yml +++ /dev/null @@ -1,52 +0,0 @@ -galaxy_info: - author: your name - description: your role description - company: your company (optional) - - # If the issue tracker for your role is not on github, uncomment the - # next line and provide a value - # issue_tracker_url: http://example.com/issue/tracker - - # Choose a valid license ID from https://spdx.org - some suggested licenses: - # - BSD-3-Clause (default) - # - MIT - # - GPL-2.0-or-later - # - GPL-3.0-only - # - Apache-2.0 - # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) - - min_ansible_version: 2.1 - - # If this a Container Enabled role, provide the minimum Ansible Container version. - # min_ansible_container_version: - - # - # Provide a list of supported platforms, and for each platform a list of versions. - # If you don't wish to enumerate all versions for a particular platform, use 'all'. - # To view available platforms and versions (or releases), visit: - # https://galaxy.ansible.com/api/v1/platforms/ - # - # platforms: - # - name: Fedora - # versions: - # - all - # - 25 - # - name: SomePlatform - # versions: - # - all - # - 1.0 - # - 7 - # - 99.99 - - galaxy_tags: [] - # List tags for your role here, one per line. A tag is a keyword that describes - # and categorizes the role. Users find roles by searching for tags. Be sure to - # remove the '[]' above, if you add tags to this list. - # - # NOTE: A tag is limited to a single word comprised of alphanumeric characters. - # Maximum 20 tags per role. 
- -dependencies: [] - # List your role dependencies here, one per line. Be sure to remove the '[]' above, - # if you add dependencies to this list. diff --git a/playbooks/roles/nvidia_peermem/tasks/common.yml b/playbooks/roles/nvidia_peermem/tasks/common.yml index 819f0f3c..26a68298 100644 --- a/playbooks/roles/nvidia_peermem/tasks/common.yml +++ b/playbooks/roles/nvidia_peermem/tasks/common.yml @@ -1,15 +1,16 @@ --- - name: Check if nvidia drivers are installed - shell: lsmod | grep nvidia_peermem | wc -l - register: result + shell: cat /sys/module/nvidia/version | wc -l + register: gpu - name: Check if nvidia_peermem module is loaded shell: lsmod | grep nvidia_peermem | wc -l register: result + when: gpu.stdout == '1' - name: Load nvidia_peermem module become: true shell: modprobe nvidia_peermem - when: register.stdout != 3 \ No newline at end of file + when: gpu.stdout == '1' and result.stdout != '3' \ No newline at end of file diff --git a/playbooks/roles/nvidia_peermem/tests/inventory b/playbooks/roles/nvidia_peermem/tests/inventory deleted file mode 100644 index 878877b0..00000000 --- a/playbooks/roles/nvidia_peermem/tests/inventory +++ /dev/null @@ -1,2 +0,0 @@ -localhost - diff --git a/playbooks/roles/nvidia_peermem/tests/test.yml b/playbooks/roles/nvidia_peermem/tests/test.yml deleted file mode 100644 index fd3fd3ed..00000000 --- a/playbooks/roles/nvidia_peermem/tests/test.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- -- hosts: localhost - remote_user: root - roles: - - nvidia_peermem diff --git a/playbooks/site.yml b/playbooks/site.yml index d4be8745..006a3bdb 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -52,7 +52,6 @@ when: cluster_network|bool - include_role: name: nvidia_peermem - when: cluster_network|bool - hosts: bastion become: true From de151eff9b7d4eca408e3611d18311dede06993a Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 25 Apr 2023 17:09:02 -0700 Subject: [PATCH 27/41] find the openmpi version and use that to create mpivars.sh 
file if it doesn't exist --- playbooks/roles/mpivars/tasks/ubuntu.yml | 16 ++++++++-------- playbooks/roles/mpivars/templates/mpivars.j2 | 14 +++++++------- playbooks/roles/mpivars/vars/main.yml | 3 +-- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/playbooks/roles/mpivars/tasks/ubuntu.yml b/playbooks/roles/mpivars/tasks/ubuntu.yml index 9e4d01af..d84748a9 100644 --- a/playbooks/roles/mpivars/tasks/ubuntu.yml +++ b/playbooks/roles/mpivars/tasks/ubuntu.yml @@ -1,22 +1,22 @@ -- name: Check if mpi folder exists - stat: - path: /usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin/ - register: directory +--- +- name: Get the openmpi version + shell: ls /usr/mpi/gcc/ + register: openmpi - name: Check if mpivars.sh exists stat: - path: /usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin/mpivars.sh + path: /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin/mpivars.sh register: mpivars - when: directory.stat.exists + when: openmpi.stdout_lines[0] != "" - name: Create mpivars.sh become: true template: src: mpivars.j2 - dest: /usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin/mpivars.sh + dest: /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin/mpivars.sh force: yes owner: root group: root - when: directory.stat.exists and not mpivars.stat.exists \ No newline at end of file + when: openmpi.stdout_lines[0] != "" and not mpivars.stat.exists \ No newline at end of file diff --git a/playbooks/roles/mpivars/templates/mpivars.j2 b/playbooks/roles/mpivars/templates/mpivars.j2 index 29e22d05..5f670b75 100644 --- a/playbooks/roles/mpivars/templates/mpivars.j2 +++ b/playbooks/roles/mpivars/templates/mpivars.j2 @@ -3,23 +3,23 @@ # uninstalled or upgraded. 
# PATH -if test -z "`echo $PATH | grep /usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin`"; then - PATH=/usr/mpi/gcc/openmpi-{{ openmpi_version }}/bin:${PATH} +if test -z "`echo $PATH | grep /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin`"; then + PATH=/usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin:${PATH} export PATH fi # LD_LIBRARY_PATH -if test -z "`echo $LD_LIBRARY_PATH | grep /usr/mpi/gcc/openmpi-{{ openmpi_version }}/lib`"; then - LD_LIBRARY_PATH=/usr/mpi/gcc/openmpi-{{ openmpi_version }}/lib${LD_LIBRARY_PATH:+:}${LD_LIBRARY_PATH} +if test -z "`echo $LD_LIBRARY_PATH | grep /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/lib`"; then + LD_LIBRARY_PATH=/usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/lib${LD_LIBRARY_PATH:+:}${LD_LIBRARY_PATH} export LD_LIBRARY_PATH fi # MANPATH -if test -z "`echo $MANPATH | grep /usr/mpi/gcc/openmpi-{{ openmpi_version }}/share/man`"; then - MANPATH=/usr/mpi/gcc/openmpi-{{ openmpi_version }}/share/man:${MANPATH} +if test -z "`echo $MANPATH | grep /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/share/man`"; then + MANPATH=/usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/share/man:${MANPATH} export MANPATH fi # MPI_ROOT -MPI_ROOT=/usr/mpi/gcc/openmpi-{{ openmpi_version }} +MPI_ROOT=/usr/mpi/gcc/{{ openmpi.stdout_lines[0] }} export MPI_ROOT \ No newline at end of file diff --git a/playbooks/roles/mpivars/vars/main.yml b/playbooks/roles/mpivars/vars/main.yml index b585780c..ae77cf28 100644 --- a/playbooks/roles/mpivars/vars/main.yml +++ b/playbooks/roles/mpivars/vars/main.yml @@ -1,3 +1,2 @@ --- -# vars file for mpivars -openmpi_version: 4.1.2a1 \ No newline at end of file +# vars file for mpivars \ No newline at end of file From 02f50d2a8fcdc90cc218de402ee07adc579e827c Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 27 Apr 2023 12:18:50 -0700 Subject: [PATCH 28/41] check if its a GPU shape --- playbooks/roles/nvidia_peermem/tasks/common.yml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git 
a/playbooks/roles/nvidia_peermem/tasks/common.yml b/playbooks/roles/nvidia_peermem/tasks/common.yml index 26a68298..51971505 100644 --- a/playbooks/roles/nvidia_peermem/tasks/common.yml +++ b/playbooks/roles/nvidia_peermem/tasks/common.yml @@ -1,16 +1,25 @@ --- +- name: Check if its a GPU shape + shell: + cmd: "curl -sH \"Authorization: Bearer Oracle\" -L http://169.254.169.254/opc/v2/instance/ | jq .shape | grep GPU" + warn: false + register: shape + failed_when: false + + - name: Check if nvidia drivers are installed shell: cat /sys/module/nvidia/version | wc -l - register: gpu + register: nvidia + when: shape.stdout != "" - name: Check if nvidia_peermem module is loaded shell: lsmod | grep nvidia_peermem | wc -l register: result - when: gpu.stdout == '1' + when: shape.stdout != "" and nvidia.stdout == '1' - name: Load nvidia_peermem module become: true shell: modprobe nvidia_peermem - when: gpu.stdout == '1' and result.stdout != '3' \ No newline at end of file + when: shape.stdout != "" and nvidia.stdout == '1' and result.stdout != '3' \ No newline at end of file From 2dea06ab40d6374879b054ab40a9b273bc39efb7 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 27 Apr 2023 12:23:32 -0700 Subject: [PATCH 29/41] ignore if there is no mpi in /usr/mpi/gcc/ --- playbooks/roles/mpivars/tasks/ubuntu.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/playbooks/roles/mpivars/tasks/ubuntu.yml b/playbooks/roles/mpivars/tasks/ubuntu.yml index d84748a9..bc013438 100644 --- a/playbooks/roles/mpivars/tasks/ubuntu.yml +++ b/playbooks/roles/mpivars/tasks/ubuntu.yml @@ -1,7 +1,10 @@ --- - name: Get the openmpi version - shell: ls /usr/mpi/gcc/ + shell: + cmd: ls /usr/mpi/gcc/ + warn: false register: openmpi + failed_when: false - name: Check if mpivars.sh exists From e4904a0a4893596fe1be59f33be807ae5107e835 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Fri, 28 Apr 2023 16:48:10 -0700 Subject: [PATCH 30/41] restart nscd and ssd on bastion and all nodes 
as the last step --- playbooks/new_nodes.yml | 9 ++++++++- playbooks/resize_add.yml | 9 ++++++++- playbooks/roles/fix_ldap/tasks/main.yml | 2 ++ playbooks/roles/fix_ldap/tasks/ubuntu.yml | 16 ++++++++++++++++ playbooks/site.yml | 7 +++++++ 5 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 playbooks/roles/fix_ldap/tasks/main.yml create mode 100644 playbooks/roles/fix_ldap/tasks/ubuntu.yml diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index c54b519f..891e1673 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -193,4 +193,11 @@ when: slurm|default(false)|bool - include_role: name: telegraf - when: monitoring|default(false)|bool \ No newline at end of file + when: monitoring|default(false)|bool + +- hosts: all + become: true + tasks: + - include_role: + name: fix_ldap + when: ldap|default(true)|bool \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 11ed903e..a44b1a84 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -195,4 +195,11 @@ when: slurm|default(false)|bool - include_role: name: telegraf - when: monitoring|default(false)|bool \ No newline at end of file + when: monitoring|default(false)|bool + +- hosts: all + become: true + tasks: + - include_role: + name: fix_ldap + when: ldap|default(true)|bool \ No newline at end of file diff --git a/playbooks/roles/fix_ldap/tasks/main.yml b/playbooks/roles/fix_ldap/tasks/main.yml new file mode 100644 index 00000000..cd6ba1b8 --- /dev/null +++ b/playbooks/roles/fix_ldap/tasks/main.yml @@ -0,0 +1,2 @@ +- include: ubuntu.yml + when: ansible_distribution == 'Ubuntu' \ No newline at end of file diff --git a/playbooks/roles/fix_ldap/tasks/ubuntu.yml b/playbooks/roles/fix_ldap/tasks/ubuntu.yml new file mode 100644 index 00000000..5efee0fb --- /dev/null +++ b/playbooks/roles/fix_ldap/tasks/ubuntu.yml @@ -0,0 +1,16 @@ +--- +- name: restart nscd + become: true + systemd: + name: nscd + state: restarted + 
daemon_reload: true + enabled: true + +- name: restart sssd + become: true + service: + name: sssd + state: restarted + daemon_reload: true + enabled: true \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index abee7284..6c03c760 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -302,3 +302,10 @@ - include_role: name: slurm when: slurm|default(false)|bool + +- hosts: all + become: true + tasks: + - include_role: + name: fix_ldap + when: ldap|default(true)|bool From 6228aec0b360f53f1fe252f7b6898f122dd64ac1 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 4 May 2023 23:29:35 -0600 Subject: [PATCH 31/41] Add option to mount NVMe's as a Logical volume --- autoscaling/tf_init/bastion_update.tf | 2 + autoscaling/tf_init/inventory.tpl | 2 + bastion.tf | 4 ++ conf/variables.tpl | 2 + inventory.tpl | 2 + playbooks/roles/localdisk/tasks/common.yml | 84 +++++++++++++++++++--- schema.yaml | 23 ++++++ slurm_ha.tf | 4 ++ variables.tf | 3 +- 9 files changed, 114 insertions(+), 12 deletions(-) diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf index 316af6ff..58904dbd 100755 --- a/autoscaling/tf_init/bastion_update.tf +++ b/autoscaling/tf_init/bastion_update.tf @@ -43,6 +43,8 @@ resource "local_file" "inventory" { nfs_source_path = var.nfs_source_path, nfs_options = var.nfs_options, localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, cluster_nfs_path = var.cluster_nfs_path, scratch_nfs_path = var.scratch_nfs_path, cluster_network = var.cluster_network, diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 9d2c062d..4b7811ba 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -55,6 +55,8 @@ nfs_source_IP=${nfs_source_IP} nfs_source_path=${nfs_source_path} nfs_options=${nfs_options} localdisk=${localdisk} +redundancy=${redundancy} +log_vol=${log_vol} ldap=${ldap} queue=${queue} 
instance_type=${instance_type} diff --git a/bastion.tf b/bastion.tf index 94c0a687..6483e03f 100644 --- a/bastion.tf +++ b/bastion.tf @@ -207,6 +207,8 @@ resource "null_resource" "cluster" { nfs_source_path = var.nfs_source_path, nfs_options = var.nfs_options, localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, cluster_network = var.cluster_network, slurm = var.slurm, rack_aware = var.rack_aware, @@ -362,6 +364,8 @@ resource "null_resource" "cluster" { nfs_source_path = var.nfs_source_path, nfs_options = var.nfs_options, localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, monitoring = var.monitoring, hyperthreading = var.hyperthreading, unsupported = var.unsupported, diff --git a/conf/variables.tpl b/conf/variables.tpl index 9a100245..a851d54c 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -126,5 +126,7 @@ variable "bastion_username" { default = "${bastion_username}" } variable "compute_username" { default = "${compute_username}" } variable "localdisk" { default = "${localdisk}" } +variable "log_vol" { default = "${log_vol}" } +variable "redundancy" { default = "${redundancy}" } variable "instance_pool_ocpus_denseIO_flex" { default = "##OCPU##"} diff --git a/inventory.tpl b/inventory.tpl index 3134339a..f60a0aef 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -47,6 +47,8 @@ nfs_source_IP=${nfs_source_IP} nfs_source_path=${nfs_source_path} nfs_options=${nfs_options} localdisk=${localdisk} +redundancy=${redundancy} +log_vol=${log_vol} instance_pool_ocpus=${instance_pool_ocpus} queue=${queue} monitoring=${monitoring} diff --git a/playbooks/roles/localdisk/tasks/common.yml b/playbooks/roles/localdisk/tasks/common.yml index b18269cc..414aae73 100755 --- a/playbooks/roles/localdisk/tasks/common.yml +++ b/playbooks/roles/localdisk/tasks/common.yml @@ -1,32 +1,94 @@ --- +- name: check path + set_fact: + nvme_path_edited: "{% if nvme_path[-1] == '/' %}{{nvme_path[:-1]}}{% else%}{{nvme_path}}{% 
endif %}" + +- name: Get the number of NVMe's + set_fact: + nvme_count: "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list | length}}" + +- name: Create a LVM? + set_fact: + one_lv: "{{( log_vol | bool ) and ( ( nvme_count | int ) > 1 )}}" + - name: Create a new primary partition parted: - device: /dev/nvme0n1 + device: "/dev/{{item}}" number: 1 state: present label: gpt - when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + with_items: + - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}" + - name: create a filesystem filesystem: - dev: /dev/nvme0n1p1 + dev: "/dev/{{item}}p1" fstype: xfs - opts: -L localscratch - when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + opts: "-L locscratch{{item | replace('nvme','') | replace('n1','')}}" + with_items: + - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}" + when: not ( one_lv | bool ) + - name: Mount local volume mount: - path: "{{ nvme_path }}" - src: LABEL=localscratch + path: "{% if item | replace('nvme','') | replace('n1','') == '0' %}{{ nvme_path_edited}}{% else%}{{ nvme_path_edited}}{{item | replace('nvme','') | replace('n1','')}}{% endif %}" + src: "LABEL=locscratch{{item | replace('nvme','') | replace('n1','')}}" fstype: xfs opts: defaults,noatime state: mounted - when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" -- name: "set permissions on {{ nvme_path }}" + with_items: + - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}" + when: not ( one_lv | bool ) + +- name: "set permissions on {{ nvme_path_edited }}" become: true file: - path: "{{ nvme_path }}" + path: "{% if item | replace('nvme','') | replace('n1','') == '0' %}{{ nvme_path_edited}}{% else%}{{ nvme_path_edited}}{{item | replace('nvme','') | replace('n1','')}}{% endif %}" state: directory owner: "{{ ansible_user }}" mode: 0775 group: 
"{{privilege_group_name}}" recurse: no - when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + with_items: + - "{{ hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list }}" + when: not ( one_lv | bool ) + +- name: Create volume group + lvg: + vg: "vg_nvmes" + pvs: "{{['/dev/']|product(hostvars[inventory_hostname]['ansible_devices'] | select('match','nvme[0-9]n1') | list)|map('join', '') | join(',')}}" + when: one_lv | bool + +- name: Create Logical volume + lvol: + vg: "vg_nvmes" + lv: "lv_nvmes" + size: 100%FREE + opts: "{% if redundancy | bool %}--type raid10{% else%}-i4{% endif %}" + when: one_lv | bool + +- name: Create file system + filesystem: + fstype: xfs + dev: "/dev/vg_nvmes/lv_nvmes" + when: one_lv | bool + +- name: Mount local volume + mount: + path: "{{ nvme_path_edited}}" + src: "/dev/vg_nvmes/lv_nvmes" + fstype: xfs + opts: defaults,noatime + state: mounted + when: one_lv | bool + +- name: "set permissions on {{ nvme_path_edited }}" + become: true + file: + path: "{{ nvme_path_edited}}" + state: directory + owner: "{{ ansible_user }}" + mode: 0775 + group: "{{privilege_group_name}}" + recurse: no + when: one_lv | bool \ No newline at end of file diff --git a/schema.yaml b/schema.yaml index 1b22dc25..a13ae931 100755 --- a/schema.yaml +++ b/schema.yaml @@ -138,6 +138,8 @@ variableGroups: - ${cluster_block_volume_size} - ${cluster_block_volume_performance} - ${localdisk} + - ${log_vol} + - ${redundancy} - title: "Network options" variables: - ${use_existing_vcn} @@ -1142,6 +1144,27 @@ variables: description: "For nodes using a NVMe, mount the localdisk" visible: ${use_advanced} + log_vol: + type: boolean + title: "One Logical Volume" + default: true + description: "Mount all NVMe's as one logical volume" + visible: + and: + - ${use_advanced} + - ${localdisk} + + redundancy: + type: boolean + title: "Redundancy" + default: true + description: "Use RAID for redundancy" + visible: + and: + - ${use_advanced} + 
- ${localdisk} + - ${log_vol} + nfs_target_path: type: string title: "NFS Path" diff --git a/slurm_ha.tf b/slurm_ha.tf index dfa9b507..ee137421 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -218,6 +218,8 @@ resource "null_resource" "cluster_backup" { nfs_source_path = var.nfs_source_path, nfs_options = var.nfs_options, localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, cluster_network = var.cluster_network, slurm = var.slurm, slurm_nfs_path = var.slurm_nfs ? var.nfs_source_path : var.cluster_nfs_path, @@ -373,6 +375,8 @@ resource "null_resource" "cluster_backup" { nfs_source_path = var.nfs_source_path, nfs_options = var.nfs_options, localdisk = var.localdisk, + log_vol = var.log_vol, + redundancy = var.redundancy, monitoring = var.monitoring, hyperthreading = var.hyperthreading, unsupported = var.unsupported, diff --git a/variables.tf b/variables.tf index 8c3862ab..e0c893fd 100755 --- a/variables.tf +++ b/variables.tf @@ -237,7 +237,8 @@ variable cluster_nfs_export {default = ""} variable "private_deployment" { default = false } variable "localdisk" { default = true } - +variable "log_vol" { default = false } +variable "redundancy" { default = true } variable "use_marketplace_image_login" { default = true} variable "use_old_marketplace_image_login" { default = false} From 3bc0160b5fe6b77d154c84e15fa0c30f8c93e65e Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Mon, 8 May 2023 17:23:56 -0600 Subject: [PATCH 32/41] Change mpivars.sh to work if no mpi is present --- playbooks/roles/mpivars/tasks/ubuntu.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/playbooks/roles/mpivars/tasks/ubuntu.yml b/playbooks/roles/mpivars/tasks/ubuntu.yml index bc013438..e87ab94c 100644 --- a/playbooks/roles/mpivars/tasks/ubuntu.yml +++ b/playbooks/roles/mpivars/tasks/ubuntu.yml @@ -11,7 +11,7 @@ stat: path: /usr/mpi/gcc/{{ openmpi.stdout_lines[0] }}/bin/mpivars.sh register: mpivars - when: openmpi.stdout_lines[0] != "" + when: 
openmpi.stdout_lines | length > 0 - name: Create mpivars.sh @@ -22,4 +22,4 @@ force: yes owner: root group: root - when: openmpi.stdout_lines[0] != "" and not mpivars.stat.exists \ No newline at end of file + when: openmpi.stdout_lines | length > 0 and not mpivars.stat.exists \ No newline at end of file From ae7609f71f1e54309800244a887ca94e2cb1416a Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 9 May 2023 10:10:22 -0600 Subject: [PATCH 33/41] Fix issue with IPV6 on backup and login nodes --- playbooks/roles/etc-hosts/tasks/common.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/playbooks/roles/etc-hosts/tasks/common.yml b/playbooks/roles/etc-hosts/tasks/common.yml index 4a128bc0..cd39a6c8 100644 --- a/playbooks/roles/etc-hosts/tasks/common.yml +++ b/playbooks/roles/etc-hosts/tasks/common.yml @@ -60,6 +60,14 @@ force: yes when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) +- name: Make sure the IP for each node was not left over in another cluster + become: true + lineinfile: + dest: /etc/hosts + regexp: "^127.0.1.1\\s{{hostvars[groups['bastion'][0]]['inventory_hostname']}}.*" + state: absent + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) + - name: move /etc/hosts on all compute nodes become: true copy: From 98985c048ee6ece20f0ccdb5e9919fb12598d591 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Thu, 11 May 2023 11:14:40 -0600 Subject: [PATCH 34/41] Fix Cores for Std3.64, Std2.52 and E2.64 --- playbooks/roles/slurm/templates/slurm.conf.j2 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index de07cc18..10561d69 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -120,13 +120,13 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar {% elif 
"VM.Standard2." in instance.shape %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance.shape.split('.')[-1]|int }} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Standard2.52" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=52 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=26 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Standard3.64" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif "VM.Standard.E2." 
in instance.shape %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{ instance.shape.split('.')[-1]|int }} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Standard.E2.64" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Standard.A1.160" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=80 ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% endif %} From 7ddcd4e7506ab699a98287524177b29d98de5884 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Thu, 11 May 2023 15:18:01 -0700 Subject: [PATCH 35/41] setting permission for nfs folders --- playbooks/roles/nfs-client/tasks/debian.yml | 9 +++++++++ playbooks/roles/nfs-client/tasks/el.yml | 9 +++++++++ playbooks/roles/nfs-client/tasks/ubuntu.yml | 9 +++++++++ 3 files changed, 27 insertions(+) diff --git a/playbooks/roles/nfs-client/tasks/debian.yml b/playbooks/roles/nfs-client/tasks/debian.yml index 6d6a84f7..632f2d21 100644 --- a/playbooks/roles/nfs-client/tasks/debian.yml +++ b/playbooks/roles/nfs-client/tasks/debian.yml @@ -28,3 +28,12 @@ fstype: nfs state: mounted when: options=="" + +- name: make sure the permissions of the share directory are right + become: 
true + file: + path: "{{ local_path }}" + state: directory + owner: debian + group: "{{privilege_group_name}}" + mode: 0775 \ No newline at end of file diff --git a/playbooks/roles/nfs-client/tasks/el.yml b/playbooks/roles/nfs-client/tasks/el.yml index 944d9fc2..6f48bb29 100755 --- a/playbooks/roles/nfs-client/tasks/el.yml +++ b/playbooks/roles/nfs-client/tasks/el.yml @@ -29,3 +29,12 @@ fstype: nfs state: mounted when: options=="" + +- name: make sure the permissions of the share directory are right + become: true + file: + path: "{{ local_path }}" + state: directory + owner: opc + group: "{{privilege_group_name}}" + mode: 0775 \ No newline at end of file diff --git a/playbooks/roles/nfs-client/tasks/ubuntu.yml b/playbooks/roles/nfs-client/tasks/ubuntu.yml index e512a800..1c4f9cb9 100644 --- a/playbooks/roles/nfs-client/tasks/ubuntu.yml +++ b/playbooks/roles/nfs-client/tasks/ubuntu.yml @@ -28,3 +28,12 @@ fstype: nfs state: mounted when: options=="" + +- name: make sure the permissions of the share directory are right + become: true + file: + path: "{{ local_path }}" + state: directory + owner: ubuntu + group: "{{privilege_group_name}}" + mode: 0775 \ No newline at end of file From 9face3ede35ab459dc5228bcad277631e6a6bb01 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Tue, 16 May 2023 11:24:17 -0700 Subject: [PATCH 36/41] run fix ldap before slurm --- playbooks/new_nodes.yml | 19 +++++++++---------- playbooks/resize_add.yml | 19 +++++++++---------- playbooks/site.yml | 14 +++++++------- 3 files changed, 25 insertions(+), 27 deletions(-) diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index 45c59c57..b971b9d5 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -168,13 +168,19 @@ when: enroot|default(true)|bool - include_role: name: tuned - - hosts: compute tasks: - include_role: name: latency_check - when: cluster_network|bool and not 'GPU' in shape + when: cluster_network|bool and not 'GPU' in shape + +- hosts: all + become: true 
+ tasks: + - include_role: + name: fix_ldap + when: ldap|default(true)|bool - hosts: compute, slurm_backup vars: @@ -196,11 +202,4 @@ when: slurm|default(false)|bool - include_role: name: telegraf - when: monitoring|default(false)|bool - -- hosts: all - become: true - tasks: - - include_role: - name: fix_ldap - when: ldap|default(true)|bool \ No newline at end of file + when: monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 48547a68..c0288eb7 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -171,12 +171,18 @@ - include_role: name: tuned - - hosts: compute_to_add tasks: - include_role: name: latency_check - when: cluster_network|bool and not 'GPU' in shape + when: cluster_network|bool and not 'GPU' in shape + +- hosts: all + become: true + tasks: + - include_role: + name: fix_ldap + when: ldap|default(true)|bool - hosts: compute_to_add vars: @@ -198,11 +204,4 @@ when: slurm|default(false)|bool - include_role: name: telegraf - when: monitoring|default(false)|bool - -- hosts: all - become: true - tasks: - - include_role: - name: fix_ldap - when: ldap|default(true)|bool \ No newline at end of file + when: monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/site.yml b/playbooks/site.yml index aa8f1910..5cc30597 100644 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -286,6 +286,13 @@ - include_role: name: tuned +- hosts: all + become: true + tasks: + - include_role: + name: fix_ldap + when: ldap|default(true)|bool + - hosts: all vars: destroy: false @@ -305,10 +312,3 @@ - include_role: name: slurm when: slurm|default(false)|bool - -- hosts: all - become: true - tasks: - - include_role: - name: fix_ldap - when: ldap|default(true)|bool From b151f37b95d8f64e282c6840599d4e46d3f0a015 Mon Sep 17 00:00:00 2001 From: Anoop Nair Date: Mon, 22 May 2023 00:40:34 +0530 Subject: [PATCH 37/41] ubuntu hyperthreading changes --- 
.../files/control_hyperthreading_ubuntu.sh | 35 +++++++++++++++++++ .../disable-hyperthreading_ubuntu.service | 12 +++++++ 2 files changed, 47 insertions(+) create mode 100644 playbooks/roles/hyperthreading/files/control_hyperthreading_ubuntu.sh create mode 100644 playbooks/roles/hyperthreading/files/disable-hyperthreading_ubuntu.service diff --git a/playbooks/roles/hyperthreading/files/control_hyperthreading_ubuntu.sh b/playbooks/roles/hyperthreading/files/control_hyperthreading_ubuntu.sh new file mode 100644 index 00000000..d2fc6e75 --- /dev/null +++ b/playbooks/roles/hyperthreading/files/control_hyperthreading_ubuntu.sh @@ -0,0 +1,35 @@ +#!/bin/bash +if [ `id -u` -ne 0 ] +then + echo $0: you need to be root + exit 1 +fi +disable_ht() { + echo -n $0: disabling + echo off | sudo tee /sys/devices/system/cpu/smt/control +} + +enable_ht() { + echo -n $0: enabling + echo on | sudo tee /sys/devices/system/cpu/smt/control +} + +case "$1" in +"1"|"on") + enable_ht + ;; +"0"|"off") + disable_ht + ;; +"show") + ;; +*) + echo $0: wrong argument "$1" + exit 2 + ;; +esac + +echo '' +lscpu | egrep "On-line|Off-line" + +exit 0 \ No newline at end of file diff --git a/playbooks/roles/hyperthreading/files/disable-hyperthreading_ubuntu.service b/playbooks/roles/hyperthreading/files/disable-hyperthreading_ubuntu.service new file mode 100644 index 00000000..b89f2a1a --- /dev/null +++ b/playbooks/roles/hyperthreading/files/disable-hyperthreading_ubuntu.service @@ -0,0 +1,12 @@ +[Unit] +Description=Start this service to disable Hyperthreading, stop it to enable Hyperthreading. 
+After=syslog.target irqbalance.service + +[Service] +Type=oneshot +RemainAfterExit=true +ExecStart=/opt/oci-hpc/sbin/control_hyperthreading_ubuntu.sh off +ExecStop=/opt/oci-hpc/sbin/control_hyperthreading_ubuntu.sh on + +[Install] +WantedBy=multi-user.target From 2bb69ab434e6d6e7b76c7dcd5d3315bf0685d74e Mon Sep 17 00:00:00 2001 From: Anoop Nair Date: Mon, 22 May 2023 00:41:53 +0530 Subject: [PATCH 38/41] ubuntu hyperthreading updates --- playbooks/roles/hyperthreading/tasks/main.yml | 3 ++ .../roles/hyperthreading/tasks/ubuntu.yml | 37 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 playbooks/roles/hyperthreading/tasks/ubuntu.yml diff --git a/playbooks/roles/hyperthreading/tasks/main.yml b/playbooks/roles/hyperthreading/tasks/main.yml index e6a3fa1b..5c0a2160 100644 --- a/playbooks/roles/hyperthreading/tasks/main.yml +++ b/playbooks/roles/hyperthreading/tasks/main.yml @@ -1,2 +1,5 @@ - include: el.yml when: ansible_os_family == 'RedHat' + +- include: ubuntu.yml + when: ansible_distribution == 'Ubuntu' diff --git a/playbooks/roles/hyperthreading/tasks/ubuntu.yml b/playbooks/roles/hyperthreading/tasks/ubuntu.yml new file mode 100644 index 00000000..c5a44cf8 --- /dev/null +++ b/playbooks/roles/hyperthreading/tasks/ubuntu.yml @@ -0,0 +1,37 @@ +--- +- name: Make sure directory exist + become: true + file: + path: /opt/oci-hpc/sbin + state: directory + mode: '0755' + +- name: Copy script + become: true + copy: + src: control_hyperthreading_ubuntu.sh + dest: /opt/oci-hpc/sbin/control_hyperthreading_ubuntu.sh + mode: '0755' + +- name: Copy service unit + become: true + copy: + src: disable-hyperthreading_ubuntu.service + dest: /etc/systemd/system/disable-hyperthreading_ubuntu.service + +- name: Create a unit file + become: true + copy: + src: disable-hyperthreading_ubuntu.service + dest: /etc/systemd/system/disable-hyperthreading.service + +- name: Force systemd to reread configs + ansible.builtin.systemd: + daemon_reload: yes + +- name: + 
ansible.builtin.systemd: + name: disable-hyperthreading_ubuntu.service + state: started + enabled: yes + when: not hyperthreading|default(true)|bool \ No newline at end of file From 8f5379d054c52ad1a2a33061c5e0b2366e209264 Mon Sep 17 00:00:00 2001 From: Dhvani Sheth Date: Mon, 22 May 2023 16:46:02 -0700 Subject: [PATCH 39/41] update nccl test location --- samples/gpu/nccl_run_allreduce.sbatch | 4 +--- samples/gpu/nccl_run_allreduce.sh | 10 ++++++++-- ...ccl_run_allreduce_containers_with_ordering.sbatch | 3 +-- samples/gpu/nccl_run_allreduce_srun.sbatch | 4 +--- samples/gpu/nccl_run_allreduce_srun.sh | 4 +--- samples/gpu/nccl_run_alltoall.sh | 9 +++++++-- samples/gpu/qfabv1_nccl_run_allreduce.sbatch | 12 +++++++----- samples/gpu/qfabv1_nccl_run_allreduce.sh | 9 +++++++-- samples/gpu/qfabv1_nccl_run_alltoall.sh | 9 +++++++-- 9 files changed, 40 insertions(+), 24 deletions(-) diff --git a/samples/gpu/nccl_run_allreduce.sbatch b/samples/gpu/nccl_run_allreduce.sbatch index bbcfa484..4bcdd290 100644 --- a/samples/gpu/nccl_run_allreduce.sbatch +++ b/samples/gpu/nccl_run_allreduce.sbatch @@ -22,10 +22,8 @@ cat $MACHINEFILE source /etc/os-release if [ $ID == "ol" ] || [ $ID == "centos" ] ; then python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null - USER=opc elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null - USER=ubuntu fi @@ -74,6 +72,6 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 + --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 diff --git a/samples/gpu/nccl_run_allreduce.sh b/samples/gpu/nccl_run_allreduce.sh index 
fd2ae7fc..850a7900 100644 --- a/samples/gpu/nccl_run_allreduce.sh +++ b/samples/gpu/nccl_run_allreduce.sh @@ -18,7 +18,13 @@ echo INPUTFILE cat $hostfile # will generate rack-aware ordered host file -python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null +fi + hostfile=$ORDEREDMACHINEFILE echo ORDEREDMACHINEFILE @@ -73,7 +79,7 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $np --hostfile $hostfile -N 8 /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile + --np $np --hostfile $hostfile -N 8 /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n $iter >> $logfile tail -n 32 $logfile diff --git a/samples/gpu/nccl_run_allreduce_containers_with_ordering.sbatch b/samples/gpu/nccl_run_allreduce_containers_with_ordering.sbatch index eff071f0..fd7abb73 100644 --- a/samples/gpu/nccl_run_allreduce_containers_with_ordering.sbatch +++ b/samples/gpu/nccl_run_allreduce_containers_with_ordering.sbatch @@ -23,10 +23,8 @@ cat $MACHINEFILE source /etc/os-release if [ $ID == "ol" ] || [ $ID == "centos" ] ; then python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null - USER=opc elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null - USER=ubuntu fi echo ORDEREDMACHINEFILE @@ -80,6 +78,7 @@ export RX_QUEUE_LEN=8192 \ NCCL_IB_QPS_PER_CONNECTION=4 env | grep "SLURMD_NODENAME=" +USER=`whoami` CONTAINER_IMAGE="/nfs/scratch/nvcr.io+nvidia+pytorch+22.12-py3.sqsh" CONTAINER_MOUNTS="/home/$USER/nccl-tests:/nccl,$LOCAL_MPI:$LOCAL_MPI" diff --git 
a/samples/gpu/nccl_run_allreduce_srun.sbatch b/samples/gpu/nccl_run_allreduce_srun.sbatch index 6f98f949..f7ff5b04 100644 --- a/samples/gpu/nccl_run_allreduce_srun.sbatch +++ b/samples/gpu/nccl_run_allreduce_srun.sbatch @@ -29,8 +29,6 @@ if [[ "$mpivars_path" == "" ]]; then source $mpivars_path echo $mpivars_path -USER=`whoami` - shape=`curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq .shape` if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] then @@ -57,4 +55,4 @@ export NCCL_DEBUG=WARN \ NCCL_IB_GID_INDEX=3 \ NCCL_ALGO=Ring \ NCCL_IB_HCA="${var_NCCL_IB_HCA}" - srun --mpi=pmix_v3 --gpus-per-node=$SLURM_GPUS_PER_NODE --ntasks-per-node=$SLURM_NTASKS_PER_NODE /home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 + srun --mpi=pmix_v3 --gpus-per-node=$SLURM_GPUS_PER_NODE --ntasks-per-node=$SLURM_NTASKS_PER_NODE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 diff --git a/samples/gpu/nccl_run_allreduce_srun.sh b/samples/gpu/nccl_run_allreduce_srun.sh index 01cfb047..117cae98 100644 --- a/samples/gpu/nccl_run_allreduce_srun.sh +++ b/samples/gpu/nccl_run_allreduce_srun.sh @@ -43,8 +43,6 @@ do source $mpivars_path echo $mpivars_path - USER=`whoami` - first_node=`head $hostfile -n 1` shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] @@ -72,7 +70,7 @@ do NCCL_IB_GID_INDEX=3 \ NCCL_ALGO=Ring \ NCCL_IB_HCA="${var_NCCL_IB_HCA}" - srun --mpi=pmix_v3 --nodefile=$hostfile --gpus-per-node=8 --ntasks-per-node=8 /home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> $logfile + srun --mpi=pmix_v3 --nodefile=$hostfile --gpus-per-node=8 --ntasks-per-node=8 /opt/oci-hpc/nccl-test/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 >> $logfile diff --git 
a/samples/gpu/nccl_run_alltoall.sh b/samples/gpu/nccl_run_alltoall.sh index e1be500d..23a37cbf 100644 --- a/samples/gpu/nccl_run_alltoall.sh +++ b/samples/gpu/nccl_run_alltoall.sh @@ -22,7 +22,12 @@ echo INPUTFILE cat $hostfile # will generate rack-aware ordered host file -python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null +fi + hostfile=$ORDEREDMACHINEFILE echo ORDEREDMACHINEFILE @@ -82,7 +87,7 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $np --hostfile $hostfile -N 8 /home/opc/nccl-tests/build/alltoall_perf -f 2 -g 1 -c 0 -n $iter >> $logfile + --np $np --hostfile $hostfile -N 8 /opt/oci-hpc/nccl-test/build/alltoall_perf -f 2 -g 1 -c 0 -n $iter >> $logfile tail -n 15 $logfile diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch index 203f3ba6..b78684a2 100644 --- a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch @@ -19,7 +19,12 @@ scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE echo MACHINEFILE cat $MACHINEFILE -python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +fi echo ORDEREDMACHINEFILE cat $ORDEREDMACHINEFILE @@ -31,9 +36,6 @@ source $mpivars_path if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi -#source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh -#source 
/usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - export NCCL_DEBUG=WARN @@ -64,6 +66,6 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /home/opc/nccl-tests/build/all_reduce_perf -b8 -e 4G -f 2 -n 100 + --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /opt/oci-hpc/nccl-test/build/all_reduce_perf -b8 -e 4G -f 2 -n 100 diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sh b/samples/gpu/qfabv1_nccl_run_allreduce.sh index f5c5cad6..28b3afdb 100644 --- a/samples/gpu/qfabv1_nccl_run_allreduce.sh +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sh @@ -18,7 +18,12 @@ echo INPUTFILE cat $hostfile # will generate rack-aware ordered host file -python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null +fi + hostfile=$ORDEREDMACHINEFILE echo ORDEREDMACHINEFILE @@ -75,7 +80,7 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $np --hostfile $hostfile -N 8 /home/opc/nccl-tests/build/all_reduce_perf -b8 -e 4G -f 2 -n $iter >> $logfile + --np $np --hostfile $hostfile -N 8 /opt/oci-hpc/nccl-test/build/all_reduce_perf -b8 -e 4G -f 2 -n $iter >> $logfile tail -n 32 $logfile diff --git a/samples/gpu/qfabv1_nccl_run_alltoall.sh b/samples/gpu/qfabv1_nccl_run_alltoall.sh index a9d217d8..dd7975f4 100644 --- a/samples/gpu/qfabv1_nccl_run_alltoall.sh +++ b/samples/gpu/qfabv1_nccl_run_alltoall.sh @@ -24,7 +24,12 @@ echo INPUTFILE cat $hostfile # will generate rack-aware ordered host file -python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + 
python3 /home/opc/node_ordering_by_rack.py --input_file $hostfile > /dev/null +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $hostfile > /dev/null +fi + hostfile=$ORDEREDMACHINEFILE echo ORDEREDMACHINEFILE @@ -87,7 +92,7 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $np --hostfile $hostfile -N 8 /home/opc/nccl-tests/build/alltoall_perf -f 2 -g 1 -c 0 -n $iter >> $logfile + --np $np --hostfile $hostfile -N 8 /opt/oci-hpc/nccl-test/build/alltoall_perf -f 2 -g 1 -c 0 -n $iter >> $logfile tail -n 15 $logfile From 85de5961d5df3590277e389aa17119917290b078 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 23 May 2023 11:52:42 -0600 Subject: [PATCH 40/41] Update with new images --- conf/variables.tpl | 6 +++--- variables.tf | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/conf/variables.tpl b/conf/variables.tpl index a851d54c..bffb4b65 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -49,12 +49,12 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.01.10-0" - "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.05.18-0" + "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.05.18-0" "HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" "HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" "GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" - "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.01.10-0" + "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" } } diff --git a/variables.tf b/variables.tf index e0c893fd..28a615a0 100755 --- a/variables.tf +++ b/variables.tf @@ -86,12 +86,12 @@ 
variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.01.10-0" - "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.05.18-0" + "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.05.18-0" "HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" "HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" "GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" - "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.01.10-0" + "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.05.18-0" } } From 0cfde2a9e72fb0999b1655bbb30575492a323064 Mon Sep 17 00:00:00 2001 From: arnaudfroidmont Date: Tue, 23 May 2023 15:28:37 -0600 Subject: [PATCH 41/41] Add comment for oci cli --- bin/bastion.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/bastion.sh b/bin/bastion.sh index 8d6a83f7..e8cf6966 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -93,7 +93,7 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then pip install pip --upgrade pip install pyopenssl --upgrade - # install oci-cli + # install oci-cli (add --oci-cli-version 3.23.3 or version that you know works if the latest does not work ) bash -c "$(curl -L https://raw.githubusercontent.com/oracle/oci-cli/master/scripts/install/install.sh)" -s --accept-all-defaults # install oci module