Skip to content

Commit c877343

Browse files
authored
[Release-3.11] Add pyxis integration test (#6475)
Add a test for Pyxis and Enroot functionality after configuration. The test creates a cluster with the custom actions needed to configure Pyxis and Enroot, then submits two consecutive containerized jobs and verifies that they run successfully.
1 parent ae32369 commit c877343

File tree

6 files changed

+171
-1
lines changed

6 files changed

+171
-1
lines changed

tests/integration-tests/README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,7 @@ test cases then you can do it in the following way:
518518

519519
```python
520520
@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
521-
@pytest.mark.parametrized("cluster_max_size", [5, 10])
521+
@pytest.mark.parametrize("cluster_max_size", [5, 10])
522522
def test_case_2(cluster_max_size):
523523
```
524524

tests/integration-tests/configs/develop.yaml

+7
Original file line numberDiff line numberDiff line change
@@ -829,3 +829,10 @@ test-suites:
829829
instances: {{ common.INSTANCES_DEFAULT_X86 }}
830830
oss: ["ubuntu2004"]
831831
schedulers: ["slurm"]
832+
pyxis:
833+
test_pyxis.py::test_pyxis:
834+
dimensions:
835+
- regions: ["eu-west-1"]
836+
instances: {{ common.INSTANCES_DEFAULT_X86 }}
837+
oss: ["ubuntu2204"]
838+
schedulers: ["slurm"]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License").
4+
# You may not use this file except in compliance with the License.
5+
# A copy of the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "LICENSE.txt" file accompanying this file.
10+
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
11+
# See the License for the specific language governing permissions and limitations under the License.
12+
import logging
13+
14+
import boto3
15+
import pytest
16+
from assertpy import assert_that
17+
from remote_command_executor import RemoteCommandExecutor
18+
19+
from tests.common.schedulers_common import SlurmCommands
20+
21+
22+
@pytest.mark.parametrize("scale_up_fleet", [False])
@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
def test_pyxis(pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory, region, scale_up_fleet):
    """
    Test Pyxis and Enroot functionality after configuration.

    This test creates a cluster with the custom actions needed to configure Pyxis and Enroot.
    It submits two consecutive containerized jobs and verifies that both run successfully and
    that each job's output contains the expected Pyxis image-import message.
    """
    # Scale to 1000 dynamic nodes only when explicitly requested; the default 3-node
    # fleet keeps the regular run cheap and fast.
    max_queue_size = 1000 if scale_up_fleet else 3

    # Stage the custom action scripts in a dedicated S3 bucket so the cluster
    # nodes can fetch them during bootstrap.
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "head_node_configure.sh"), "head_node_configure.sh")
    bucket.upload_file(str(test_datadir / "compute_node_start.sh"), "compute_node_start.sh")

    cluster_config = pcluster_config_reader(bucket_name=bucket_name, max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)

    remote_command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(remote_command_executor)

    expected_message = "pyxis: imported docker image: docker://ubuntu:22.04"

    # First containerized job: spans the whole dynamic fleet (3 or 1000 nodes),
    # which also exercises Pyxis on freshly bootstrapped compute nodes.
    logging.info("Submitting first containerized job")
    first_job_id = _submit_containerized_job(slurm_commands, nodes=max_queue_size)
    # Timeouts are in minutes; the 1000-node fleet needs much longer to scale up.
    slurm_commands.wait_job_completed(first_job_id, timeout=30 if scale_up_fleet else 12)
    slurm_commands.assert_job_succeeded(first_job_id)

    logging.info("Checking for expected messages in first job output")
    assert_that(_job_output(remote_command_executor, first_job_id)).contains(expected_message)

    # Second containerized job: fixed 3 nodes, submitted after the first completes,
    # verifying Pyxis keeps working on nodes that already ran a containerized job.
    logging.info("Submitting second containerized job")
    second_job_id = _submit_containerized_job(slurm_commands, nodes=3)
    slurm_commands.wait_job_completed(second_job_id)
    slurm_commands.assert_job_succeeded(second_job_id)

    logging.info("Checking for expected messages in second job output")
    assert_that(_job_output(remote_command_executor, second_job_id)).contains(expected_message)


def _submit_containerized_job(slurm_commands, nodes):
    """Submit a containerized `hostname` job on the given number of nodes and return its job id."""
    result = slurm_commands.submit_command(
        command="srun --container-image docker://ubuntu:22.04 hostname",
        nodes=nodes,
    )
    return slurm_commands.assert_job_submitted(result.stdout)


def _job_output(remote_command_executor, job_id):
    """Fetch the job's output file, using Slurm's default output pattern slurm-<jobid>.out.

    The original code hardcoded slurm-1.out / slurm-2.out, which silently assumed the two
    submitted jobs would be assigned job IDs 1 and 2 — wrong if anything else ran first.
    """
    return remote_command_executor.run_remote_command(f"cat slurm-{job_id}.out").stdout
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
set -e

echo "Executing $0"

# Enroot requires world-writable (sticky-bit) persistent and volatile directories.
for enroot_dir in "/var/enroot" "/run/enroot"; do
    sudo mkdir -p "$enroot_dir"
    sudo chmod 1777 "$enroot_dir"
done

# Install the example Enroot configuration shipped with ParallelCluster.
sudo mv /opt/parallelcluster/examples/enroot/enroot.conf /etc/enroot/enroot.conf
sudo chmod 0644 /etc/enroot/enroot.conf

# Pyxis also needs a world-writable runtime directory.
sudo mkdir -p "/run/pyxis"
sudo chmod 1777 "/run/pyxis"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
#!/bin/bash
set -e

echo "Executing $0"

# Enroot requires world-writable (sticky-bit) persistent and volatile directories.
for enroot_dir in "/var/enroot" "/run/enroot"; do
    sudo mkdir -p "$enroot_dir"
    sudo chmod 1777 "$enroot_dir"
done

# Install the example Enroot configuration shipped with ParallelCluster.
sudo mv /opt/parallelcluster/examples/enroot/enroot.conf /etc/enroot/enroot.conf
sudo chmod 0644 /etc/enroot/enroot.conf

# Pyxis also needs a world-writable runtime directory.
sudo mkdir -p "/run/pyxis"
sudo chmod 1777 "/run/pyxis"

# Register the Pyxis SPANK plugin with Slurm and pick up the new configuration.
sudo mkdir -p /opt/slurm/etc/plugstack.conf.d/
sudo mv /opt/parallelcluster/examples/spank/plugstack.conf /opt/slurm/etc/
sudo mv /opt/parallelcluster/examples/pyxis/pyxis.conf /opt/slurm/etc/plugstack.conf.d/
sudo -i scontrol reconfigure
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Cluster template for the Pyxis/Enroot integration test.
# Custom actions (staged in the test's S3 bucket) configure Pyxis and Enroot on
# the head node after setup and on every compute node at start.
Image:
  Os: {{ os }}
HeadNode:
  InstanceType: {{ instance }}
  Networking:
    SubnetId: {{ public_subnet_id }}
  Ssh:
    KeyName: {{ key_name }}
  # Configure Pyxis/Enroot (and reconfigure Slurm) once the head node is set up.
  CustomActions:
    OnNodeConfigured:
      Script: s3://{{ bucket_name }}/head_node_configure.sh
  # Grant read access to the bucket holding the custom action script.
  Iam:
    S3Access:
      - BucketName: {{ bucket_name }}
Scheduling:
  Scheduler: {{ scheduler }}
  SlurmQueues:
    - Name: queue-0
      ComputeResources:
        - Name: compute-resource-0
          Instances:
            - InstanceType: t3.small
          # Fully dynamic fleet: nodes launch on demand up to max_queue_size
          # (3 by default, 1000 for the scale-up variant of the test).
          MinCount: 0
          MaxCount: {{ max_queue_size }}
      Networking:
        SubnetIds:
          - {{ private_subnet_id }}
      # Configure Pyxis/Enroot on each compute node as it starts.
      CustomActions:
        OnNodeStart:
          Script: s3://{{ bucket_name }}/compute_node_start.sh
      Iam:
        S3Access:
          - BucketName: {{ bucket_name }}

0 commit comments

Comments
 (0)