|
| 1 | +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"). |
| 4 | +# You may not use this file except in compliance with the License. |
| 5 | +# A copy of the License is located at |
| 6 | +# |
| 7 | +# http://aws.amazon.com/apache2.0/ |
| 8 | +# |
| 9 | +# or in the "LICENSE.txt" file accompanying this file. |
| 10 | +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. |
| 11 | +# See the License for the specific language governing permissions and limitations under the License. |
| 12 | +import logging |
| 13 | + |
| 14 | +import boto3 |
| 15 | +import pytest |
| 16 | +from assertpy import assert_that |
| 17 | +from remote_command_executor import RemoteCommandExecutor |
| 18 | + |
| 19 | +from tests.common.schedulers_common import SlurmCommands |
| 20 | + |
| 21 | + |
@pytest.mark.parametrize("scale_up_fleet", [False])
@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
def test_pyxis(pcluster_config_reader, clusters_factory, test_datadir, s3_bucket_factory, region, scale_up_fleet):
    """
    Test Pyxis and Enroot functionality after configuration.

    This test creates a cluster with the necessary custom actions to configure Pyxis and Enroot.
    It submits two consecutive containerized jobs and verifies that they run successfully,
    and the output contains the expected messages.
    """
    # A "scale-up" run exercises a large dynamic fleet; the default run keeps the cluster small.
    max_queue_size = 1000 if scale_up_fleet else 3

    # Stage the Pyxis/Enroot setup scripts in S3 so the cluster's custom actions can fetch them.
    bucket_name = s3_bucket_factory()
    bucket = boto3.resource("s3", region_name=region).Bucket(bucket_name)
    bucket.upload_file(str(test_datadir / "head_node_configure.sh"), "head_node_configure.sh")
    bucket.upload_file(str(test_datadir / "compute_node_start.sh"), "compute_node_start.sh")

    cluster_config = pcluster_config_reader(bucket_name=bucket_name, max_queue_size=max_queue_size)
    cluster = clusters_factory(cluster_config)

    remote_command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = SlurmCommands(remote_command_executor)

    # Message emitted by the Pyxis SPANK plugin when the container image is pulled successfully.
    expected_message = "pyxis: imported docker image: docker://ubuntu:22.04"

    def _run_containerized_job(nodes, timeout=None):
        """Submit an srun container job on `nodes` nodes, wait for success, and assert Pyxis ran."""
        result = slurm_commands.submit_command(
            command="srun --container-image docker://ubuntu:22.04 hostname",
            nodes=nodes,
        )
        job_id = slurm_commands.assert_job_submitted(result.stdout)
        if timeout is not None:
            slurm_commands.wait_job_completed(job_id, timeout=timeout)
        else:
            slurm_commands.wait_job_completed(job_id)
        slurm_commands.assert_job_succeeded(job_id)
        # Read the job's own output file (Slurm default: slurm-<job_id>.out) instead of
        # hard-coding slurm-1.out/slurm-2.out — any prior submission would shift the job ids.
        logging.info("Checking output of job %s for expected Pyxis message", job_id)
        slurm_out = remote_command_executor.run_remote_command(f"cat slurm-{job_id}.out").stdout
        assert_that(slurm_out).contains(expected_message)

    # First containerized job spans the whole (possibly dynamic 1000-node) fleet;
    # allow a longer completion timeout when scaling up.
    logging.info("Submitting first containerized job")
    _run_containerized_job(nodes=max_queue_size, timeout=30 if scale_up_fleet else 12)

    # Second containerized job runs on a fixed 3 nodes after the first one completes.
    logging.info("Submitting second containerized job")
    _run_containerized_job(nodes=3)
0 commit comments