diff --git a/.aws/terraform-jupyterhub-provisioning-policies.json b/.aws/terraform-jupyterhub-provisioning-policies.json index e0a6cafb..35103551 100644 --- a/.aws/terraform-jupyterhub-provisioning-policies.json +++ b/.aws/terraform-jupyterhub-provisioning-policies.json @@ -4,69 +4,7 @@ { "Effect": "Allow", "Action": [ - "ec2:AllocateAddress", - "ec2:AssociateAddress", - "ec2:AssociateRouteTable", - "ec2:AssociateVpcCidrBlock", - "ec2:AttachInternetGateway", - "ec2:AttachNetworkInterface", - "ec2:AuthorizeSecurityGroupEgress", - "ec2:AuthorizeSecurityGroupIngress", - "ec2:CreateInternetGateway", - "ec2:CreateLaunchTemplate", - "ec2:CreateLaunchTemplateVersion", - "ec2:CreateNatGateway", - "ec2:CreateNetworkAcl", - "ec2:CreateNetworkAclEntry", - "ec2:CreateNetworkInterface", - "ec2:CreateNetworkInterfacePermission", - "ec2:CreateRoute", - "ec2:CreateRouteTable", - "ec2:CreateSecurityGroup", - "ec2:CreateSubnet", - "ec2:CreateTags", - "ec2:CreateVpc", - "ec2:DeleteInternetGateway", - "ec2:DeleteLaunchTemplate", - "ec2:DeleteLaunchTemplateVersions", - "ec2:DeleteNatGateway", - "ec2:DeleteNetworkAcl", - "ec2:DeleteNetworkAclEntry", - "ec2:DeleteNetworkInterface", - "ec2:DeleteRoute", - "ec2:DeleteRouteTable", - "ec2:DeleteSecurityGroup", - "ec2:DeleteSubnet", - "ec2:DeleteTags", - "ec2:DeleteVpc", - "ec2:DescribeAddresses", - "ec2:DescribeAddressesAttribute", - "ec2:DescribeAvailabilityZones", - "ec2:DescribeInternetGateways", - "ec2:DescribeLaunchTemplateVersions", - "ec2:DescribeLaunchTemplates", - "ec2:DescribeNatGateways", - "ec2:DescribeNetworkAcls", - "ec2:DescribeNetworkInterfacePermissions", - "ec2:DescribeNetworkInterfaces", - "ec2:DescribeRouteTables", - "ec2:DescribeSecurityGroupRules", - "ec2:DescribeSecurityGroups", - "ec2:DescribeSubnets", - "ec2:DescribeVpcAttribute", - "ec2:DescribeVpcs", - "ec2:DetachInternetGateway", - "ec2:DetachNetworkInterface", - "ec2:DisassociateAddress", - "ec2:DisassociateRouteTable", - "ec2:DisassociateVpcCidrBlock", - "ec2:ModifyNetworkInterfaceAttribute", - "ec2:ModifyVpcAttribute", - "ec2:ReleaseAddress", - "ec2:ReplaceRoute", - "ec2:RevokeSecurityGroupEgress", - "ec2:RevokeSecurityGroupIngress", - "ec2:RunInstances", + "ec2:*", "ecr-public:GetAuthorizationToken", "eks:*", "elasticfilesystem:CreateFileSystem", diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py new file mode 100755 index 00000000..c523123a --- /dev/null +++ b/.github/scripts/calculate-directory-stats.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +import os +import csv +import json +import sys +import unittest +from collections import defaultdict +from pathlib import Path +from pprint import pprint +from typing import Iterable + + +def propagate_dir(stats, current_parent, previous_parent): + assert os.path.isabs(current_parent) == os.path.isabs( + previous_parent + ), "current_parent and previous_parent must both be abspath or both be relpath" + highest_common = os.path.commonpath([current_parent, previous_parent]) + assert highest_common, "highest_common must either be a target directory or /" + + path_to_propagate = os.path.relpath(previous_parent, highest_common) + # leaves off last to avoid propagating to the path we are propagating from + nested_dir_list = path_to_propagate.split(os.sep)[:-1] + # Add each dir count to all ancestors up to highest common dir + while nested_dir_list: + working_dir = os.path.join(highest_common, *nested_dir_list) + stats[working_dir]["file_count"] += stats[previous_parent]["file_count"] + 
stats[working_dir]["total_size"] += stats[previous_parent]["total_size"] + nested_dir_list.pop() + previous_parent = working_dir + stats[highest_common]["file_count"] += stats[previous_parent]["file_count"] + stats[highest_common]["total_size"] += stats[previous_parent]["total_size"] + + +def generate_directory_statistics(data: Iterable[str]): + # Assumes dirs are listed depth first (files are listed prior to directories) + + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + previous_parent = "" + for filepath, size, modified, created, error in data: + # TODO if error is not None: + this_parent = os.path.dirname(filepath) + stats[this_parent]["file_count"] += 1 + stats[this_parent]["total_size"] += int(size) + + if previous_parent == this_parent: + continue + # going deeper + elif not previous_parent or previous_parent == os.path.dirname(this_parent): + previous_parent = this_parent + continue + else: # previous dir done + propagate_dir(stats, this_parent, previous_parent) + previous_parent = this_parent + + # Run a final time with the root directory as this parent + # During final run, leading dir cannot be empty string, propagate_dir requires + # both to be abspath or both to be relpath + leading_dir = previous_parent.split(os.sep)[0] or "/" + propagate_dir(stats, leading_dir, previous_parent) + return stats + + +def iter_file_metadata(file_path): + """ + Reads a tsv and returns an iterable that yields one row of file metadata at + a time, excluding comments. + """ + file_path = Path(file_path) + with file_path.open(mode="r", newline="", encoding="utf-8") as file: + reader = csv.reader(file, delimiter="\t") + for row in reader: + # Skip empty lines or lines starting with '#' + if not row or row[0].startswith("#"): + continue + yield row + +def update_stats(stats, directory, stat): + stats["total_size"] += stat["total_size"] + stats["file_count"] += stat["file_count"] + + # Caches track directories, but not report as a whole + if stats.get("directories") is not None: + stats["directories"].append(directory) + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + input_tsv_file = sys.argv[1] + username = input_tsv_file.split("-index.tsv")[0] + + data = iter_file_metadata(input_tsv_file) + stats = generate_directory_statistics(data) + cache_types = ["pycache", "user_cache", "yarn_cache", "pip_cache", "nwb_cache"] + report_stats = { + "total_size": 0, + "file_count": 0, + "caches": { + cache_type: {"total_size": 0, "file_count": 0, "directories": []} + for cache_type in cache_types + } + } + + # print(f"{directory}: File count: {stat['file_count']}, Total Size: {stat['total_size']}") + for directory, stat in stats.items(): + if directory.endswith("__pycache__"): + update_stats(report_stats["caches"]["pycache"], directory, stat) + elif directory.endswith(f"{username}/.cache"): + update_stats(report_stats["caches"]["user_cache"], directory, stat) + elif directory.endswith(".cache/yarn"): + update_stats(report_stats["caches"]["yarn_cache"], directory, stat) + elif directory.endswith(".cache/pip"): + update_stats(report_stats["caches"]["pip_cache"], directory, stat) + elif directory == username: + update_stats(report_stats, username, stat) + + OUTPUT_DIR = "/home/austin/hub-user-reports/" + os.makedirs(OUTPUT_DIR, exist_ok=True) + with open(f"{OUTPUT_DIR}{username}-report.json", "w") as out: + json.dump(report_stats, out) + + + sorted_dirs = sorted(stats.items(), key=lambda x: x[1]['total_size'], reverse=True) + print(f"Finished {username} 
with Total {report_stats["total_size"]}") + + +class TestDirectoryStatistics(unittest.TestCase): + def test_propagate_dir(self): + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + stats["a/b/c"] = {"total_size": 100, "file_count": 3} + stats["a/b"] = {"total_size": 10, "file_count": 0} + stats["a"] = {"total_size": 1, "file_count": 0} + + propagate_dir(stats, "a", "a/b/c") + self.assertEqual(stats["a"]["file_count"], 3) + self.assertEqual(stats["a/b"]["file_count"], 3) + self.assertEqual(stats["a"]["total_size"], 111) + + def test_propagate_dir_abs_path(self): + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + stats["/a/b/c"] = {"total_size": 0, "file_count": 3} + stats["/a/b"] = {"total_size": 0, "file_count": 0} + stats["/a"] = {"total_size": 0, "file_count": 0} + + propagate_dir(stats, "/a", "/a/b/c") + self.assertEqual(stats["/a"]["file_count"], 3) + self.assertEqual(stats["/a/b"]["file_count"], 3) + + def test_propagate_dir_files_in_all(self): + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + stats["a/b/c"] = {"total_size": 0, "file_count": 3} + stats["a/b"] = {"total_size": 0, "file_count": 2} + stats["a"] = {"total_size": 0, "file_count": 1} + + propagate_dir(stats, "a", "a/b/c") + self.assertEqual(stats["a"]["file_count"], 6) + self.assertEqual(stats["a/b"]["file_count"], 5) + + def test_generate_directory_statistics(self): + sample_data = [ + ("a/b/file3.txt", 3456, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/file1.txt", 1234, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/file2.txt", 2345, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/d/file4.txt", 4567, "2024-12-01", "2024-12-02", "OK"), + ("a/e/file3.txt", 5678, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/file1.txt", 6789, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/file2.txt", 7890, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/g/file4.txt", 8901, "2024-12-01", "2024-12-02", "OK"), + ] + stats = generate_directory_statistics(sample_data) + self.assertEqual(stats["a/b/c/d"]["file_count"], 1) + self.assertEqual(stats["a/b/c"]["file_count"], 3) + self.assertEqual(stats["a/b"]["file_count"], 4) + self.assertEqual(stats["a/e/f/g"]["file_count"], 1) + self.assertEqual(stats["a/e/f"]["file_count"], 3) + self.assertEqual(stats["a/e"]["file_count"], 4) + self.assertEqual(stats["a"]["file_count"], 8) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "test": + unittest.main( + argv=sys.argv[:1] + ) # Run tests if "test" is provided as an argument + else: + try: + main() + except Exception as e: + # print(f"FAILED ------------------------------ {sys.argv[1]}") + # raise(e) + pass diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh new file mode 100755 index 00000000..a790d927 --- /dev/null +++ b/.github/scripts/cleanup-ec2.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +set -e + +# Load environment variables from the file if they are not already set +ENV_FILE=".ec2-session.env" +if [ -f "$ENV_FILE" ]; then + echo "Loading environment variables from $ENV_FILE..." + source "$ENV_FILE" +else + echo "Warning: Environment file $ENV_FILE not found." +fi + +# Ensure required environment variables are set +if [ -z "$INSTANCE_ID" ]; then + echo "Error: INSTANCE_ID is not set. Cannot proceed with cleanup." + exit 1 +fi + +if [ -z "$ALLOC_ID" ]; then + echo "Error: ALLOC_ID is not set. Cannot proceed with cleanup." + exit 1 +fi + +# Check for AWS CLI and credentials +if ! command -v aws &>/dev/null; then + echo "Error: AWS CLI is not installed. 
Please install it and configure your credentials." + exit 1 +fi + +if ! aws sts get-caller-identity &>/dev/null; then + echo "Error: Unable to access AWS. Ensure your credentials are configured correctly." + exit 1 +fi + +# Terminate EC2 instance +echo "Terminating EC2 instance with ID: $INSTANCE_ID..." +if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --no-cli-pager; then + echo "Instance termination initiated. Waiting for the instance to terminate..." + if aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID"; then + echo "Instance $INSTANCE_ID has been successfully terminated." + else + echo "Warning: Instance $INSTANCE_ID may not have terminated correctly." + fi +else + echo "Warning: Failed to terminate instance $INSTANCE_ID. It may already be terminated." +fi + +# Release Elastic IP +echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID..." +if aws ec2 release-address --allocation-id "$ALLOC_ID"; then + echo "Elastic IP with Allocation ID $ALLOC_ID has been successfully released." +else + echo "Warning: Failed to release Elastic IP with Allocation ID $ALLOC_ID. It may already be released." +fi + +# Cleanup environment file +if [ -f "$ENV_FILE" ]; then + echo "Removing environment file $ENV_FILE..." + rm -f "$ENV_FILE" +fi + +echo "Cleanup complete." diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py new file mode 100755 index 00000000..5f8cb661 --- /dev/null +++ b/.github/scripts/create-file-index.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +import os +import csv +import time +import sys +from datetime import datetime +from pathlib import Path + +OUTPUT_DIR = "/tmp/hub-user-indexes" + +class MetadataWriter: + def __init__(self, output_path): + self.output_path = Path(output_path) + self.start_time = None + self.end_time = None + self.meta = { + "index_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "duration": None, + "total_files": 0, + "total_size": 0, + } + self.file = None + self.writer = None + + def start(self): + """Initialize the metadata and open the file for writing.""" + self.start_time = time.time() + self.file = self.output_path.open(mode="w", newline="", encoding="utf-8") + self.writer = csv.writer(self.file, delimiter="\t") + self.writer.writerow(["#file_name", "file_size", "file_type", "custom_metadata"]) + + def write_row(self, file_name, file_size, created, modified, error): + """Write data for a file.""" + if not self.writer: + raise RuntimeError("Writer not initialized.") + if error is not None: + self.writer.writerow([file_name, "-", "-", "-", error]) + else: + self.writer.writerow([file_name, file_size, created, modified, "OK"]) + + self.meta["total_files"] += 1 + self.meta["total_size"] += file_size + + def finish(self): + """Finalize metadata, write it to the file, and close the file.""" + if not self.writer: + raise RuntimeError("Writer not initialized.") + self.end_time = time.time() + self.meta["duration"] = self.end_time - self.start_time + + self.file.write("\n# Execution Metadata\n") + for key, value in self.meta.items(): + self.file.write(f"# {key}: {value}\n") + + self.file.close() + print(f"Directory {self.output_path} complete, Duration: {self.meta['duration']:.2f}, Total Files: {self.meta['total_files']}, Total Size: {self.meta['total_size']}") + + def get_meta(self): + """Return the meta-metadata dictionary.""" + return self.meta + + +def directory_index(directory): + for root, dirs, files in os.walk(directory): + for name in files: + filepath = os.path.join(root, name) + 
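            # stat without following symlinks so broken links are reported as-is;
            # per-file stat failures are recorded in the error column rather than
            # aborting the whole walk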
try: + stat_result = os.stat(filepath, follow_symlinks=False) + except (FileNotFoundError, PermissionError) as e: + size = modified = created = None + error = str(e) + else: + size = stat_result.st_size + modified = time.ctime(stat_result.st_mtime) + created = time.ctime(stat_result.st_ctime) + error = None + yield filepath, size, modified, created, error + +# Ensure the script is called with the required arguments +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + directory = sys.argv[1] + + os.makedirs(OUTPUT_DIR, exist_ok=True) + output_file = f"{OUTPUT_DIR}/{directory}-index.tsv" + + file_index = MetadataWriter(output_file) + file_index.start() + + for filename, size, created, modified, error in directory_index(directory): + file_index.write_row(filename, size, created, modified, error) + + file_index.finish() diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh new file mode 100755 index 00000000..0c8ab676 --- /dev/null +++ b/.github/scripts/launch-ec2.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash + +set -e + +# Check for AWS CLI and credentials +if ! command -v aws &>/dev/null; then + echo "Error: AWS CLI is not installed. Please install it and configure your credentials." + exit 1 +fi + +if ! aws sts get-caller-identity &>/dev/null; then + echo "Error: Unable to access AWS. Ensure your credentials are configured correctly." + exit 1 +fi + +# Set variables +AWS_REGION="us-east-2" +# TODO document that this key needs to be created +KEY_NAME="dandihub-gh-actions" +# TODO create if DNE +# allow gh-actions to ssh into ec2 job instance from anywhere +SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e" +# TODO retrieve subnet id (public, created by dandi-hub eks-dandihub-public-us-east-2a) +SUBNET_ID="subnet-0f544cca61ccd2804" +AMI_ID="ami-0c80e2b6ccb9ad6d1" +EFS_ID="fs-02aac16c4c6c2dc27" +LOCAL_SCRIPTS_DIR=".github/scripts" +REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts" +MOUNT_POINT="/mnt/efs" +ENV_FILE=".ec2-session.env" + +# Ensure the environment file is writable +echo "# Environment variables for EC2 session" > $ENV_FILE +echo "# Auto-generated by launch script on $(date)" >> $ENV_FILE + +# Run EC2 instance +echo "Launching EC2 instance..." +export INSTANCE_ID=$(aws ec2 run-instances \ + --image-id $AMI_ID \ + --count 1 \ + --instance-type t3.micro \ + --key-name $KEY_NAME \ + --security-group-ids $SECURITY_GROUP_ID \ + --subnet-id $SUBNET_ID \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \ + --query 'Instances[0].InstanceId' \ + --output text) + +if [ -z "$INSTANCE_ID" ]; then + echo "Error: Failed to launch EC2 instance." + exit 1 +fi +echo "Instance ID: $INSTANCE_ID" +echo "export INSTANCE_ID=$INSTANCE_ID" >> $ENV_FILE + +# Wait for instance to initialize +echo "Waiting for instance to reach status OK..." +aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID" + +# Allocate Elastic IP +echo "Allocating Elastic IP..." +export ALLOC_ID=$(aws ec2 allocate-address \ + --tag-specifications "ResourceType=elastic-ip,Tags=[{Key=Name,Value=dandihub-gh-actions-eip}]" \ + --query 'AllocationId' \ + --output text) + +if [ -z "$ALLOC_ID" ]; then + echo "Error: Failed to allocate Elastic IP." + exit 1 +fi +echo "Elastic IP Allocation ID: $ALLOC_ID" +echo "export ALLOC_ID=$ALLOC_ID" >> $ENV_FILE + +# Associate Elastic IP with instance +echo "Associating Elastic IP with instance..." 
+export EIP_ASSOC=$(aws ec2 associate-address \ + --instance-id "$INSTANCE_ID" \ + --allocation-id "$ALLOC_ID" \ + --query 'AssociationId' \ + --output text) + +if [ -z "$EIP_ASSOC" ]; then + echo "Error: Failed to associate Elastic IP." + exit 1 +fi + +# Get Elastic IP address +export PUBLIC_IP=$(aws ec2 describe-addresses \ + --allocation-ids "$ALLOC_ID" \ + --query 'Addresses[0].PublicIp' \ + --output text) + +echo "Elastic IP Address: $PUBLIC_IP" +echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE + +# Upload scripts to EC2 instance +echo "Uploading scripts to EC2 instance..." +scp -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \ + $LOCAL_SCRIPTS_DIR/calculate-directory-stats.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ + ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/" + +if [ $? -eq 0 ]; then + echo "Scripts uploaded successfully to $REMOTE_SCRIPTS_DIR on the instance." +else + echo "Error: Failed to upload scripts to the instance." + exit 1 +fi + +# TODO automate +# eks-dandihub-efs sg is created by dandi-hub install +# this sg needs to accept incoming 2049 from the sg created for this ec2 +# sg-061d875722e569724 - eks-dandihub-efs +# aws ec2 authorize-security-group-ingress \ +# --group-id sg-061d875722e569724 \ +# --protocol tcp \ +# --port 2049 \ +# --source-group $SECURITY_GROUP_ID + +echo "Installing dependencies ..." +ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ + "sudo yum install -y amazon-efs-utils pip parallel && \ + pip install con-duct" + +# Mount EFS on the EC2 instance +echo "Mounting EFS on the EC2 instance..." +ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ + "sudo mkdir -p $MOUNT_POINT && \ + sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \ + echo '$EFS_ID:/ $MOUNT_POINT efs defaults,_netdev 0 0' | sudo tee -a /etc/fstab && \ + echo 'EFS mounted at $MOUNT_POINT'" + +# Output SSH command for convenience +echo "To connect to your instance, use:" +echo "ssh -i \$EC2_SSH_KEY ec2-user@$PUBLIC_IP" + +echo "Environment variables saved to $ENV_FILE." +echo "Run 'source $ENV_FILE' to restore the environment variables." diff --git a/NEXTSTEPS b/NEXTSTEPS new file mode 100644 index 00000000..44d01774 --- /dev/null +++ b/NEXTSTEPS @@ -0,0 +1,32 @@ +DONE + - Set AWS_ROLE ARN secret + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + +TODO: + - Create Dockerhub Service account + - set username & token as secrets + - Create Github CI account + - Docker Image Tagging: + - The Docker image is tagged with latest. For better version control, consider using commit SHA or version numbers. + - Log Retrieval: + - The logs from the pod are retrieved to help you verify the script's output. + - Cleanup: + - Deleting the Job ensures that no resources are left running after the workflow completes. + +By making these updates, your workflow will now: + + Include your du.py script in a Docker image. + Build and push this image to DockerHub. + Deploy a Kubernetes Job to your EKS cluster that runs the script. + Wait for the Job to complete and retrieve logs. + Clean up resources after execution. + +Feel free to ask if you need further assistance or clarification on any of these steps! 
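
A minimal sketch of that Job flow from a CI runner (the manifest path, Job name, and
timeout below are placeholders, not values defined in this repo):

    # deploy the report-generation Job to the EKS cluster
    kubectl apply -f disk-usage-report-job.yaml

    # wait for the Job to finish, then retrieve its logs
    kubectl wait --for=condition=complete job/disk-usage-report --timeout=30m
    kubectl logs job/disk-usage-report > disk-usage-report.log

    # clean up so no resources are left running after the workflow completes
    kubectl delete job disk-usage-report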
+ + +- Get image pushing +- create private gh repository under dandi org for reports + + + diff --git a/images/Dockerfile.dandihub_report_generator b/images/Dockerfile.dandihub_report_generator new file mode 100644 index 00000000..5f460084 --- /dev/null +++ b/images/Dockerfile.dandihub_report_generator @@ -0,0 +1,15 @@ +FROM python:3.9-slim + +# Set the working directory +WORKDIR /app + +# Copy the du.py script into the container +COPY .github/scripts/du.py /app/du.py + +# Install required packages +RUN apt-get update \ + && apt-get install -y coreutils \ + && rm -rf /var/lib/apt/lists/* + +# Set the entrypoint to the script +ENTRYPOINT ["python3", "/app/du.py"]
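
# Example local build, assuming it is run from the repo root so .github/scripts/du.py
# is inside the build context; the tag is arbitrary and du.py's CLI arguments are
# defined by that script, not here:
#   docker build -f images/Dockerfile.dandihub_report_generator -t dandihub-report-generator .
#   docker run --rm dandihub-report-generator <arguments for du.py>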