diff --git a/.aws/terraform-jupyterhub-provisioning-policies.json b/.aws/terraform-jupyterhub-provisioning-policies.json index e0a6cafb..35103551 100644 --- a/.aws/terraform-jupyterhub-provisioning-policies.json +++ b/.aws/terraform-jupyterhub-provisioning-policies.json @@ -4,69 +4,7 @@ { "Effect": "Allow", "Action": [ - "ec2:AllocateAddress", - "ec2:AssociateAddress", - "ec2:AssociateRouteTable", - "ec2:AssociateVpcCidrBlock", - "ec2:AttachInternetGateway", - "ec2:AttachNetworkInterface", - "ec2:AuthorizeSecurityGroupEgress", - "ec2:AuthorizeSecurityGroupIngress", - "ec2:CreateInternetGateway", - "ec2:CreateLaunchTemplate", - "ec2:CreateLaunchTemplateVersion", - "ec2:CreateNatGateway", - "ec2:CreateNetworkAcl", - "ec2:CreateNetworkAclEntry", - "ec2:CreateNetworkInterface", - "ec2:CreateNetworkInterfacePermission", - "ec2:CreateRoute", - "ec2:CreateRouteTable", - "ec2:CreateSecurityGroup", - "ec2:CreateSubnet", - "ec2:CreateTags", - "ec2:CreateVpc", - "ec2:DeleteInternetGateway", - "ec2:DeleteLaunchTemplate", - "ec2:DeleteLaunchTemplateVersions", - "ec2:DeleteNatGateway", - "ec2:DeleteNetworkAcl", - "ec2:DeleteNetworkAclEntry", - "ec2:DeleteNetworkInterface", - "ec2:DeleteRoute", - "ec2:DeleteRouteTable", - "ec2:DeleteSecurityGroup", - "ec2:DeleteSubnet", - "ec2:DeleteTags", - "ec2:DeleteVpc", - "ec2:DescribeAddresses", - "ec2:DescribeAddressesAttribute", - "ec2:DescribeAvailabilityZones", - "ec2:DescribeInternetGateways", - "ec2:DescribeLaunchTemplateVersions", - "ec2:DescribeLaunchTemplates", - "ec2:DescribeNatGateways", - "ec2:DescribeNetworkAcls", - "ec2:DescribeNetworkInterfacePermissions", - "ec2:DescribeNetworkInterfaces", - "ec2:DescribeRouteTables", - "ec2:DescribeSecurityGroupRules", - "ec2:DescribeSecurityGroups", - "ec2:DescribeSubnets", - "ec2:DescribeVpcAttribute", - "ec2:DescribeVpcs", - "ec2:DetachInternetGateway", - "ec2:DetachNetworkInterface", - "ec2:DisassociateAddress", - "ec2:DisassociateRouteTable", - "ec2:DisassociateVpcCidrBlock", - "ec2:ModifyNetworkInterfaceAttribute", - "ec2:ModifyVpcAttribute", - "ec2:ReleaseAddress", - "ec2:ReplaceRoute", - "ec2:RevokeSecurityGroupEgress", - "ec2:RevokeSecurityGroupIngress", - "ec2:RunInstances", + "ec2:*", "ecr-public:GetAuthorizationToken", "eks:*", "elasticfilesystem:CreateFileSystem", diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py new file mode 100755 index 00000000..c523123a --- /dev/null +++ b/.github/scripts/calculate-directory-stats.py @@ -0,0 +1,193 @@ +#!/usr/bin/env python3 + +import os +import csv +import json +import sys +import unittest +from collections import defaultdict +from pathlib import Path +from pprint import pprint +from typing import Iterable + + +def propagate_dir(stats, current_parent, previous_parent): + assert os.path.isabs(current_parent) == os.path.isabs( + previous_parent + ), "current_parent and previous_parent must both be abspath or both be relpath" + highest_common = os.path.commonpath([current_parent, previous_parent]) + assert highest_common, "highest_common must either be a target directory or /" + + path_to_propagate = os.path.relpath(previous_parent, highest_common) + # leaves off last to avoid propagating to the path we are propagating from + nested_dir_list = path_to_propagate.split(os.sep)[:-1] + # Add each dir count to all ancestors up to highest common dir + while nested_dir_list: + working_dir = os.path.join(highest_common, *nested_dir_list) + stats[working_dir]["file_count"] += stats[previous_parent]["file_count"] + 
stats[working_dir]["total_size"] += stats[previous_parent]["total_size"] + nested_dir_list.pop() + previous_parent = working_dir + stats[highest_common]["file_count"] += stats[previous_parent]["file_count"] + stats[highest_common]["total_size"] += stats[previous_parent]["total_size"] + + +def generate_directory_statistics(data: Iterable[str]): + # Assumes dirs are listed depth first (files are listed prior to directories) + + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + previous_parent = "" + for filepath, size, modified, created, error in data: + # TODO if error is not None: + this_parent = os.path.dirname(filepath) + stats[this_parent]["file_count"] += 1 + stats[this_parent]["total_size"] += int(size) + + if previous_parent == this_parent: + continue + # going deeper + elif not previous_parent or previous_parent == os.path.dirname(this_parent): + previous_parent = this_parent + continue + else: # previous dir done + propagate_dir(stats, this_parent, previous_parent) + previous_parent = this_parent + + # Run a final time with the root directory as this parent + # During final run, leading dir cannot be empty string, propagate_dir requires + # both to be abspath or both to be relpath + leading_dir = previous_parent.split(os.sep)[0] or "/" + propagate_dir(stats, leading_dir, previous_parent) + return stats + + +def iter_file_metadata(file_path): + """ + Reads a tsv and returns an iterable that yields one row of file metadata at + a time, excluding comments. + """ + file_path = Path(file_path) + with file_path.open(mode="r", newline="", encoding="utf-8") as file: + reader = csv.reader(file, delimiter="\t") + for row in reader: + # Skip empty lines or lines starting with '#' + if not row or row[0].startswith("#"): + continue + yield row + +def update_stats(stats, directory, stat): + stats["total_size"] += stat["total_size"] + stats["file_count"] += stat["file_count"] + + # Caches track directories, but not report as a whole + if stats.get("directories") is not None: + stats["directories"].append(directory) + +def main(): + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + input_tsv_file = sys.argv[1] + username = input_tsv_file.split("-index.tsv")[0] + + data = iter_file_metadata(input_tsv_file) + stats = generate_directory_statistics(data) + cache_types = ["pycache", "user_cache", "yarn_cache", "pip_cache", "nwb_cache"] + report_stats = { + "total_size": 0, + "file_count": 0, + "caches": { + cache_type: {"total_size": 0, "file_count": 0, "directories": []} + for cache_type in cache_types + } + } + + # print(f"{directory}: File count: {stat['file_count']}, Total Size: {stat['total_size']}") + for directory, stat in stats.items(): + if directory.endswith("__pycache__"): + update_stats(report_stats["caches"]["pycache"], directory, stat) + elif directory.endswith(f"{username}/.cache"): + update_stats(report_stats["caches"]["user_cache"], directory, stat) + elif directory.endswith(".cache/yarn"): + update_stats(report_stats["caches"]["yarn_cache"], directory, stat) + elif directory.endswith(".cache/pip"): + update_stats(report_stats["caches"]["pip_cache"], directory, stat) + elif directory == username: + update_stats(report_stats, username, stat) + + OUTPUT_DIR = "/home/austin/hub-user-reports/" + os.makedirs(OUTPUT_DIR, exist_ok=True) + with open(f"{OUTPUT_DIR}{username}-report.json", "w") as out: + json.dump(report_stats, out) + + + sorted_dirs = sorted(stats.items(), key=lambda x: x[1]['total_size'], reverse=True) + print(f"Finished {username} 
with Total {report_stats["total_size"]}") + + +class TestDirectoryStatistics(unittest.TestCase): + def test_propagate_dir(self): + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + stats["a/b/c"] = {"total_size": 100, "file_count": 3} + stats["a/b"] = {"total_size": 10, "file_count": 0} + stats["a"] = {"total_size": 1, "file_count": 0} + + propagate_dir(stats, "a", "a/b/c") + self.assertEqual(stats["a"]["file_count"], 3) + self.assertEqual(stats["a/b"]["file_count"], 3) + self.assertEqual(stats["a"]["total_size"], 111) + + def test_propagate_dir_abs_path(self): + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + stats["/a/b/c"] = {"total_size": 0, "file_count": 3} + stats["/a/b"] = {"total_size": 0, "file_count": 0} + stats["/a"] = {"total_size": 0, "file_count": 0} + + propagate_dir(stats, "/a", "/a/b/c") + self.assertEqual(stats["/a"]["file_count"], 3) + self.assertEqual(stats["/a/b"]["file_count"], 3) + + def test_propagate_dir_files_in_all(self): + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + stats["a/b/c"] = {"total_size": 0, "file_count": 3} + stats["a/b"] = {"total_size": 0, "file_count": 2} + stats["a"] = {"total_size": 0, "file_count": 1} + + propagate_dir(stats, "a", "a/b/c") + self.assertEqual(stats["a"]["file_count"], 6) + self.assertEqual(stats["a/b"]["file_count"], 5) + + def test_generate_directory_statistics(self): + sample_data = [ + ("a/b/file3.txt", 3456, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/file1.txt", 1234, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/file2.txt", 2345, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/d/file4.txt", 4567, "2024-12-01", "2024-12-02", "OK"), + ("a/e/file3.txt", 5678, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/file1.txt", 6789, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/file2.txt", 7890, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/g/file4.txt", 8901, "2024-12-01", "2024-12-02", "OK"), + ] + stats = generate_directory_statistics(sample_data) + self.assertEqual(stats["a/b/c/d"]["file_count"], 1) + self.assertEqual(stats["a/b/c"]["file_count"], 3) + self.assertEqual(stats["a/b"]["file_count"], 4) + self.assertEqual(stats["a/e/f/g"]["file_count"], 1) + self.assertEqual(stats["a/e/f"]["file_count"], 3) + self.assertEqual(stats["a/e"]["file_count"], 4) + self.assertEqual(stats["a"]["file_count"], 8) + + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "test": + unittest.main( + argv=sys.argv[:1] + ) # Run tests if "test" is provided as an argument + else: + try: + main() + except Exception as e: + # print(f"FAILED ------------------------------ {sys.argv[1]}") + # raise(e) + pass diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh new file mode 100755 index 00000000..a790d927 --- /dev/null +++ b/.github/scripts/cleanup-ec2.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +set -e + +# Load environment variables from the file if they are not already set +ENV_FILE=".ec2-session.env" +if [ -f "$ENV_FILE" ]; then + echo "Loading environment variables from $ENV_FILE..." + source "$ENV_FILE" +else + echo "Warning: Environment file $ENV_FILE not found." +fi + +# Ensure required environment variables are set +if [ -z "$INSTANCE_ID" ]; then + echo "Error: INSTANCE_ID is not set. Cannot proceed with cleanup." + exit 1 +fi + +if [ -z "$ALLOC_ID" ]; then + echo "Error: ALLOC_ID is not set. Cannot proceed with cleanup." + exit 1 +fi + +# Check for AWS CLI and credentials +if ! command -v aws &>/dev/null; then + echo "Error: AWS CLI is not installed. 
Please install it and configure your credentials." + exit 1 +fi + +if ! aws sts get-caller-identity &>/dev/null; then + echo "Error: Unable to access AWS. Ensure your credentials are configured correctly." + exit 1 +fi + +# Terminate EC2 instance +echo "Terminating EC2 instance with ID: $INSTANCE_ID..." +if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --no-cli-pager; then + echo "Instance termination initiated. Waiting for the instance to terminate..." + if aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID"; then + echo "Instance $INSTANCE_ID has been successfully terminated." + else + echo "Warning: Instance $INSTANCE_ID may not have terminated correctly." + fi +else + echo "Warning: Failed to terminate instance $INSTANCE_ID. It may already be terminated." +fi + +# Release Elastic IP +echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID..." +if aws ec2 release-address --allocation-id "$ALLOC_ID"; then + echo "Elastic IP with Allocation ID $ALLOC_ID has been successfully released." +else + echo "Warning: Failed to release Elastic IP with Allocation ID $ALLOC_ID. It may already be released." +fi + +# Cleanup environment file +if [ -f "$ENV_FILE" ]; then + echo "Removing environment file $ENV_FILE..." + rm -f "$ENV_FILE" +fi + +echo "Cleanup complete." diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py new file mode 100755 index 00000000..5f8cb661 --- /dev/null +++ b/.github/scripts/create-file-index.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +import os +import csv +import time +import sys +from datetime import datetime +from pathlib import Path + +OUTPUT_DIR = "/tmp/hub-user-indexes" + +class MetadataWriter: + def __init__(self, output_path): + self.output_path = Path(output_path) + self.start_time = None + self.end_time = None + self.meta = { + "index_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "duration": None, + "total_files": 0, + "total_size": 0, + } + self.file = None + self.writer = None + + def start(self): + """Initialize the metadata and open the file for writing.""" + self.start_time = time.time() + self.file = self.output_path.open(mode="w", newline="", encoding="utf-8") + self.writer = csv.writer(self.file, delimiter="\t") + self.writer.writerow(["#file_name", "file_size", "file_type", "custom_metadata"]) + + def write_row(self, file_name, file_size, created, modified, error): + """Write data for a file.""" + if not self.writer: + raise RuntimeError("Writer not initialized.") + if error is not None: + self.writer.writerow([file_name, "-", "-", "-", error]) + else: + self.writer.writerow([file_name, file_size, created, modified, "OK"]) + + self.meta["total_files"] += 1 + self.meta["total_size"] += file_size + + def finish(self): + """Finalize metadata, write it to the file, and close the file.""" + if not self.writer: + raise RuntimeError("Writer not initialized.") + self.end_time = time.time() + self.meta["duration"] = self.end_time - self.start_time + + self.file.write("\n# Execution Metadata\n") + for key, value in self.meta.items(): + self.file.write(f"# {key}: {value}\n") + + self.file.close() + print(f"Directory {self.output_path} complete, Duration: {self.meta['duration']:.2f}, Total Files: {self.meta['total_files']}, Total Size: {self.meta['total_size']}") + + def get_meta(self): + """Return the meta-metadata dictionary.""" + return self.meta + + +def directory_index(directory): + for root, dirs, files in os.walk(directory): + for name in files: + filepath = os.path.join(root, name) + 
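            # stat without following symlinks so broken links are reported as-is;
            # per-file stat failures are recorded in the error column rather than
            # aborting the whole walk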
try: + stat_result = os.stat(filepath, follow_symlinks=False) + except (FileNotFoundError, PermissionError) as e: + size = modified = created = None + error = str(e) + else: + size = stat_result.st_size + modified = time.ctime(stat_result.st_mtime) + created = time.ctime(stat_result.st_ctime) + error = None + yield filepath, size, modified, created, error + +# Ensure the script is called with the required arguments +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + directory = sys.argv[1] + + os.makedirs(OUTPUT_DIR, exist_ok=True) + output_file = f"{OUTPUT_DIR}/{directory}-index.tsv" + + file_index = MetadataWriter(output_file) + file_index.start() + + for filename, size, created, modified, error in directory_index(directory): + file_index.write_row(filename, size, created, modified, error) + + file_index.finish() diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh new file mode 100755 index 00000000..0c8ab676 --- /dev/null +++ b/.github/scripts/launch-ec2.sh @@ -0,0 +1,137 @@ +#!/usr/bin/env bash + +set -e + +# Check for AWS CLI and credentials +if ! command -v aws &>/dev/null; then + echo "Error: AWS CLI is not installed. Please install it and configure your credentials." + exit 1 +fi + +if ! aws sts get-caller-identity &>/dev/null; then + echo "Error: Unable to access AWS. Ensure your credentials are configured correctly." + exit 1 +fi + +# Set variables +AWS_REGION="us-east-2" +# TODO document that this key needs to be created +KEY_NAME="dandihub-gh-actions" +# TODO create if DNE +# allow gh-actions to ssh into ec2 job instance from anywhere +SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e" +# TODO retrieve subnet id (public, created by dandi-hub eks-dandihub-public-us-east-2a) +SUBNET_ID="subnet-0f544cca61ccd2804" +AMI_ID="ami-0c80e2b6ccb9ad6d1" +EFS_ID="fs-02aac16c4c6c2dc27" +LOCAL_SCRIPTS_DIR=".github/scripts" +REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts" +MOUNT_POINT="/mnt/efs" +ENV_FILE=".ec2-session.env" + +# Ensure the environment file is writable +echo "# Environment variables for EC2 session" > $ENV_FILE +echo "# Auto-generated by launch script on $(date)" >> $ENV_FILE + +# Run EC2 instance +echo "Launching EC2 instance..." +export INSTANCE_ID=$(aws ec2 run-instances \ + --image-id $AMI_ID \ + --count 1 \ + --instance-type t3.micro \ + --key-name $KEY_NAME \ + --security-group-ids $SECURITY_GROUP_ID \ + --subnet-id $SUBNET_ID \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \ + --query 'Instances[0].InstanceId' \ + --output text) + +if [ -z "$INSTANCE_ID" ]; then + echo "Error: Failed to launch EC2 instance." + exit 1 +fi +echo "Instance ID: $INSTANCE_ID" +echo "export INSTANCE_ID=$INSTANCE_ID" >> $ENV_FILE + +# Wait for instance to initialize +echo "Waiting for instance to reach status OK..." +aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID" + +# Allocate Elastic IP +echo "Allocating Elastic IP..." +export ALLOC_ID=$(aws ec2 allocate-address \ + --tag-specifications "ResourceType=elastic-ip,Tags=[{Key=Name,Value=dandihub-gh-actions-eip}]" \ + --query 'AllocationId' \ + --output text) + +if [ -z "$ALLOC_ID" ]; then + echo "Error: Failed to allocate Elastic IP." + exit 1 +fi +echo "Elastic IP Allocation ID: $ALLOC_ID" +echo "export ALLOC_ID=$ALLOC_ID" >> $ENV_FILE + +# Associate Elastic IP with instance +echo "Associating Elastic IP with instance..." 
+export EIP_ASSOC=$(aws ec2 associate-address \ + --instance-id "$INSTANCE_ID" \ + --allocation-id "$ALLOC_ID" \ + --query 'AssociationId' \ + --output text) + +if [ -z "$EIP_ASSOC" ]; then + echo "Error: Failed to associate Elastic IP." + exit 1 +fi + +# Get Elastic IP address +export PUBLIC_IP=$(aws ec2 describe-addresses \ + --allocation-ids "$ALLOC_ID" \ + --query 'Addresses[0].PublicIp' \ + --output text) + +echo "Elastic IP Address: $PUBLIC_IP" +echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE + +# Upload scripts to EC2 instance +echo "Uploading scripts to EC2 instance..." +scp -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \ + $LOCAL_SCRIPTS_DIR/calculate-directory-stats.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ + ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/" + +if [ $? -eq 0 ]; then + echo "Scripts uploaded successfully to $REMOTE_SCRIPTS_DIR on the instance." +else + echo "Error: Failed to upload scripts to the instance." + exit 1 +fi + +# TODO automate +# eks-dandihub-efs sg is created by dandi-hub install +# this sg needs to accept incoming 2049 from the sg created for this ec2 +# sg-061d875722e569724 - eks-dandihub-efs +# aws ec2 authorize-security-group-ingress \ +# --group-id sg-061d875722e569724 \ +# --protocol tcp \ +# --port 2049 \ +# --source-group $SECURITY_GROUP_ID + +echo "Installing dependencies ..." +ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ + "sudo yum install -y amazon-efs-utils pip parallel && \ + pip install con-duct" + +# Mount EFS on the EC2 instance +echo "Mounting EFS on the EC2 instance..." +ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ + "sudo mkdir -p $MOUNT_POINT && \ + sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \ + echo '$EFS_ID:/ $MOUNT_POINT efs defaults,_netdev 0 0' | sudo tee -a /etc/fstab && \ + echo 'EFS mounted at $MOUNT_POINT'" + +# Output SSH command for convenience +echo "To connect to your instance, use:" +echo "ssh -i \$EC2_SSH_KEY ec2-user@$PUBLIC_IP" + +echo "Environment variables saved to $ENV_FILE." +echo "Run 'source $ENV_FILE' to restore the environment variables." diff --git a/NEXTSTEPS b/NEXTSTEPS new file mode 100644 index 00000000..44d01774 --- /dev/null +++ b/NEXTSTEPS @@ -0,0 +1,32 @@ +DONE + - Set AWS_ROLE ARN secret + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + +TODO: + - Create Dockerhub Service account + - set username & token as secrets + - Create Github CI account + - Docker Image Tagging: + - The Docker image is tagged with latest. For better version control, consider using commit SHA or version numbers. + - Log Retrieval: + - The logs from the pod are retrieved to help you verify the script's output. + - Cleanup: + - Deleting the Job ensures that no resources are left running after the workflow completes. + +By making these updates, your workflow will now: + + Include your du.py script in a Docker image. + Build and push this image to DockerHub. + Deploy a Kubernetes Job to your EKS cluster that runs the script. + Wait for the Job to complete and retrieve logs. + Clean up resources after execution. + +Feel free to ask if you need further assistance or clarification on any of these steps! 
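
A minimal sketch of that Job flow from a CI runner (the manifest path, Job name, and
timeout below are placeholders, not values defined in this repo):

    # deploy the report-generation Job to the EKS cluster
    kubectl apply -f disk-usage-report-job.yaml

    # wait for the Job to finish, then retrieve its logs
    kubectl wait --for=condition=complete job/disk-usage-report --timeout=30m
    kubectl logs job/disk-usage-report > disk-usage-report.log

    # clean up so no resources are left running after the workflow completes
    kubectl delete job disk-usage-report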
+ + +- Get image pushing +- create private gh repository under dandi org for reports + + + diff --git a/images/Dockerfile.dandihub_report_generator b/images/Dockerfile.dandihub_report_generator new file mode 100644 index 00000000..5f460084 --- /dev/null +++ b/images/Dockerfile.dandihub_report_generator @@ -0,0 +1,15 @@ +FROM python:3.9-slim + +# Set the working directory +WORKDIR /app + +# Copy the du.py script into the container +COPY .github/scripts/du.py /app/du.py + +# Install required packages +RUN apt-get update \ + && apt-get install -y coreutils \ + && rm -rf /var/lib/apt/lists/* + +# Set the entrypoint to the script +ENTRYPOINT ["python3", "/app/du.py"]
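
# Example local build, assuming it is run from the repo root so .github/scripts/du.py
# is inside the build context; the tag is arbitrary and du.py's CLI arguments are
# defined by that script, not here:
#   docker build -f images/Dockerfile.dandihub_report_generator -t dandihub-report-generator .
#   docker run --rm dandihub-report-generator <arguments for du.py>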