From 30e9cbf6fef0502e2ef2895c242922d62d60b1fe Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:05:15 -0500 Subject: [PATCH 01/96] Inital commit to add GH action to generate report --- .github/workflows/report.yaml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 .github/workflows/report.yaml diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml new file mode 100644 index 00000000..96758af1 --- /dev/null +++ b/.github/workflows/report.yaml @@ -0,0 +1,31 @@ +name: Generate Data Usage Report + +on: + pull_request: + branches: + - main + +jobs: + deploy: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # TODO param region + aws-region: us-east-2 + + - name: Configure kubectl with AWS EKS + # TODO param name, region + run: | + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 + + - name: Sanity check + run: | + kubectl get pods -n jupyterhub From 3bcba913a70c3845675e88525cecd6b828263257 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:13:59 -0500 Subject: [PATCH 02/96] Assume Jupyterhub Provisioning Role --- .github/workflows/report.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 96758af1..829dc77c 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -21,6 +21,16 @@ jobs: # TODO param region aws-region: us-east-2 + - name: Assume JupyterhubProvisioningRole + # TODO param ProvisioningRoleARN and name ^ + run: | + ROLE_ARN="arn:aws:iam::278212569472:role/JupyterhubProvisioningRole" + CREDS=$(aws sts assume-role --role-arn $ROLE_ARN --role-session-name "GitHubActionsSession") + export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') + export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') + + - name: Configure kubectl with AWS EKS # TODO param name, region run: | From b5cdcf3d3760a64f84e751d50705248baa23e26b Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:16:02 -0500 Subject: [PATCH 03/96] Fixup: indent --- .github/workflows/report.yaml | 58 +++++++++++++++++------------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 829dc77c..162b09f3 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -10,32 +10,32 @@ jobs: runs-on: ubuntu-latest steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v3 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # TODO param region - aws-region: us-east-2 - - - name: Assume JupyterhubProvisioningRole - # TODO param ProvisioningRoleARN and name ^ - run: | - ROLE_ARN="arn:aws:iam::278212569472:role/JupyterhubProvisioningRole" - CREDS=$(aws sts assume-role --role-arn $ROLE_ARN --role-session-name "GitHubActionsSession") - export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r 
'.Credentials.SecretAccessKey') - export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - - - - name: Configure kubectl with AWS EKS - # TODO param name, region - run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 - - - name: Sanity check - run: | - kubectl get pods -n jupyterhub + - name: Checkout code + uses: actions/checkout@v3 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # TODO param region + aws-region: us-east-2 + + - name: Assume JupyterhubProvisioningRole + # TODO param ProvisioningRoleARN and name ^ + run: | + ROLE_ARN="arn:aws:iam::278212569472:role/JupyterhubProvisioningRole" + CREDS=$(aws sts assume-role --role-arn $ROLE_ARN --role-session-name "GitHubActionsSession") + export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') + export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') + + + - name: Configure kubectl with AWS EKS + # TODO param name, region + run: | + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 + + - name: Sanity check + run: | + kubectl get pods -n jupyterhub From 6e118cc6111ede09c6f4073dd66e71daa8734958 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:17:03 -0500 Subject: [PATCH 04/96] Rename job --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 162b09f3..5b1f0537 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -6,7 +6,7 @@ on: - main jobs: - deploy: + generate_data_usage_report: runs-on: ubuntu-latest steps: From 5062f08a6542ef93d4afeaabd4c0018348dd4c12 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:23:29 -0500 Subject: [PATCH 05/96] Add assumed role to update-kubeconfig --- .github/workflows/report.yaml | 4 ++-- README.md | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 5b1f0537..82298758 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -32,9 +32,9 @@ jobs: - name: Configure kubectl with AWS EKS - # TODO param name, region + # TODO param name, region role-arn run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn arn:aws:iam::278212569472:role/JupyterhubProvisioningRole - name: Sanity check run: | diff --git a/README.md b/README.md index 5af6edf5..4337f54f 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # Dandihub +TODO + - add provisioning role to cluser :q + - + This Terraform blueprint creates a Kubernetes environment (EKS) and installs JupyterHub. Based on [AWS Data on EKS JupyterHub](https://github.com/awslabs/data-on-eks/tree/main/ai-ml/jupyterhub). 
## Table of Contents From d21a3a9bfb1ebc63b6e587beeb22d1de0643111f Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:26:10 -0500 Subject: [PATCH 06/96] No need to add ProvisioningRole to masters --- README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/README.md b/README.md index 4337f54f..5af6edf5 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,5 @@ # Dandihub -TODO - - add provisioning role to cluser :q - - - This Terraform blueprint creates a Kubernetes environment (EKS) and installs JupyterHub. Based on [AWS Data on EKS JupyterHub](https://github.com/awslabs/data-on-eks/tree/main/ai-ml/jupyterhub). ## Table of Contents From 403028f39e93af601554e71a9b62bd1a51dc68ae Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:47:01 -0500 Subject: [PATCH 07/96] Deploy a pod to the cluster, and schedule with Karpenter --- .github/manifests/hello-world.yaml | 20 ++++++++++++++++++++ .github/workflows/report.yaml | 20 ++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 .github/manifests/hello-world.yaml diff --git a/.github/manifests/hello-world.yaml b/.github/manifests/hello-world.yaml new file mode 100644 index 00000000..1977f336 --- /dev/null +++ b/.github/manifests/hello-world.yaml @@ -0,0 +1,20 @@ +# manifests/hello-world-pod.yaml +apiVersion: v1 +kind: Pod +metadata: + name: hello-world-pod +spec: + containers: + - name: hello + image: busybox + command: ['sh', '-c', 'echo Hello, World! && sleep 30'] + nodeSelector: + NodeGroupType: default + NodePool: default + hub.jupyter.org/node-purpose: user + tolerations: + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 82298758..d8dff659 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -39,3 +39,23 @@ jobs: - name: Sanity check run: | kubectl get pods -n jupyterhub + + # Step 4: Deploy Hello World Pod from manifest + - name: Deploy Hello World Pod + run: | + kubectl apply -f manifests/hello-world-pod.yaml + + # Step 5: Wait for Pod to Complete + - name: Wait for Hello World Pod to complete + run: | + kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=60s + + # Step 6: Get Pod Logs to verify it ran successfully + - name: Get Hello World Pod logs + run: | + kubectl logs hello-world-pod + + # Step 7: Cleanup - Delete the Pod + - name: Delete Hello World Pod + run: | + kubectl delete pod hello-world-pod From 92b9925821c2ca04fb3a37257c294efaf3553de4 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:48:20 -0500 Subject: [PATCH 08/96] Fixup: correct path to pod manifest --- .github/workflows/report.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index d8dff659..a498d99f 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -39,11 +39,11 @@ jobs: - name: Sanity check run: | kubectl get pods -n jupyterhub - + # Step 4: Deploy Hello World Pod from manifest - name: Deploy Hello World Pod run: | - kubectl apply -f manifests/hello-world-pod.yaml + kubectl apply -f .github/manifests/hello-world-pod.yaml # Step 5: Wait for Pod to Complete - name: Wait for Hello World Pod to complete From 478a31f1d96d11e0c2809fe1df42d3e89b7a35d2 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:49:42 -0500 Subject: [PATCH 09/96] Fixup again ugh, rename file --- 
.github/manifests/{hello-world.yaml => hello-world-pod.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/manifests/{hello-world.yaml => hello-world-pod.yaml} (100%) diff --git a/.github/manifests/hello-world.yaml b/.github/manifests/hello-world-pod.yaml similarity index 100% rename from .github/manifests/hello-world.yaml rename to .github/manifests/hello-world-pod.yaml From 9db914e57232a5be6b11e6fd743726aaada42682 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 25 Sep 2024 15:59:09 -0500 Subject: [PATCH 10/96] Delete Pod even if previous step times out (Also increase timeout) --- .github/workflows/report.yaml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index a498d99f..bc1d0882 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -48,14 +48,17 @@ jobs: # Step 5: Wait for Pod to Complete - name: Wait for Hello World Pod to complete run: | - kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=60s + kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes + continue-on-error: true # Allow the workflow to continue even if this step fails - # Step 6: Get Pod Logs to verify it ran successfully + # Step 6: Get Pod Logs to verify it ran successfully, only if Step 5 succeeds - name: Get Hello World Pod logs run: | kubectl logs hello-world-pod + if: ${{ success() }} # Only run this step if the previous step was successful - # Step 7: Cleanup - Delete the Pod + # Step 7: Cleanup - Always run this step, even if previous steps fail - name: Delete Hello World Pod run: | kubectl delete pod hello-world-pod + if: ${{ always() }} # Always run this step, even if other steps fail From 8458d01d83814a68b7f323f25c0890f777ccf6dc Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 11 Oct 2024 12:52:23 -0500 Subject: [PATCH 11/96] Hack out initial du --- .github/manifests/disk-usage-report-job.yaml | 23 ++++++ .github/scripts/du.py | 55 ++++++++++++++ .github/workflows/report.yaml | 77 ++++++++++++++------ NEXTSTEPS | 32 ++++++++ images/Dockerfile.dandihub_report_generator | 15 ++++ 5 files changed, 179 insertions(+), 23 deletions(-) create mode 100644 .github/manifests/disk-usage-report-job.yaml create mode 100755 .github/scripts/du.py create mode 100644 NEXTSTEPS create mode 100644 images/Dockerfile.dandihub_report_generator diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml new file mode 100644 index 00000000..2c945365 --- /dev/null +++ b/.github/manifests/disk-usage-report-job.yaml @@ -0,0 +1,23 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: disk-usage-report-job +spec: + template: + metadata: + labels: + app: disk-usage-report + spec: + containers: + - name: disk-usage-report + image: IMAGE_PLACEHOLDER + restartPolicy: Never + nodeSelector: + NodeGroupType: default + NodePool: default + hub.jupyter.org/node-purpose: user + tolerations: + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" diff --git a/.github/scripts/du.py b/.github/scripts/du.py new file mode 100755 index 00000000..29bccada --- /dev/null +++ b/.github/scripts/du.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +import os +import subprocess +import sys +import json + +OUTPUT_FILE = "du_report.json" +SIZE_THRESHOLD_GB = 1 +SIZE_THRESHOLD_BYTES = SIZE_THRESHOLD_GB * 1024 * 1024 * 1024 + +# Function to calculate disk usage of a directory in bytes +def 
get_disk_usage_bytes(path): + result = subprocess.run(['du', '-sb', path], capture_output=True, text=True) + size_str = result.stdout.split()[0] # Get the size in bytes (du -sb gives size in bytes) + return int(size_str) + +# Function to convert bytes to a human-readable format (e.g., KB, MB, GB) +def bytes_to_human_readable(size_in_bytes): + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if size_in_bytes < 1024: + return f"{size_in_bytes:.2f} {unit}" + size_in_bytes /= 1024 + +def prepare_report(directory): + report = {} + # List user home dirs in the directory and calculate disk usage + for user_dir in os.listdir(directory): + user_path = os.path.join(directory, user_dir) + if os.path.isdir(user_path): + disk_usage_bytes = get_disk_usage_bytes(user_path) + report[user_dir] = { + "disk_usage_bytes": disk_usage_bytes + } + if disk_usage_bytes > SIZE_THRESHOLD_BYTES: + # TODO: Placeholder for other actions + report[user_dir]["action"] = f"Directory size exceeds {SIZE_THRESHOLD_BYTES / (1024**3):.2f}GB, further action taken." + else: + report[user_dir]["action"] = "No action required." + + for user, data in report.items(): + data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) + + with open(OUTPUT_FILE, 'w') as f: + json.dump(report, f, indent=4) + + print(f"Disk usage report generated at {OUTPUT_FILE}") + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: du.py ") + else: + directory = sys.argv[1] + prepare_report(directory) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index bc1d0882..20ea8155 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -13,52 +13,83 @@ jobs: - name: Checkout code uses: actions/checkout@v3 + - name: Log in to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v3 + with: + context: . 
+ file: images/Dockerfile.dandihub_report_generator + push: true + tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub_report_generator:latest + - name: Configure AWS credentials uses: aws-actions/configure-aws-credentials@v3 with: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # TODO param region aws-region: us-east-2 - - name: Assume JupyterhubProvisioningRole - # TODO param ProvisioningRoleARN and name ^ + - name: Assume ProvisioningRole run: | - ROLE_ARN="arn:aws:iam::278212569472:role/JupyterhubProvisioningRole" - CREDS=$(aws sts assume-role --role-arn $ROLE_ARN --role-session-name "GitHubActionsSession") + CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - - name: Configure kubectl with AWS EKS - # TODO param name, region role-arn run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn arn:aws:iam::278212569472:role/JupyterhubProvisioningRole + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} + # TODO remove - name: Sanity check run: | kubectl get pods -n jupyterhub - # Step 4: Deploy Hello World Pod from manifest - - name: Deploy Hello World Pod + - name: Replace image placeholder in manifest + run: | + sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + + - name: Deploy Disk Usage Report Job + run: | + kubectl apply -f .github/manifests/disk-usage-report-job.yaml + + # TODO should timeout be longer? 
+ - name: Wait for Disk Usage Report Job to complete + run: | + kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s + continue-on-error: true + + - name: Save Pod logs to file + run: | + POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + kubectl logs $POD_NAME > disk_usage_report.log + continue-on-error: true + + # continue-on-error for previous steps so we delete the job + - name: Delete Disk Usage Report Job run: | - kubectl apply -f .github/manifests/hello-world-pod.yaml + kubectl delete job disk-usage-report-job - # Step 5: Wait for Pod to Complete - - name: Wait for Hello World Pod to complete + - name: Clone dandi-hub-usage-reports repository run: | - kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes - continue-on-error: true # Allow the workflow to continue even if this step fails + git clone https://github.com/dandi/dandi-hub-usage-reports.git + cd dandi-hub-usage-reports - # Step 6: Get Pod Logs to verify it ran successfully, only if Step 5 succeeds - - name: Get Hello World Pod logs + - name: Copy log file to repository run: | - kubectl logs hello-world-pod - if: ${{ success() }} # Only run this step if the previous step was successful + DATE=$(date +'%Y-%m-%d') + mv ../disk_usage_report.log $DATE_disk_usage_report.log - # Step 7: Cleanup - Always run this step, even if previous steps fail - - name: Delete Hello World Pod + # Step 13: Commit and push logs to the repository + - name: Commit and push logs run: | - kubectl delete pod hello-world-pod - if: ${{ always() }} # Always run this step, even if other steps fail + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + git add disk_usage_report.log + git commit -m "Add disk usage report log" + git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git diff --git a/NEXTSTEPS b/NEXTSTEPS new file mode 100644 index 00000000..44d01774 --- /dev/null +++ b/NEXTSTEPS @@ -0,0 +1,32 @@ +DONE + - Set AWS_ROLE ARN secret + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + +TODO: + - Create Dockerhub Service account + - set username & token as secrets + - Create Github CI account + - Docker Image Tagging: + - The Docker image is tagged with latest. For better version control, consider using commit SHA or version numbers. + - Log Retrieval: + - The logs from the pod are retrieved to help you verify the script's output. + - Cleanup: + - Deleting the Job ensures that no resources are left running after the workflow completes. + +By making these updates, your workflow will now: + + Include your du.py script in a Docker image. + Build and push this image to DockerHub. + Deploy a Kubernetes Job to your EKS cluster that runs the script. + Wait for the Job to complete and retrieve logs. + Clean up resources after execution. + +Feel free to ask if you need further assistance or clarification on any of these steps! 
+ + +- Get image pushing +- create private gh repository under dandi org for reports + + + diff --git a/images/Dockerfile.dandihub_report_generator b/images/Dockerfile.dandihub_report_generator new file mode 100644 index 00000000..5f460084 --- /dev/null +++ b/images/Dockerfile.dandihub_report_generator @@ -0,0 +1,15 @@ +FROM python:3.9-slim + +# Set the working directory +WORKDIR /app + +# Copy the du.py script into the container +COPY .github/scripts/du.py /app/du.py + +# Install required packages +RUN apt-get update \ + && apt-get install -y coreutils \ + && rm -rf /var/lib/apt/lists/* + +# Set the entrypoint to the script +ENTRYPOINT ["python3", "/app/du.py"] From 79994557f740d42c24e3122a3d921afaeb91f19d Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 12:38:21 -0600 Subject: [PATCH 12/96] tmp comment out job deployment, test dockerhub build --- .github/workflows/report.yaml | 132 +++++++++++++++++----------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 20ea8155..c58da482 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -27,69 +27,69 @@ jobs: push: true tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub_report_generator:latest - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v3 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-2 - - - name: Assume ProvisioningRole - run: | - CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") - export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') - export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - - - name: Configure kubectl with AWS EKS - run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} - - # TODO remove - - name: Sanity check - run: | - kubectl get pods -n jupyterhub - - - name: Replace image placeholder in manifest - run: | - sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml - - - name: Deploy Disk Usage Report Job - run: | - kubectl apply -f .github/manifests/disk-usage-report-job.yaml - - # TODO should timeout be longer? 
- - name: Wait for Disk Usage Report Job to complete - run: | - kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s - continue-on-error: true - - - name: Save Pod logs to file - run: | - POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - kubectl logs $POD_NAME > disk_usage_report.log - continue-on-error: true - - # continue-on-error for previous steps so we delete the job - - name: Delete Disk Usage Report Job - run: | - kubectl delete job disk-usage-report-job - - - name: Clone dandi-hub-usage-reports repository - run: | - git clone https://github.com/dandi/dandi-hub-usage-reports.git - cd dandi-hub-usage-reports - - - name: Copy log file to repository - run: | - DATE=$(date +'%Y-%m-%d') - mv ../disk_usage_report.log $DATE_disk_usage_report.log - - # Step 13: Commit and push logs to the repository - - name: Commit and push logs - run: | - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" - git add disk_usage_report.log - git commit -m "Add disk usage report log" - git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git + # - name: Configure AWS credentials + # uses: aws-actions/configure-aws-credentials@v3 + # with: + # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + # aws-region: us-east-2 + # + # - name: Assume ProvisioningRole + # run: | + # CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") + # export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') + # export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') + # export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') + # + # - name: Configure kubectl with AWS EKS + # run: | + # aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} + # + # # TODO remove + # - name: Sanity check + # run: | + # kubectl get pods -n jupyterhub + # + # - name: Replace image placeholder in manifest + # run: | + # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + # + # - name: Deploy Disk Usage Report Job + # run: | + # kubectl apply -f .github/manifests/disk-usage-report-job.yaml + # + # # TODO should timeout be longer? 
+ # - name: Wait for Disk Usage Report Job to complete + # run: | + # kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s + # continue-on-error: true + # + # - name: Save Pod logs to file + # run: | + # POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + # kubectl logs $POD_NAME > disk_usage_report.log + # continue-on-error: true + # + # # continue-on-error for previous steps so we delete the job + # - name: Delete Disk Usage Report Job + # run: | + # kubectl delete job disk-usage-report-job + # + # - name: Clone dandi-hub-usage-reports repository + # run: | + # git clone https://github.com/dandi/dandi-hub-usage-reports.git + # cd dandi-hub-usage-reports + # + # - name: Copy log file to repository + # run: | + # DATE=$(date +'%Y-%m-%d') + # mv ../disk_usage_report.log $DATE_disk_usage_report.log + # + # # Step 13: Commit and push logs to the repository + # - name: Commit and push logs + # run: | + # git config --global user.name "GitHub Actions" + # git config --global user.email "actions@github.com" + # git add disk_usage_report.log + # git commit -m "Add disk usage report log" + # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From d2e65de124902b2b082effcc560658fb260789ff Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 12:42:29 -0600 Subject: [PATCH 13/96] Fixup hyphens for image name --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index c58da482..9bf5526b 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -25,7 +25,7 @@ jobs: context: . file: images/Dockerfile.dandihub_report_generator push: true - tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub_report_generator:latest + tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest # - name: Configure AWS credentials # uses: aws-actions/configure-aws-credentials@v3 From 5e9e7df57be53c21c5b8534c4c59b60fd1a0d5bb Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 12:57:24 -0600 Subject: [PATCH 14/96] Write file to output location --- .github/scripts/du.py | 4 +- .github/workflows/report.yaml | 132 +++++++++++++++++----------------- 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 29bccada..260bd074 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -5,7 +5,7 @@ import sys import json -OUTPUT_FILE = "du_report.json" +OUTPUT_FILE = "/output/du_report.json" SIZE_THRESHOLD_GB = 1 SIZE_THRESHOLD_BYTES = SIZE_THRESHOLD_GB * 1024 * 1024 * 1024 @@ -44,7 +44,7 @@ def prepare_report(directory): with open(OUTPUT_FILE, 'w') as f: json.dump(report, f, indent=4) - print(f"Disk usage report generated at {OUTPUT_FILE}") + print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}") if __name__ == "__main__": diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 9bf5526b..cd7f8f04 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -27,69 +27,69 @@ jobs: push: true tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest - # - name: Configure AWS credentials - # uses: aws-actions/configure-aws-credentials@v3 - # with: - # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - # aws-region: us-east-2 - # - # - 
name: Assume ProvisioningRole - # run: | - # CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") - # export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') - # export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') - # export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - # - # - name: Configure kubectl with AWS EKS - # run: | - # aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} - # - # # TODO remove - # - name: Sanity check - # run: | - # kubectl get pods -n jupyterhub - # - # - name: Replace image placeholder in manifest - # run: | - # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml - # - # - name: Deploy Disk Usage Report Job - # run: | - # kubectl apply -f .github/manifests/disk-usage-report-job.yaml - # - # # TODO should timeout be longer? - # - name: Wait for Disk Usage Report Job to complete - # run: | - # kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s - # continue-on-error: true - # - # - name: Save Pod logs to file - # run: | - # POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - # kubectl logs $POD_NAME > disk_usage_report.log - # continue-on-error: true - # - # # continue-on-error for previous steps so we delete the job - # - name: Delete Disk Usage Report Job - # run: | - # kubectl delete job disk-usage-report-job - # - # - name: Clone dandi-hub-usage-reports repository - # run: | - # git clone https://github.com/dandi/dandi-hub-usage-reports.git - # cd dandi-hub-usage-reports - # - # - name: Copy log file to repository - # run: | - # DATE=$(date +'%Y-%m-%d') - # mv ../disk_usage_report.log $DATE_disk_usage_report.log - # - # # Step 13: Commit and push logs to the repository - # - name: Commit and push logs - # run: | - # git config --global user.name "GitHub Actions" - # git config --global user.email "actions@github.com" - # git add disk_usage_report.log - # git commit -m "Add disk usage report log" - # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + + - name: Assume ProvisioningRole + run: | + CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") + export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') + export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') + export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') + + - name: Configure kubectl with AWS EKS + run: | + aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} + + # TODO remove + - name: Sanity check + run: | + kubectl get pods -n jupyterhub + + - name: Replace image placeholder in manifest + run: | + sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + + - name: Deploy Disk Usage Report Job + run: | + kubectl apply -f .github/manifests/disk-usage-report-job.yaml + + # 
TODO should timeout be longer? + - name: Wait for Disk Usage Report Job to complete + run: | + kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s + continue-on-error: true + + - name: Save Pod logs to file + run: | + POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + kubectl logs $POD_NAME > disk_usage_report.log + continue-on-error: true + + # continue-on-error for previous steps so we delete the job + - name: Delete Disk Usage Report Job + run: | + kubectl delete job disk-usage-report-job + + - name: Clone dandi-hub-usage-reports repository + run: | + git clone https://github.com/dandi/dandi-hub-usage-reports.git + cd dandi-hub-usage-reports + + - name: Copy log file to repository + run: | + DATE=$(date +'%Y-%m-%d') + mv ../disk_usage_report.log $DATE_disk_usage_report.log + + # Step 13: Commit and push logs to the repository + - name: Commit and push logs + run: | + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + git add disk_usage_report.log + git commit -m "Add disk usage report log" + git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From d33973ca4f723915b36a7cd3657e7e35cf61b5af Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 12:59:49 -0600 Subject: [PATCH 15/96] use kubectl cp to retrieve report --- .github/workflows/report.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index cd7f8f04..968bd5a3 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -64,10 +64,10 @@ jobs: kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s continue-on-error: true - - name: Save Pod logs to file + - name: Retrieve generated report file run: | POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - kubectl logs $POD_NAME > disk_usage_report.log + kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub continue-on-error: true # continue-on-error for previous steps so we delete the job @@ -80,16 +80,16 @@ jobs: git clone https://github.com/dandi/dandi-hub-usage-reports.git cd dandi-hub-usage-reports - - name: Copy log file to repository + - name: Copy report file to repository run: | DATE=$(date +'%Y-%m-%d') - mv ../disk_usage_report.log $DATE_disk_usage_report.log + mv ../du_report.json $DATE_du_report.json - # Step 13: Commit and push logs to the repository - - name: Commit and push logs + # Step 13: Commit and push report to the repository + - name: Commit and push report run: | git config --global user.name "GitHub Actions" git config --global user.email "actions@github.com" - git add disk_usage_report.log - git commit -m "Add disk usage report log" + git add $DATE_du_report.json + git commit -m "Add disk usage report for $DATE" git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From 98fecbce80682ffa2379a33ac4f0ca40d6c672a2 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:08:57 -0600 Subject: [PATCH 16/96] Combine run blocks to use vars --- .github/workflows/report.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 968bd5a3..d6837c30 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ 
-78,16 +78,12 @@ jobs: - name: Clone dandi-hub-usage-reports repository run: | git clone https://github.com/dandi/dandi-hub-usage-reports.git - cd dandi-hub-usage-reports - - name: Copy report file to repository + - name: Copy report file to repository, commit and push report run: | + cd dandi-hub-usage-reports DATE=$(date +'%Y-%m-%d') mv ../du_report.json $DATE_du_report.json - - # Step 13: Commit and push report to the repository - - name: Commit and push report - run: | git config --global user.name "GitHub Actions" git config --global user.email "actions@github.com" git add $DATE_du_report.json From 40ae0e85680458fc6caf3acd3385d55a30a533f6 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:09:29 -0600 Subject: [PATCH 17/96] Mount efs and pass arg to du script --- .github/manifests/disk-usage-report-job.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 2c945365..696d8f46 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -11,6 +11,11 @@ spec: containers: - name: disk-usage-report image: IMAGE_PLACEHOLDER + args: + - "/home/" + volumeMounts: + - name: persistent-storage + mountPath: "/home/" restartPolicy: Never nodeSelector: NodeGroupType: default @@ -21,3 +26,7 @@ spec: operator: "Equal" value: "user" effect: "NoSchedule" + volumes: + - name: persistent-storage + persistentVolumeClaim: + claimName: efs-persist From 4c978f7a8b050edecf491bab870a9a81454ec876 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:15:08 -0600 Subject: [PATCH 18/96] Comment out repo pushing, lets see if the report runs --- .github/workflows/report.yaml | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index d6837c30..a12adbdc 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -68,6 +68,7 @@ jobs: run: | POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub + cat du_report.json continue-on-error: true # continue-on-error for previous steps so we delete the job @@ -75,17 +76,17 @@ jobs: run: | kubectl delete job disk-usage-report-job - - name: Clone dandi-hub-usage-reports repository - run: | - git clone https://github.com/dandi/dandi-hub-usage-reports.git - - - name: Copy report file to repository, commit and push report - run: | - cd dandi-hub-usage-reports - DATE=$(date +'%Y-%m-%d') - mv ../du_report.json $DATE_du_report.json - git config --global user.name "GitHub Actions" - git config --global user.email "actions@github.com" - git add $DATE_du_report.json - git commit -m "Add disk usage report for $DATE" - git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git + # - name: Clone dandi-hub-usage-reports repository + # run: | + # git clone https://github.com/dandi/dandi-hub-usage-reports.git + # + # - name: Copy report file to repository, commit and push report + # run: | + # cd dandi-hub-usage-reports + # DATE=$(date +'%Y-%m-%d') + # mv ../du_report.json $DATE_du_report.json + # git config --global user.name "GitHub Actions" + # git config --global user.email "actions@github.com" + # git add $DATE_du_report.json + # git commit -m "Add disk usage report for $DATE" + # git push https://${{ 
secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From 6bd7b82c4024bee5ce02821866d7610f6d690469 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:22:11 -0600 Subject: [PATCH 19/96] Restrict job to asmacdo for testing --- .github/manifests/disk-usage-report-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 696d8f46..c7f60a4a 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -12,7 +12,7 @@ spec: - name: disk-usage-report image: IMAGE_PLACEHOLDER args: - - "/home/" + - "/home/asmacdo" volumeMounts: - name: persistent-storage mountPath: "/home/" From 73c3e80a27e9e0973e02e0f4ac5ccfdc8ad4275f Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:30:46 -0600 Subject: [PATCH 20/96] Sanity check. Just list the directories --- .github/scripts/du.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 260bd074..12e0c0fc 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -41,6 +41,7 @@ def prepare_report(directory): for user, data in report.items(): data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) + with open(OUTPUT_FILE, 'w') as f: json.dump(report, f, indent=4) @@ -51,5 +52,9 @@ def prepare_report(directory): if len(sys.argv) != 2: print("Usage: du.py ") else: - directory = sys.argv[1] - prepare_report(directory) + path = sys.argv[1] + directories = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] + + with open(OUTPUT_FILE, 'w') as f: + f.write("\n".join(directories)) + # prepare_report(directory) From 685dfb1df9365c917b248c5b55304da93a0234e6 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:43:17 -0600 Subject: [PATCH 21/96] Job was deployed, but never assigned to node, back to sanity check --- .github/workflows/report.yaml | 57 +++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index a12adbdc..5889c90c 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -50,32 +50,51 @@ jobs: run: | kubectl get pods -n jupyterhub - - name: Replace image placeholder in manifest + - name: Deploy Hello World Pod run: | - sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + kubectl apply -f .github/manifests/hello-world-pod.yaml - - name: Deploy Disk Usage Report Job + - name: Wait for Hello World Pod to complete run: | - kubectl apply -f .github/manifests/disk-usage-report-job.yaml + kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes + continue-on-error: true # Allow the workflow to continue even if this step fails - # TODO should timeout be longer? 
- - name: Wait for Disk Usage Report Job to complete + - name: Get Hello World Pod logs run: | - kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s - continue-on-error: true + kubectl logs hello-world-pod + if: ${{ success() }} # Only run this step if the previous step was successful - - name: Retrieve generated report file + - name: Delete Hello World Pod run: | - POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub - cat du_report.json - continue-on-error: true - - # continue-on-error for previous steps so we delete the job - - name: Delete Disk Usage Report Job - run: | - kubectl delete job disk-usage-report-job - + kubectl delete pod hello-world-pod + if: ${{ always() }} # Always run this step, even if other steps fail + # + # - name: Replace image placeholder in manifest + # run: | + # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + # + # - name: Deploy Disk Usage Report Job + # run: | + # kubectl apply -f .github/manifests/disk-usage-report-job.yaml + # + # # TODO should timeout be longer? + # - name: Wait for Disk Usage Report Job to complete + # run: | + # kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s + # continue-on-error: true + # + # - name: Retrieve generated report file + # run: | + # POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + # kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub + # cat du_report.json + # continue-on-error: true + # + # # continue-on-error for previous steps so we delete the job + # - name: Delete Disk Usage Report Job + # run: | + # kubectl delete job disk-usage-report-job + # # - name: Clone dandi-hub-usage-reports repository # run: | # git clone https://github.com/dandi/dandi-hub-usage-reports.git From f6afefcc1bfd0c9474016b667b46e56bbdd5894f Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 14:55:31 -0600 Subject: [PATCH 22/96] change from job to pod --- .github/manifests/disk-usage-report-job.yaml | 51 ++++++-------- .github/workflows/report.yaml | 73 ++++++++++---------- 2 files changed, 59 insertions(+), 65 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index c7f60a4a..36ded0fa 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -1,32 +1,27 @@ -apiVersion: batch/v1 -kind: Job +apiVersion: v1 +kind: Pod metadata: name: disk-usage-report-job spec: - template: - metadata: - labels: - app: disk-usage-report - spec: - containers: - - name: disk-usage-report - image: IMAGE_PLACEHOLDER - args: - - "/home/asmacdo" - volumeMounts: - - name: persistent-storage - mountPath: "/home/" - restartPolicy: Never - nodeSelector: - NodeGroupType: default - NodePool: default - hub.jupyter.org/node-purpose: user - tolerations: - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - volumes: + containers: + - name: disk-usage-report + image: IMAGE_PLACEHOLDER + args: + - "/home/asmacdo" + volumeMounts: - name: persistent-storage - persistentVolumeClaim: - claimName: efs-persist + mountPath: "/home/" + restartPolicy: Never + nodeSelector: + NodeGroupType: default + NodePool: default + hub.jupyter.org/node-purpose: 
user + tolerations: + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" + volumes: + - name: persistent-storage + persistentVolumeClaim: + claimName: efs-persist diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 5889c90c..50168ff5 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -50,51 +50,50 @@ jobs: run: | kubectl get pods -n jupyterhub - - name: Deploy Hello World Pod - run: | - kubectl apply -f .github/manifests/hello-world-pod.yaml - - - name: Wait for Hello World Pod to complete - run: | - kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes - continue-on-error: true # Allow the workflow to continue even if this step fails - - - name: Get Hello World Pod logs - run: | - kubectl logs hello-world-pod - if: ${{ success() }} # Only run this step if the previous step was successful - - - name: Delete Hello World Pod - run: | - kubectl delete pod hello-world-pod - if: ${{ always() }} # Always run this step, even if other steps fail - # - # - name: Replace image placeholder in manifest - # run: | - # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml - # - # - name: Deploy Disk Usage Report Job + # - name: Deploy Hello World Pod # run: | - # kubectl apply -f .github/manifests/disk-usage-report-job.yaml + # kubectl apply -f .github/manifests/hello-world-pod.yaml # - # # TODO should timeout be longer? - # - name: Wait for Disk Usage Report Job to complete + # - name: Wait for Hello World Pod to complete # run: | - # kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=300s - # continue-on-error: true + # kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes + # continue-on-error: true # Allow the workflow to continue even if this step fails # - # - name: Retrieve generated report file + # - name: Get Hello World Pod logs # run: | - # POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - # kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub - # cat du_report.json - # continue-on-error: true + # kubectl logs hello-world-pod + # if: ${{ success() }} # Only run this step if the previous step was successful # - # # continue-on-error for previous steps so we delete the job - # - name: Delete Disk Usage Report Job + # - name: Delete Hello World Pod # run: | - # kubectl delete job disk-usage-report-job + # kubectl delete pod hello-world-pod + # if: ${{ always() }} # Always run this step, even if other steps fail # + - name: Replace image placeholder in manifest + run: | + sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + + - name: Deploy Disk Usage Report Job Pod + run: | + kubectl apply -f .github/manifests/disk-usage-report-job.yaml + + # TODO should timeout be longer? 
+ - name: Wait for Disk Usage Report Job to complete + run: | + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=300s + continue-on-error: true + + - name: Retrieve generated report file + run: | + kubectl cp disk-usage-report-job:/output/du_report.json du_report.json + cat du_report.json + continue-on-error: true + + # continue-on-error for previous steps so we delete the job + - name: Delete Disk Usage Report Job + run: | + kubectl delete pod disk-usage-report-job + # - name: Clone dandi-hub-usage-reports repository # run: | # git clone https://github.com/dandi/dandi-hub-usage-reports.git From 6dad759ce332de988ed6f37a8ca73ee197330574 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:00:07 -0600 Subject: [PATCH 23/96] deploy pod to same namespace as pvc --- .github/manifests/disk-usage-report-job.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 36ded0fa..7a5e424c 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Pod metadata: name: disk-usage-report-job + namespace: jupyterhub spec: containers: - name: disk-usage-report @@ -25,3 +26,4 @@ spec: - name: persistent-storage persistentVolumeClaim: claimName: efs-persist + From 3a339374cfd5e1280194815055ac3d9207cc4dfb Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:04:29 -0600 Subject: [PATCH 24/96] Use ns in action --- .github/workflows/report.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 50168ff5..de0ad877 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -80,19 +80,19 @@ jobs: # TODO should timeout be longer? - name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=300s + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=30s -n jupyterhub continue-on-error: true - name: Retrieve generated report file run: | - kubectl cp disk-usage-report-job:/output/du_report.json du_report.json + kubectl cp disk-usage-report-job:/output/du_report.json du_report.json -n jupyterhub cat du_report.json continue-on-error: true # continue-on-error for previous steps so we delete the job - name: Delete Disk Usage Report Job run: | - kubectl delete pod disk-usage-report-job + kubectl delete pod disk-usage-report-job -n jupyterhub # - name: Clone dandi-hub-usage-reports repository # run: | From 1ffb1c90d3129db6adfb76c03bd8a2b5fac98a61 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:09:33 -0600 Subject: [PATCH 25/96] increase timeout to 60s job shouldnt take that long, but this is wall time, includes docker pull, etc --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index de0ad877..4bcf1b43 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -80,7 +80,7 @@ jobs: # TODO should timeout be longer? 
- name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=30s -n jupyterhub + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=60s -n jupyterhub continue-on-error: true - name: Retrieve generated report file From 58e0753db9e53c981c11a86b3ed0700b4c77a6a7 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:15:26 -0600 Subject: [PATCH 26/96] fixup: image name in manifest --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 4bcf1b43..170841f5 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -71,7 +71,7 @@ jobs: # - name: Replace image placeholder in manifest run: | - sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/disk_usage_report:latest"'|' .github/manifests/disk-usage-report-job.yaml + sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml - name: Deploy Disk Usage Report Job Pod run: | From 676775579437f09c77b4eb379af6319a8906e395 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:18:53 -0600 Subject: [PATCH 27/96] increase timeout to 150 took almost 60 sec to start up --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 170841f5..04b264c1 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -80,7 +80,7 @@ jobs: # TODO should timeout be longer? - name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=60s -n jupyterhub + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=150s -n jupyterhub continue-on-error: true - name: Retrieve generated report file From cbf951e7f821ece4bd71d6f34a60b0f4c90ee5b6 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:29:59 -0600 Subject: [PATCH 28/96] override entrypoint so i can debug with exec --- .github/manifests/disk-usage-report-job.yaml | 5 +++-- .github/workflows/report.yaml | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 7a5e424c..e2e81a37 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -7,8 +7,9 @@ spec: containers: - name: disk-usage-report image: IMAGE_PLACEHOLDER - args: - - "/home/asmacdo" + command: ["/bin/sh", "-c", "sleep 300"] + # args: + # - "/home/asmacdo" volumeMounts: - name: persistent-storage mountPath: "/home/" diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 04b264c1..34241ee5 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -80,7 +80,7 @@ jobs: # TODO should timeout be longer? 
- name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=150s -n jupyterhub + kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=360s -n jupyterhub continue-on-error: true - name: Retrieve generated report file From 59eb04549e9e759d21da71a48010a5c370466e4d Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:31:45 -0600 Subject: [PATCH 29/96] bound /home actually meant path was /home/home/asmacdo --- .github/manifests/disk-usage-report-job.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index e2e81a37..81ff08da 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -7,12 +7,12 @@ spec: containers: - name: disk-usage-report image: IMAGE_PLACEHOLDER - command: ["/bin/sh", "-c", "sleep 300"] - # args: - # - "/home/asmacdo" + args: + - "/home/asmacdo" volumeMounts: - name: persistent-storage - mountPath: "/home/" + mountPath: "/home" + subPath: "home" restartPolicy: Never nodeSelector: NodeGroupType: default From db140d55eb1963fc60284bec37a84bd0105ec7c3 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 8 Nov 2024 15:43:35 -0600 Subject: [PATCH 30/96] Create output dir prior to writing report --- .github/scripts/du.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 12e0c0fc..0b2ceb72 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -41,7 +41,7 @@ def prepare_report(directory): for user, data in report.items(): data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) - + os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) with open(OUTPUT_FILE, 'w') as f: json.dump(report, f, indent=4) @@ -55,6 +55,7 @@ def prepare_report(directory): path = sys.argv[1] directories = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] + os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) with open(OUTPUT_FILE, 'w') as f: f.write("\n".join(directories)) # prepare_report(directory) From f90176adebd7339e3a4f55ba79e765b502895896 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 08:54:57 -0600 Subject: [PATCH 31/96] pod back to job --- .github/manifests/disk-usage-report-job.yaml | 54 +++++++++++--------- .github/workflows/report.yaml | 9 ++-- 2 files changed, 34 insertions(+), 29 deletions(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 81ff08da..387aead0 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -1,30 +1,34 @@ -apiVersion: v1 -kind: Pod +apiVersion: v1/batch +kind: Job metadata: name: disk-usage-report-job namespace: jupyterhub spec: - containers: - - name: disk-usage-report - image: IMAGE_PLACEHOLDER - args: - - "/home/asmacdo" - volumeMounts: + template: + metadata: + labels: + app: disk-usage-report + spec: + containers: + - name: disk-usage-report + image: IMAGE_PLACEHOLDER + args: + - "/home/asmacdo" + volumeMounts: + - name: persistent-storage + mountPath: "/home" + subPath: "home" + restartPolicy: Never + nodeSelector: + NodeGroupType: default + NodePool: default + hub.jupyter.org/node-purpose: user + tolerations: + - key: "hub.jupyter.org/dedicated" + operator: "Equal" + value: "user" + effect: "NoSchedule" 
+ volumes: - name: persistent-storage - mountPath: "/home" - subPath: "home" - restartPolicy: Never - nodeSelector: - NodeGroupType: default - NodePool: default - hub.jupyter.org/node-purpose: user - tolerations: - - key: "hub.jupyter.org/dedicated" - operator: "Equal" - value: "user" - effect: "NoSchedule" - volumes: - - name: persistent-storage - persistentVolumeClaim: - claimName: efs-persist - + persistentVolumeClaim: + claimName: efs-persist diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 34241ee5..1c2692ad 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -73,26 +73,27 @@ jobs: run: | sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml - - name: Deploy Disk Usage Report Job Pod + - name: Deploy Disk Usage Report Job run: | kubectl apply -f .github/manifests/disk-usage-report-job.yaml # TODO should timeout be longer? - name: Wait for Disk Usage Report Job to complete run: | - kubectl wait --for=condition=complete pod/disk-usage-report-job --timeout=360s -n jupyterhub + kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub continue-on-error: true - name: Retrieve generated report file run: | - kubectl cp disk-usage-report-job:/output/du_report.json du_report.json -n jupyterhub + POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub cat du_report.json continue-on-error: true # continue-on-error for previous steps so we delete the job - name: Delete Disk Usage Report Job run: | - kubectl delete pod disk-usage-report-job -n jupyterhub + kubectl delete job disk-usage-report-job -n jupyterhub # - name: Clone dandi-hub-usage-reports repository # run: | From c31ccddc2667a94a00cd3b8bfcf051347b09bf12 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 09:18:16 -0600 Subject: [PATCH 32/96] Fixup use the correct job api --- .github/manifests/disk-usage-report-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 387aead0..ed2bcde0 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -1,4 +1,4 @@ -apiVersion: v1/batch +apiVersion: batch/v1 kind: Job metadata: name: disk-usage-report-job From 3ee9d9f1397550e7eba26c92412576bd895cd39b Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 09:27:49 -0600 Subject: [PATCH 33/96] Add namespace to pod retrieval --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 1c2692ad..0cbe18d4 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -85,7 +85,7 @@ jobs: - name: Retrieve generated report file run: | - POD_NAME=$(kubectl get pods --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') + POD_NAME=$(kubectl get pods -n jupyterhub --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub cat du_report.json continue-on-error: true From d7f81ba112fb9be0f039165c8e8cbf89b1d95666 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 10:38:43 -0600 
Subject: [PATCH 34/96] write directly to pv to test job --- .github/scripts/du.py | 19 +++++++++++-------- .github/workflows/report.yaml | 7 ------- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 0b2ceb72..1c3a4033 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -4,8 +4,9 @@ import subprocess import sys import json +from datetime import date -OUTPUT_FILE = "/output/du_report.json" +OUTPUT_DIR = "/home/asmacdo/du_reports/" SIZE_THRESHOLD_GB = 1 SIZE_THRESHOLD_BYTES = SIZE_THRESHOLD_GB * 1024 * 1024 * 1024 @@ -41,11 +42,12 @@ def prepare_report(directory): for user, data in report.items(): data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) - os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) - with open(OUTPUT_FILE, 'w') as f: - json.dump(report, f, indent=4) - - print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}") + # os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) + # output_file = + # with open(OUTPUT_FILE, 'w') as f: + # json.dump(report, f, indent=4) + # + # print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}") if __name__ == "__main__": @@ -55,7 +57,8 @@ def prepare_report(directory): path = sys.argv[1] directories = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))] - os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) - with open(OUTPUT_FILE, 'w') as f: + os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True) + current_date = date.today().strftime('%Y-%m-%d') + with open(f"OUTPUT_DIR/{current_date}.json", "w") as f: f.write("\n".join(directories)) # prepare_report(directory) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 0cbe18d4..acb9dc1e 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -83,13 +83,6 @@ jobs: kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub continue-on-error: true - - name: Retrieve generated report file - run: | - POD_NAME=$(kubectl get pods -n jupyterhub --selector=job-name=disk-usage-report-job -o jsonpath='{.items[0].metadata.name}') - kubectl cp $POD_NAME:/output/du_report.json du_report.json -n jupyterhub - cat du_report.json - continue-on-error: true - # continue-on-error for previous steps so we delete the job - name: Delete Disk Usage Report Job run: | From 0856baaa923dc18709ae78e4ab706570bfb9df90 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 10:47:13 -0600 Subject: [PATCH 35/96] fixup script fstring --- .github/scripts/du.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/du.py b/.github/scripts/du.py index 1c3a4033..05043ef6 100755 --- a/.github/scripts/du.py +++ b/.github/scripts/du.py @@ -59,6 +59,6 @@ def prepare_report(directory): os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True) current_date = date.today().strftime('%Y-%m-%d') - with open(f"OUTPUT_DIR/{current_date}.json", "w") as f: + with open(f"{OUTPUT_DIR}/{current_date}.json", "w") as f: f.write("\n".join(directories)) # prepare_report(directory) From 5301b1b6bce93f968525752b3ea1da19669a3e52 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 11 Nov 2024 10:47:37 -0600 Subject: [PATCH 36/96] no retry on failure, we were spinning up 5 pods, lets just fail 1 time --- .github/manifests/disk-usage-report-job.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/manifests/disk-usage-report-job.yaml 
b/.github/manifests/disk-usage-report-job.yaml
index ed2bcde0..b487280c 100644
--- a/.github/manifests/disk-usage-report-job.yaml
+++ b/.github/manifests/disk-usage-report-job.yaml
@@ -9,6 +9,7 @@ spec:
       labels:
         app: disk-usage-report
     spec:
+      backoffLimit: 0  # No retry on failure
       containers:
       - name: disk-usage-report
         image: IMAGE_PLACEHOLDER

From 738427405935694ecdf874b2c3ce8eeced6340c6 Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Mon, 11 Nov 2024 10:49:22 -0600
Subject: [PATCH 37/96] Fixup: backoffLimit goes on the Job spec, not the template

---
 .github/manifests/disk-usage-report-job.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml
index b487280c..adf966e0 100644
--- a/.github/manifests/disk-usage-report-job.yaml
+++ b/.github/manifests/disk-usage-report-job.yaml
@@ -4,12 +4,12 @@ metadata:
   name: disk-usage-report-job
   namespace: jupyterhub
 spec:
+  backoffLimit: 0  # No retry on failure
   template:
     metadata:
       labels:
         app: disk-usage-report
     spec:
-      backoffLimit: 0  # No retry on failure
       containers:
       - name: disk-usage-report
         image: IMAGE_PLACEHOLDER

From 8e81e381359fc74c98a7a695288c9939f12a4041 Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Mon, 11 Nov 2024 10:52:54 -0600
Subject: [PATCH 38/96] Initial report

---
 .github/manifests/disk-usage-report-job.yaml |  2 +-
 .github/scripts/du.py                        | 19 ++++++-------------
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml
index adf966e0..488735e6 100644
--- a/.github/manifests/disk-usage-report-job.yaml
+++ b/.github/manifests/disk-usage-report-job.yaml
@@ -14,7 +14,7 @@ spec:
       - name: disk-usage-report
         image: IMAGE_PLACEHOLDER
         args:
-          - "/home/asmacdo"
+          - "/home/"
         volumeMounts:
diff --git a/.github/scripts/du.py b/.github/scripts/du.py
index 05043ef6..e6267690 100755
--- a/.github/scripts/du.py
+++ b/.github/scripts/du.py
@@ -42,12 +42,11 @@ def prepare_report(directory):
     for user, data in report.items():
         data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"])
 
-    # os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
-    # output_file =
-    # with open(OUTPUT_FILE, 'w') as f:
-    #     json.dump(report, f, indent=4)
-    #
-    # print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}")
+    os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True)
+    current_date = date.today().strftime('%Y-%m-%d')
+    with open(f"{OUTPUT_DIR}/{current_date}.json", "w") as f:
+        json.dump(report, f, indent=4)
+    print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}")
 
 
 if __name__ == "__main__":
@@ -55,10 +54,4 @@ def prepare_report(directory):
         print("Usage: du.py ")
     else:
         path = sys.argv[1]
-        directories = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
-
-        os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True)
-        current_date = date.today().strftime('%Y-%m-%d')
-        with open(f"{OUTPUT_DIR}/{current_date}.json", "w") as f:
-            f.write("\n".join(directories))
-        # prepare_report(directory)
+        prepare_report(path)

From cb5db493dae21473dfd3c764b457a1826f988a29 Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Mon, 11 Nov 2024 12:35:01 -0600
Subject: [PATCH 39/96] disable report, see PR for comment

---
 .github/manifests/disk-usage-report-job.yaml |   2 +-
 .github/workflows/report.yaml                | 208 +++++++++----------
 2 files changed, 105 insertions(+), 105 deletions(-)

diff --git
a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml index 488735e6..161f1778 100644 --- a/.github/manifests/disk-usage-report-job.yaml +++ b/.github/manifests/disk-usage-report-job.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: disk-usage-report - image: IMAGE_PLACEHOLDER + image: dandiarchive/dandihub-report-generator:latest args: - "/home/" volumeMounts: diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index acb9dc1e..8424fd6c 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -1,104 +1,104 @@ -name: Generate Data Usage Report - -on: - pull_request: - branches: - - main - -jobs: - generate_data_usage_report: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v3 - - - name: Log in to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - - name: Build and push Docker image - uses: docker/build-push-action@v3 - with: - context: . - file: images/Dockerfile.dandihub_report_generator - push: true - tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest - - - name: Configure AWS credentials - uses: aws-actions/configure-aws-credentials@v3 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-2 - - - name: Assume ProvisioningRole - run: | - CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") - export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') - export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') - export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') - - - name: Configure kubectl with AWS EKS - run: | - aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} - - # TODO remove - - name: Sanity check - run: | - kubectl get pods -n jupyterhub - - # - name: Deploy Hello World Pod - # run: | - # kubectl apply -f .github/manifests/hello-world-pod.yaml - # - # - name: Wait for Hello World Pod to complete - # run: | - # kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes - # continue-on-error: true # Allow the workflow to continue even if this step fails - # - # - name: Get Hello World Pod logs - # run: | - # kubectl logs hello-world-pod - # if: ${{ success() }} # Only run this step if the previous step was successful - # - # - name: Delete Hello World Pod - # run: | - # kubectl delete pod hello-world-pod - # if: ${{ always() }} # Always run this step, even if other steps fail - # - - name: Replace image placeholder in manifest - run: | - sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml - - - name: Deploy Disk Usage Report Job - run: | - kubectl apply -f .github/manifests/disk-usage-report-job.yaml - - # TODO should timeout be longer? 
- - name: Wait for Disk Usage Report Job to complete - run: | - kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub - continue-on-error: true - - # continue-on-error for previous steps so we delete the job - - name: Delete Disk Usage Report Job - run: | - kubectl delete job disk-usage-report-job -n jupyterhub - - # - name: Clone dandi-hub-usage-reports repository - # run: | - # git clone https://github.com/dandi/dandi-hub-usage-reports.git - # - # - name: Copy report file to repository, commit and push report - # run: | - # cd dandi-hub-usage-reports - # DATE=$(date +'%Y-%m-%d') - # mv ../du_report.json $DATE_du_report.json - # git config --global user.name "GitHub Actions" - # git config --global user.email "actions@github.com" - # git add $DATE_du_report.json - # git commit -m "Add disk usage report for $DATE" - # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git +# name: Generate Data Usage Report +# +# on: +# pull_request: +# branches: +# - main +# +# jobs: +# generate_data_usage_report: +# runs-on: ubuntu-latest +# +# steps: +# - name: Checkout code +# uses: actions/checkout@v3 +# +# - name: Log in to DockerHub +# uses: docker/login-action@v2 +# with: +# username: ${{ secrets.DOCKERHUB_USERNAME }} +# password: ${{ secrets.DOCKERHUB_TOKEN }} +# +# - name: Build and push Docker image +# uses: docker/build-push-action@v3 +# with: +# context: . +# file: images/Dockerfile.dandihub_report_generator +# push: true +# tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest +# +# - name: Configure AWS credentials +# uses: aws-actions/configure-aws-credentials@v3 +# with: +# aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} +# aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# aws-region: us-east-2 +# +# - name: Assume ProvisioningRole +# run: | +# CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") +# export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') +# export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') +# export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') +# +# - name: Configure kubectl with AWS EKS +# run: | +# aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} +# +# # TODO remove +# - name: Sanity check +# run: | +# kubectl get pods -n jupyterhub +# +# # - name: Deploy Hello World Pod +# # run: | +# # kubectl apply -f .github/manifests/hello-world-pod.yaml +# # +# # - name: Wait for Hello World Pod to complete +# # run: | +# # kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes +# # continue-on-error: true # Allow the workflow to continue even if this step fails +# # +# # - name: Get Hello World Pod logs +# # run: | +# # kubectl logs hello-world-pod +# # if: ${{ success() }} # Only run this step if the previous step was successful +# # +# # - name: Delete Hello World Pod +# # run: | +# # kubectl delete pod hello-world-pod +# # if: ${{ always() }} # Always run this step, even if other steps fail +# # +# - name: Replace image placeholder in manifest +# run: | +# sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml +# +# - name: Deploy Disk Usage Report Job +# run: | +# kubectl apply -f .github/manifests/disk-usage-report-job.yaml +# +# # TODO 
should timeout be longer? +# - name: Wait for Disk Usage Report Job to complete +# run: | +# kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub +# continue-on-error: true +# +# # continue-on-error for previous steps so we delete the job +# - name: Delete Disk Usage Report Job +# run: | +# kubectl delete job disk-usage-report-job -n jupyterhub +# +# # - name: Clone dandi-hub-usage-reports repository +# # run: | +# # git clone https://github.com/dandi/dandi-hub-usage-reports.git +# # +# # - name: Copy report file to repository, commit and push report +# # run: | +# # cd dandi-hub-usage-reports +# # DATE=$(date +'%Y-%m-%d') +# # mv ../du_report.json $DATE_du_report.json +# # git config --global user.name "GitHub Actions" +# # git config --global user.email "actions@github.com" +# # git add $DATE_du_report.json +# # git commit -m "Add disk usage report for $DATE" +# # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git From 5d188a7b6fd317ccfcb40a5fbc68f9c59c6fcbb7 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 2 Dec 2024 11:45:45 -0600 Subject: [PATCH 40/96] deploy ec2 instance directly --- .github/workflows/report.yaml | 74 ++++++++++++++++++++++++++++++----- 1 file changed, 65 insertions(+), 9 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 8424fd6c..147aa65f 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -1,17 +1,73 @@ -# name: Generate Data Usage Report -# -# on: -# pull_request: -# branches: -# - main -# +--- +name: Generate Data Usage Report + +on: + pull_request: + branches: + - main + + +jobs: + generate-jobs-usage-report: + runs-on: ubuntu-latest + + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: us-east-2 + + - name: Launch EC2 Instance + id: launch_ec2 + run: | + INSTANCE_ID=$(aws ec2 run-instances \ + --image-id ami-0c02fb55956c7d316 \ + --count 1 \ + --instance-type t3.micro \ + --key-name dandihub-gh-actions \ + --security-group-ids sg-xxxxxxxx \ + --subnet-id subnet-xxxxxxxx \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=EC2dfTest}]" \ + --query 'Instances[0].InstanceId' --output text) + + echo "INSTANCE_ID=${INSTANCE_ID}" >> $GITHUB_ENV + + - name: Wait for EC2 to Initialize + run: | + aws ec2 wait instance-status-ok --instance-ids ${{ env.INSTANCE_ID }} + + - name: Retrieve EC2 Public IP + id: get_ip + run: | + PUBLIC_IP=$(aws ec2 describe-instances \ + --instance-ids ${{ env.INSTANCE_ID }} \ + --query 'Reservations[0].Instances[0].PublicIpAddress' --output text) + + echo "PUBLIC_IP=${PUBLIC_IP}" >> $GITHUB_ENV + + - name: Execute df Command on EC2 + uses: appleboy/ssh-action@v0.1.6 + with: + host: ${{ env.PUBLIC_IP }} + username: ec2-user + key: ${{ secrets.EC2_SSH_KEY }} + script: | + echo "Running df command on EC2 instance..." + df -h + echo "Command completed." 
+ + - name: Terminate EC2 Instance + run: | + aws ec2 terminate-instances --instance-ids ${{ env.INSTANCE_ID }} + aws ec2 wait instance-terminated --instance-ids ${{ env.INSTANCE_ID }} + # jobs: # generate_data_usage_report: # runs-on: ubuntu-latest # # steps: -# - name: Checkout code -# uses: actions/checkout@v3 # # - name: Log in to DockerHub # uses: docker/login-action@v2 From 2f39e9cded31875b74e7123e5fed3a9e99afbe89 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 2 Dec 2024 11:51:21 -0600 Subject: [PATCH 41/96] Update AMI image --- .github/workflows/report.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 147aa65f..b1983a00 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -23,7 +23,7 @@ jobs: id: launch_ec2 run: | INSTANCE_ID=$(aws ec2 run-instances \ - --image-id ami-0c02fb55956c7d316 \ + --image-id ami-088d38b423bff245f \ --count 1 \ --instance-type t3.micro \ --key-name dandihub-gh-actions \ From 3a211063fe3f149dc13489e4266360f95f73853b Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 2 Dec 2024 12:03:40 -0600 Subject: [PATCH 42/96] update sg and subnet --- .github/workflows/report.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index b1983a00..a9aa6a8b 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -27,9 +27,9 @@ jobs: --count 1 \ --instance-type t3.micro \ --key-name dandihub-gh-actions \ - --security-group-ids sg-xxxxxxxx \ - --subnet-id subnet-xxxxxxxx \ - --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=EC2dfTest}]" \ + --security-group-ids sg-0bf2dc1c2ff9c122e \ + --subnet-id subnet-0f544cca61ccd2804 \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \ --query 'Instances[0].InstanceId' --output text) echo "INSTANCE_ID=${INSTANCE_ID}" >> $GITHUB_ENV From 6a54da0fd0420b24541e87990efae9404c8bfa3b Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 2 Dec 2024 12:09:18 -0600 Subject: [PATCH 43/96] terminate even if job fails --- .github/workflows/report.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index a9aa6a8b..f796a187 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -57,6 +57,7 @@ jobs: echo "Running df command on EC2 instance..." df -h echo "Command completed." + continue-on-error: true # Allow the workflow to continue even if this step fails - name: Terminate EC2 Instance run: | From 87075fb463303dff8c96cb8b6172e03fef0331e2 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 2 Dec 2024 12:17:55 -0600 Subject: [PATCH 44/96] debug: print public ip --- .github/workflows/report.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index f796a187..41f917e8 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -45,6 +45,7 @@ jobs: --instance-ids ${{ env.INSTANCE_ID }} \ --query 'Reservations[0].Instances[0].PublicIpAddress' --output text) + echo "${PUBLIC_IP}" echo "PUBLIC_IP=${PUBLIC_IP}" >> $GITHUB_ENV - name: Execute df Command on EC2 @@ -57,7 +58,7 @@ jobs: echo "Running df command on EC2 instance..." df -h echo "Command completed." 
- continue-on-error: true # Allow the workflow to continue even if this step fails + # continue-on-error: true # Allow the workflow to continue even if this step fails - name: Terminate EC2 Instance run: | From 48c7f356528c71e0759edfe40bea909e4401c46b Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 2 Dec 2024 12:29:39 -0600 Subject: [PATCH 45/96] explicitly allocate public ip for ec2 instance --- .github/workflows/report.yaml | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 41f917e8..67f4f6ef 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -38,14 +38,28 @@ jobs: run: | aws ec2 wait instance-status-ok --instance-ids ${{ env.INSTANCE_ID }} - - name: Retrieve EC2 Public IP - id: get_ip + - name: Allocate Elastic IP + id: allocate_eip + run: | + ALLOC_ID=$(aws ec2 allocate-address --query 'AllocationId' --output text) + echo "ALLOC_ID=${ALLOC_ID}" >> $GITHUB_ENV + + - name: Associate Elastic IP with EC2 Instance + id: associate_eip run: | - PUBLIC_IP=$(aws ec2 describe-instances \ - --instance-ids ${{ env.INSTANCE_ID }} \ - --query 'Reservations[0].Instances[0].PublicIpAddress' --output text) + EIP=$(aws ec2 associate-address \ + --instance-id ${{ env.INSTANCE_ID }} \ + --allocation-id ${{ env.ALLOC_ID }} \ + --query 'AssociationId' --output text) + echo "EIP=${EIP}" >> $GITHUB_ENV - echo "${PUBLIC_IP}" + - name: Retrieve Elastic IP Address + id: get_ip + run: | + PUBLIC_IP=$(aws ec2 describe-addresses \ + --allocation-ids ${{ env.ALLOC_ID }} \ + --query 'Addresses[0].PublicIp' --output text) + echo "PUBLIC_IP=${PUBLIC_IP}" echo "PUBLIC_IP=${PUBLIC_IP}" >> $GITHUB_ENV - name: Execute df Command on EC2 @@ -58,12 +72,19 @@ jobs: echo "Running df command on EC2 instance..." df -h echo "Command completed." 
- # continue-on-error: true # Allow the workflow to continue even if this step fails + continue-on-error: true # Allow the workflow to continue even if this step fails + - name: Terminate EC2 Instance run: | aws ec2 terminate-instances --instance-ids ${{ env.INSTANCE_ID }} aws ec2 wait instance-terminated --instance-ids ${{ env.INSTANCE_ID }} + continue-on-error: true # Allow the workflow to continue even if this step fails + + - name: Release Elastic IP + run: | + aws ec2 release-address --allocation-id ${{ env.ALLOC_ID }} + continue-on-error: true # Allow the workflow to continue even if this step fails # jobs: # generate_data_usage_report: From 743359e824acb4cd6835ac2eec569bc6a0492656 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 10:31:45 -0600 Subject: [PATCH 46/96] Add WIP scripts --- .github/scripts/create-file-index.py | 66 ++++++++++++++++++++++++++++ .github/scripts/produce-report.py | 66 ++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100755 .github/scripts/create-file-index.py create mode 100755 .github/scripts/produce-report.py diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py new file mode 100755 index 00000000..77fe0cd0 --- /dev/null +++ b/.github/scripts/create-file-index.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +import time +import json +import sys +import gzip +from datetime import datetime + +def list_files_with_metadata(directory, output_file): + # Record the start time + start_time = time.time() + + # Get the current date and time for indexing + index_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + files_metadata = [] + + for root, dirs, files in os.walk(directory): + for name in files: + filepath = os.path.join(root, name) + relative_path = os.path.relpath(filepath, directory) + + try: + metadata = { + "path": relative_path, + "size": os.path.getsize(filepath), + "modified": time.ctime(os.path.getmtime(filepath)), + "created": time.ctime(os.path.getctime(filepath)) + } + files_metadata.append(metadata) + except (FileNotFoundError, PermissionError) as e: + print(f"Skipping {filepath}: {e}") + + # Record the end time and calculate the duration + end_time = time.time() + duration = end_time - start_time + + # Prepare the output data with additional metadata + output_data = { + "index_timestamp": index_timestamp, + "duration_seconds": duration, + "files": files_metadata + } + + # Compress and write the output data to a .json.gz file + with gzip.open(output_file, "wt", encoding="utf-8") as gz_file: + json.dump(output_data, gz_file, indent=4) + + print(f"Indexing completed. 
Compressed results written to {output_file}") + +# Ensure the script is called with the required arguments +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: python script.py ") + sys.exit(1) + + directory_to_index = sys.argv[1] + output_json_gz_file = sys.argv[2] + + # Ensure the output filename ends with .json.gz for clarity + if not output_json_gz_file.endswith(".json.gz"): + output_json_gz_file += ".json.gz" + + list_files_with_metadata(directory_to_index, output_json_gz_file) + diff --git a/.github/scripts/produce-report.py b/.github/scripts/produce-report.py new file mode 100755 index 00000000..49debfe0 --- /dev/null +++ b/.github/scripts/produce-report.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +import gzip +import json +import sys +from collections import defaultdict + +USER_QUOTA = 8_000_000_000 + +# TODO trash files + +def generate_statistics(input_file): + # Load the JSON data from the compressed file + with gzip.open(input_file, 'rt', encoding='utf-8') as gz_file: + data = json.load(gz_file) + + # Dictionary to hold statistics per leading directory + stats = { + "directories": defaultdict(lambda: {"total_size": 0, "file_count": 0}), + "total": {"total_size": 0, "file_count": 0}, + } + + + # Process each file's metadata + for file_metadata in data["files"]: + # Get the leading directory (first part of the relative path) + leading_dir = file_metadata["path"].split(os.sep)[0] + # TODO trash files + # if file_metadata["path"] matches trashglob + # stats["caches"][""] increment file and totalsize count + # Update statistics for this leading directory + stats["directories"][leading_dir]["file_count"] += 1 + stats["total"]["file_count"] += 1 + stats["directories"][leading_dir]["total_size"] += file_metadata["size"] + stats["total"]["total_size"] += file_metadata["size"] + return stats + +def bytes_to_human_readable(size_in_bytes): + for unit in ['B', 'KB', 'MB', 'GB', 'TB']: + if size_in_bytes < 1024: + return f"{size_in_bytes:.2f} {unit}" + size_in_bytes /= 1024 + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python script.py ") + sys.exit(1) + + input_json_gz_file = sys.argv[1] + username = input_json_gz_file.split(".")[0] + stats = generate_statistics(input_json_gz_file) + human_readable_total = bytes_to_human_readable(stats["total"]["total_size"]) + + if stats["total"]["total_size"] < USER_QUOTA: + print(f"All ok, user {username} is below quota, consuming {human_readable_total}") + sys.exit(0) + + human_readable_quota = bytes_to_human_readable(USER_QUOTA) + with open(f"{username}-usage-report.txt", "w") as report: + report.write(f"Total usage: {human_readable_total} exceeds quota amount: {human_readable_quota}\n\n") + for directory, stat in stats["directories"].items(): + report.write(f"Directory: {directory}\n") + report.write(f" Total files: {stat['file_count']}\n") + report.write(f" Total size: {bytes_to_human_readable(stat['total_size'])}\n") + report.write("\n") From 0ba12f2ac32bbee17cf6d1758bb780be79e261e5 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 10:32:04 -0600 Subject: [PATCH 47/96] rm old unused --- .github/scripts/du.py | 57 ------------------------------------------- 1 file changed, 57 deletions(-) delete mode 100755 .github/scripts/du.py diff --git a/.github/scripts/du.py b/.github/scripts/du.py deleted file mode 100755 index e6267690..00000000 --- a/.github/scripts/du.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python3 - -import os -import subprocess -import sys -import json -from 
datetime import date - -OUTPUT_DIR = "/home/asmacdo/du_reports/" -SIZE_THRESHOLD_GB = 1 -SIZE_THRESHOLD_BYTES = SIZE_THRESHOLD_GB * 1024 * 1024 * 1024 - -# Function to calculate disk usage of a directory in bytes -def get_disk_usage_bytes(path): - result = subprocess.run(['du', '-sb', path], capture_output=True, text=True) - size_str = result.stdout.split()[0] # Get the size in bytes (du -sb gives size in bytes) - return int(size_str) - -# Function to convert bytes to a human-readable format (e.g., KB, MB, GB) -def bytes_to_human_readable(size_in_bytes): - for unit in ['B', 'KB', 'MB', 'GB', 'TB']: - if size_in_bytes < 1024: - return f"{size_in_bytes:.2f} {unit}" - size_in_bytes /= 1024 - -def prepare_report(directory): - report = {} - # List user home dirs in the directory and calculate disk usage - for user_dir in os.listdir(directory): - user_path = os.path.join(directory, user_dir) - if os.path.isdir(user_path): - disk_usage_bytes = get_disk_usage_bytes(user_path) - report[user_dir] = { - "disk_usage_bytes": disk_usage_bytes - } - if disk_usage_bytes > SIZE_THRESHOLD_BYTES: - # TODO: Placeholder for other actions - report[user_dir]["action"] = f"Directory size exceeds {SIZE_THRESHOLD_BYTES / (1024**3):.2f}GB, further action taken." - else: - report[user_dir]["action"] = "No action required." - - for user, data in report.items(): - data["disk_usage_human_readable"] = bytes_to_human_readable(data["disk_usage_bytes"]) - - os.makedirs(os.path.dirname(OUTPUT_DIR), exist_ok=True) - current_date = date.today().strftime('%Y-%m-%d') - with open(f"{OUTPUT_DIR}/{current_date}.json", "w") as f: - json.dump(report, f, indent=4) - print(f"Disk usage report generated at {os.path.abspath(OUTPUT_FILE)}") - - -if __name__ == "__main__": - if len(sys.argv) != 2: - print("Usage: du.py ") - else: - path = sys.argv[1] - prepare_report(path) From 2893ab26d049a1ea704bd902ef6a7856948e02d9 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 11:02:41 -0600 Subject: [PATCH 48/96] initial commit of scripts --- .github/scripts/cleanup-ec2.sh | 4 ++++ .github/scripts/launch-ec2.sh | 43 ++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 .github/scripts/cleanup-ec2.sh create mode 100644 .github/scripts/launch-ec2.sh diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh new file mode 100644 index 00000000..aa54cb6c --- /dev/null +++ b/.github/scripts/cleanup-ec2.sh @@ -0,0 +1,4 @@ +aws ec2 terminate-instances --instance-ids $INSTANCE_ID +aws ec2 wait instance-terminated --instance-ids $INSTANCE_ID + +aws ec2 release-address --allocation-id $ALLOC_ID diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh new file mode 100644 index 00000000..54ac6e75 --- /dev/null +++ b/.github/scripts/launch-ec2.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + + +# TODO Test for aws access +# Set env vars +# aws-region +# EC2_SSH_KEY +# +export INSTANCE_ID=$(aws ec2 run-instances \ + --image-id ami-088d38b423bff245f \ + --count 1 \ + --instance-type t3.micro \ + --key-name dandihub-gh-actions \ + --security-group-ids sg-0bf2dc1c2ff9c122e \ + --subnet-id subnet-0f544cca61ccd2804 \ + --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \ + --query 'Instances[0].InstanceId' --output text) + +aws ec2 wait instance-status-ok --instance-ids $INSTANCE_ID + +# allocate elastic (static) IP +export ALLOC_ID=$(aws ec2 allocate-address --query 'AllocationId' --output text) + +export EIP=$(aws ec2 associate-address \ + 
--instance-id $INSTANCE_ID \ + --allocation-id $ALLOC_ID \ + --query 'AssociationId' --output text) + +export PUBLIC_IP=$(aws ec2 describe-addresses \ + --allocation-ids $ALLOC_ID \ + --query 'Addresses[0].PublicIp' --output text) + +# Test: execute df Command on EC2 + # uses: appleboy/ssh-action@v0.1.6 + # with: + # host: ${{ env.PUBLIC_IP }} + # username: ec2-user + # key: ${{ secrets.EC2_SSH_KEY }} + # script: | + # echo "Running df command on EC2 instance..." + # df -h + # echo "Command completed." + # continue-on-error: true # Allow the workflow to continue even if this step fails From 5ef8f801a160225ff35c9df047f8a6d7fec542be Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 11:04:15 -0600 Subject: [PATCH 49/96] clean up launch script --- .github/scripts/launch-ec2.sh | 81 ++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 26 deletions(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 54ac6e75..669477d5 100644 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -1,43 +1,72 @@ #!/usr/bin/env bash +# Check for AWS CLI and credentials +if ! command -v aws &>/dev/null; then + echo "Error: AWS CLI is not installed. Please install it and configure your credentials." + exit 1 +fi -# TODO Test for aws access -# Set env vars -# aws-region -# EC2_SSH_KEY -# +if ! aws sts get-caller-identity &>/dev/null; then + echo "Error: Unable to access AWS. Ensure your credentials are configured correctly." + exit 1 +fi + +# Set variables +AWS_REGION="us-east-2" # Update to your AWS region if different +KEY_NAME="dandihub-gh-actions" +SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e" +SUBNET_ID="subnet-0f544cca61ccd2804" +AMI_ID="ami-088d38b423bff245f" + +# Run EC2 instance +echo "Launching EC2 instance..." export INSTANCE_ID=$(aws ec2 run-instances \ - --image-id ami-088d38b423bff245f \ + --image-id $AMI_ID \ --count 1 \ --instance-type t3.micro \ - --key-name dandihub-gh-actions \ - --security-group-ids sg-0bf2dc1c2ff9c122e \ - --subnet-id subnet-0f544cca61ccd2804 \ + --key-name $KEY_NAME \ + --security-group-ids $SECURITY_GROUP_ID \ + --subnet-id $SUBNET_ID \ --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \ - --query 'Instances[0].InstanceId' --output text) + --query 'Instances[0].InstanceId' \ + --output text) + +if [ -z "$INSTANCE_ID" ]; then + echo "Error: Failed to launch EC2 instance." + exit 1 +fi + +echo "Instance ID: $INSTANCE_ID" +# Wait for instance to initialize +echo "Waiting for instance to reach status OK..." aws ec2 wait instance-status-ok --instance-ids $INSTANCE_ID -# allocate elastic (static) IP +# Allocate Elastic IP +echo "Allocating Elastic IP..." export ALLOC_ID=$(aws ec2 allocate-address --query 'AllocationId' --output text) -export EIP=$(aws ec2 associate-address \ +# Associate Elastic IP with instance +echo "Associating Elastic IP with instance..." +export EIP_ASSOC=$(aws ec2 associate-address \ --instance-id $INSTANCE_ID \ --allocation-id $ALLOC_ID \ - --query 'AssociationId' --output text) + --query 'AssociationId' \ + --output text) +if [ -z "$EIP_ASSOC" ]; then + echo "Error: Failed to associate Elastic IP." 
+  exit 1
+fi
+
+# Get Elastic IP address
 export PUBLIC_IP=$(aws ec2 describe-addresses \
   --allocation-ids $ALLOC_ID \
-  --query 'Addresses[0].PublicIp' --output text)
-
-# Test: execute df Command on EC2
-  # uses: appleboy/ssh-action@v0.1.6
-  # with:
-  # host: ${{ env.PUBLIC_IP }}
-  # username: ec2-user
-  # key: ${{ secrets.EC2_SSH_KEY }}
-  # script: |
-  # echo "Running df command on EC2 instance..."
-  # df -h
-  # echo "Command completed."
-  # continue-on-error: true # Allow the workflow to continue even if this step fails
+  --query 'Addresses[0].PublicIp' \
+  --output text)
+
+echo "Elastic IP Address: $PUBLIC_IP"
+
+# Output SSH command for convenience
+echo "To connect to your instance, use:"
+echo "ssh -i \$EC2_SSH_KEY ec2-user@$PUBLIC_IP"

From b02720eabb73fb93f7779a8694bf9d9c1c9e5cde Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Fri, 6 Dec 2024 11:04:58 -0600
Subject: [PATCH 50/96] make scripts executable

---
 .github/scripts/cleanup-ec2.sh | 0
 .github/scripts/launch-ec2.sh  | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 .github/scripts/cleanup-ec2.sh
 mode change 100644 => 100755 .github/scripts/launch-ec2.sh

diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh
old mode 100644
new mode 100755
diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh
old mode 100644
new mode 100755

From ae98909501c3c69e6fb4f1b345194913569d9656 Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Fri, 6 Dec 2024 11:07:10 -0600
Subject: [PATCH 51/96] fixup cleanup script

---
 .github/scripts/cleanup-ec2.sh | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh
index aa54cb6c..8537af57 100755
--- a/.github/scripts/cleanup-ec2.sh
+++ b/.github/scripts/cleanup-ec2.sh
@@ -1,4 +1,36 @@
+#!/usr/bin/env bash
+
+# Ensure required environment variables are set
+if [ -z "$INSTANCE_ID" ]; then
+  echo "Error: INSTANCE_ID is not set. Cannot proceed with cleanup."
+  exit 1
+fi
+
+if [ -z "$ALLOC_ID" ]; then
+  echo "Error: ALLOC_ID is not set. Cannot proceed with cleanup."
+  exit 1
+fi
+
+# Terminate EC2 instance
+echo "Terminating EC2 instance with ID: $INSTANCE_ID"
 aws ec2 terminate-instances --instance-ids $INSTANCE_ID
-aws ec2 wait instance-terminated --instance-ids $INSTANCE_ID
+if [ $? -eq 0 ]; then
+  echo "Instance termination initiated. Waiting for the instance to terminate..."
+  aws ec2 wait instance-terminated --instance-ids $INSTANCE_ID
+  echo "Instance $INSTANCE_ID has been terminated."
+else
+  echo "Error: Failed to terminate instance $INSTANCE_ID."
+  exit 1
+fi
 
+# Release Elastic IP
+echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID"
 aws ec2 release-address --allocation-id $ALLOC_ID
+if [ $? -eq 0 ]; then
+  echo "Elastic IP with Allocation ID $ALLOC_ID has been released."
+else
+  echo "Error: Failed to release Elastic IP with Allocation ID $ALLOC_ID."
+  exit 1
+fi
+
+echo "Cleanup complete."
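
Note on how patches 48-51 fit together: launch-ec2.sh exports INSTANCE_ID, ALLOC_ID, and PUBLIC_IP, and cleanup-ec2.sh refuses to run unless the first two are set, so at this point in the series both scripts are meant to be sourced from the same shell session. A minimal session sketch, assuming AWS credentials are already configured and that EC2_SSH_KEY holds the path to the private key for the dandihub-gh-actions key pair (the key path below is a hypothetical example, and the df workload is only a placeholder):

    export EC2_SSH_KEY="$HOME/.ssh/dandihub-gh-actions.pem"  # hypothetical key location
    source .github/scripts/launch-ec2.sh     # exports INSTANCE_ID, ALLOC_ID, PUBLIC_IP
    ssh -i "$EC2_SSH_KEY" ec2-user@"$PUBLIC_IP" 'df -h'      # placeholder workload
    source .github/scripts/cleanup-ec2.sh    # terminates the instance, releases the Elastic IP
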
From 7e80e4a9614705fa13369e4a453a3e830ce61665 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 11:13:55 -0600 Subject: [PATCH 52/96] add a name to elastic ip (for easier manual cleanup) --- .github/scripts/launch-ec2.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 669477d5..e4b2001f 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -44,7 +44,17 @@ aws ec2 wait instance-status-ok --instance-ids $INSTANCE_ID # Allocate Elastic IP echo "Allocating Elastic IP..." -export ALLOC_ID=$(aws ec2 allocate-address --query 'AllocationId' --output text) +export ALLOC_ID=$(aws ec2 allocate-address \ + --tag-specifications "ResourceType=elastic-ip,Tags=[{Key=Name,Value=dandihub-gh-actions-eip}]" \ + --query 'AllocationId' \ + --output text) + +if [ -z "$ALLOC_ID" ]; then + echo "Error: Failed to allocate Elastic IP." + exit 1 +fi + +echo "Elastic IP Allocation ID: $ALLOC_ID" # Associate Elastic IP with instance echo "Associating Elastic IP with instance..." From f2a41164a654838f548c85f11ce5980cb88a99f6 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 11:21:59 -0600 Subject: [PATCH 53/96] Exit on fail --- .github/scripts/launch-ec2.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index e4b2001f..0df9bed0 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -e + # Check for AWS CLI and credentials if ! command -v aws &>/dev/null; then echo "Error: AWS CLI is not installed. Please install it and configure your credentials." From 6ffef17d05e3cec27c6d2a1804b74d6a35a0d9cc Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 11:25:41 -0600 Subject: [PATCH 54/96] Add permission for aws ec2 wait instance-status-ok --- .aws/terraform-jupyterhub-provisioning-policies.json | 1 + 1 file changed, 1 insertion(+) diff --git a/.aws/terraform-jupyterhub-provisioning-policies.json b/.aws/terraform-jupyterhub-provisioning-policies.json index e0a6cafb..4c6b8ed5 100644 --- a/.aws/terraform-jupyterhub-provisioning-policies.json +++ b/.aws/terraform-jupyterhub-provisioning-policies.json @@ -42,6 +42,7 @@ "ec2:DescribeAddresses", "ec2:DescribeAddressesAttribute", "ec2:DescribeAvailabilityZones", + "ec2:DescribeInstanceStatus", "ec2:DescribeInternetGateways", "ec2:DescribeLaunchTemplateVersions", "ec2:DescribeLaunchTemplates", From 20cc085aa586e7f78215602fd71ab36bea1945d7 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 11:40:11 -0600 Subject: [PATCH 55/96] Upload scripts to instance --- .github/scripts/launch-ec2.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 0df9bed0..066905d6 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -13,12 +13,15 @@ if ! aws sts get-caller-identity &>/dev/null; then exit 1 fi + # Set variables -AWS_REGION="us-east-2" # Update to your AWS region if different +AWS_REGION="us-east-2" KEY_NAME="dandihub-gh-actions" SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e" SUBNET_ID="subnet-0f544cca61ccd2804" AMI_ID="ami-088d38b423bff245f" +LOCAL_SCRIPTS_DIR=".github/scripts" +REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts" # Run EC2 instance echo "Launching EC2 instance..." 
@@ -79,6 +82,13 @@ export PUBLIC_IP=$(aws ec2 describe-addresses \ echo "Elastic IP Address: $PUBLIC_IP" + +# Upload scripts to EC2 instance +echo "Uploading scripts to EC2 instance..." +scp -i $EC2_SSH_KEY -o "StrictHostKeyChecking=no" \ + $LOCAL_SCRIPTS_DIR/produce-report.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ + ec2-user@$PUBLIC_IP:$REMOTE_SCRIPTS_DIR/ + # Output SSH command for convenience echo "To connect to your instance, use:" echo "ssh -i \$EC2_SSH_KEY ec2-user@$PUBLIC_IP" From 76477df6d8243c3bf164baa879273b8f717ba2e3 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 6 Dec 2024 14:03:33 -0600 Subject: [PATCH 56/96] explicitly return --- .github/scripts/launch-ec2.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 066905d6..99f5b334 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -92,3 +92,4 @@ scp -i $EC2_SSH_KEY -o "StrictHostKeyChecking=no" \ # Output SSH command for convenience echo "To connect to your instance, use:" echo "ssh -i \$EC2_SSH_KEY ec2-user@$PUBLIC_IP" +return 0 From b38ded1cd8c47c5c62d733a6ddd09e1e3f3a2d75 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 11 Dec 2024 11:09:18 -0600 Subject: [PATCH 57/96] output session variables to file - robust if session is lost - allow direct execution rather than sourcing --- .github/scripts/launch-ec2.sh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 99f5b334..77fec0a1 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -13,7 +13,6 @@ if ! aws sts get-caller-identity &>/dev/null; then exit 1 fi - # Set variables AWS_REGION="us-east-2" KEY_NAME="dandihub-gh-actions" @@ -22,6 +21,11 @@ SUBNET_ID="subnet-0f544cca61ccd2804" AMI_ID="ami-088d38b423bff245f" LOCAL_SCRIPTS_DIR=".github/scripts" REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts" +ENV_FILE=".ec2-session.env" + +# Ensure the environment file is writable +echo "# Environment variables for EC2 session" > $ENV_FILE +echo "# Auto-generated by launch script on $(date)" >> $ENV_FILE # Run EC2 instance echo "Launching EC2 instance..." @@ -40,8 +44,8 @@ if [ -z "$INSTANCE_ID" ]; then echo "Error: Failed to launch EC2 instance." exit 1 fi - echo "Instance ID: $INSTANCE_ID" +echo "export INSTANCE_ID=$INSTANCE_ID" >> $ENV_FILE # Wait for instance to initialize echo "Waiting for instance to reach status OK..." @@ -58,8 +62,8 @@ if [ -z "$ALLOC_ID" ]; then echo "Error: Failed to allocate Elastic IP." exit 1 fi - echo "Elastic IP Allocation ID: $ALLOC_ID" +echo "export ALLOC_ID=$ALLOC_ID" >> $ENV_FILE # Associate Elastic IP with instance echo "Associating Elastic IP with instance..." @@ -81,7 +85,7 @@ export PUBLIC_IP=$(aws ec2 describe-addresses \ --output text) echo "Elastic IP Address: $PUBLIC_IP" - +echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE # Upload scripts to EC2 instance echo "Uploading scripts to EC2 instance..." @@ -89,7 +93,16 @@ scp -i $EC2_SSH_KEY -o "StrictHostKeyChecking=no" \ $LOCAL_SCRIPTS_DIR/produce-report.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ ec2-user@$PUBLIC_IP:$REMOTE_SCRIPTS_DIR/ +if [ $? -eq 0 ]; then + echo "Scripts uploaded successfully to $REMOTE_SCRIPTS_DIR on the instance." +else + echo "Error: Failed to upload scripts to the instance." 
+ exit 1 +fi + # Output SSH command for convenience echo "To connect to your instance, use:" echo "ssh -i \$EC2_SSH_KEY ec2-user@$PUBLIC_IP" -return 0 + +echo "Environment variables saved to $ENV_FILE." +echo "Run 'source $ENV_FILE' to restore the environment variables." From f795570cc6e00217b8158d21e7d9ae6e8e1cb5e1 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 11 Dec 2024 11:12:59 -0600 Subject: [PATCH 58/96] modify cleanup script to retrieve instance from temporary file --- .github/scripts/cleanup-ec2.sh | 53 +++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh index 8537af57..e8f732b8 100755 --- a/.github/scripts/cleanup-ec2.sh +++ b/.github/scripts/cleanup-ec2.sh @@ -1,5 +1,16 @@ #!/usr/bin/env bash +set -e + +# Load environment variables from the file if they are not already set +ENV_FILE=".ec2-session.env" +if [ -f "$ENV_FILE" ]; then + echo "Loading environment variables from $ENV_FILE..." + source "$ENV_FILE" +else + echo "Warning: Environment file $ENV_FILE not found." +fi + # Ensure required environment variables are set if [ -z "$INSTANCE_ID" ]; then echo "Error: INSTANCE_ID is not set. Cannot proceed with cleanup." @@ -11,26 +22,42 @@ if [ -z "$ALLOC_ID" ]; then exit 1 fi +# Check for AWS CLI and credentials +if ! command -v aws &>/dev/null; then + echo "Error: AWS CLI is not installed. Please install it and configure your credentials." + exit 1 +fi + +if ! aws sts get-caller-identity &>/dev/null; then + echo "Error: Unable to access AWS. Ensure your credentials are configured correctly." + exit 1 +fi + # Terminate EC2 instance -echo "Terminating EC2 instance with ID: $INSTANCE_ID" -aws ec2 terminate-instances --instance-ids $INSTANCE_ID -if [ $? -eq 0 ]; then +echo "Terminating EC2 instance with ID: $INSTANCE_ID..." +if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID"; then echo "Instance termination initiated. Waiting for the instance to terminate..." - aws ec2 wait instance-terminated --instance-ids $INSTANCE_ID - echo "Instance $INSTANCE_ID has been terminated." + if aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID"; then + echo "Instance $INSTANCE_ID has been successfully terminated." + else + echo "Warning: Instance $INSTANCE_ID may not have terminated correctly." + fi else - echo "Error: Failed to terminate instance $INSTANCE_ID." - exit 1 + echo "Warning: Failed to terminate instance $INSTANCE_ID. It may already be terminated." fi # Release Elastic IP -echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID" -aws ec2 release-address --allocation-id $ALLOC_ID -if [ $? -eq 0 ]; then - echo "Elastic IP with Allocation ID $ALLOC_ID has been released." +echo "Releasing Elastic IP with Allocation ID: $ALLOC_ID..." +if aws ec2 release-address --allocation-id "$ALLOC_ID"; then + echo "Elastic IP with Allocation ID $ALLOC_ID has been successfully released." else - echo "Error: Failed to release Elastic IP with Allocation ID $ALLOC_ID." - exit 1 + echo "Warning: Failed to release Elastic IP with Allocation ID $ALLOC_ID. It may already be released." +fi + +# Cleanup environment file +if [ -f "$ENV_FILE" ]; then + echo "Removing environment file $ENV_FILE..." + rm -f "$ENV_FILE" fi echo "Cleanup complete." 
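
Note on patches 57-58: session state now round-trips through .ec2-session.env, so neither script has to be sourced any longer. launch-ec2.sh writes one export line per variable, and cleanup-ec2.sh sources the file itself and removes it when it finishes. A sketch of the handoff the two scripts now assume, with an illustrative instance ID (the real values are whatever the launch run produced):

    ./.github/scripts/launch-ec2.sh     # writes .ec2-session.env in the working directory
    cat .ec2-session.env                # e.g. "export INSTANCE_ID=i-0123456789abcdef0" ...
    source .ec2-session.env             # optional: pull PUBLIC_IP etc. into this shell
    ./.github/scripts/cleanup-ec2.sh    # re-reads .ec2-session.env, cleans up, removes it
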
From f8a92b2d64909f4ff6e93b6f36c5c1791e7c5588 Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Wed, 11 Dec 2024 11:28:07 -0600
Subject: [PATCH 59/96] All ec2 permissions granted

We already give most permissions, and occasionally need to add more.
---
 ...form-jupyterhub-provisioning-policies.json | 65 +------------------
 1 file changed, 1 insertion(+), 64 deletions(-)

diff --git a/.aws/terraform-jupyterhub-provisioning-policies.json b/.aws/terraform-jupyterhub-provisioning-policies.json
index 4c6b8ed5..35103551 100644
--- a/.aws/terraform-jupyterhub-provisioning-policies.json
+++ b/.aws/terraform-jupyterhub-provisioning-policies.json
@@ -4,70 +4,7 @@
     {
       "Effect": "Allow",
       "Action": [
-        "ec2:AllocateAddress",
-        "ec2:AssociateAddress",
-        "ec2:AssociateRouteTable",
-        "ec2:AssociateVpcCidrBlock",
-        "ec2:AttachInternetGateway",
-        "ec2:AttachNetworkInterface",
-        "ec2:AuthorizeSecurityGroupEgress",
-        "ec2:AuthorizeSecurityGroupIngress",
-        "ec2:CreateInternetGateway",
-        "ec2:CreateLaunchTemplate",
-        "ec2:CreateLaunchTemplateVersion",
-        "ec2:CreateNatGateway",
-        "ec2:CreateNetworkAcl",
-        "ec2:CreateNetworkAclEntry",
-        "ec2:CreateNetworkInterface",
-        "ec2:CreateNetworkInterfacePermission",
-        "ec2:CreateRoute",
-        "ec2:CreateRouteTable",
-        "ec2:CreateSecurityGroup",
-        "ec2:CreateSubnet",
-        "ec2:CreateTags",
-        "ec2:CreateVpc",
-        "ec2:DeleteInternetGateway",
-        "ec2:DeleteLaunchTemplate",
-        "ec2:DeleteLaunchTemplateVersions",
-        "ec2:DeleteNatGateway",
-        "ec2:DeleteNetworkAcl",
-        "ec2:DeleteNetworkAclEntry",
-        "ec2:DeleteNetworkInterface",
-        "ec2:DeleteRoute",
-        "ec2:DeleteRouteTable",
-        "ec2:DeleteSecurityGroup",
-        "ec2:DeleteSubnet",
-        "ec2:DeleteTags",
-        "ec2:DeleteVpc",
-        "ec2:DescribeAddresses",
-        "ec2:DescribeAddressesAttribute",
-        "ec2:DescribeAvailabilityZones",
-        "ec2:DescribeInstanceStatus",
-        "ec2:DescribeInternetGateways",
-        "ec2:DescribeLaunchTemplateVersions",
-        "ec2:DescribeLaunchTemplates",
-        "ec2:DescribeNatGateways",
-        "ec2:DescribeNetworkAcls",
-        "ec2:DescribeNetworkInterfacePermissions",
-        "ec2:DescribeNetworkInterfaces",
-        "ec2:DescribeRouteTables",
-        "ec2:DescribeSecurityGroupRules",
-        "ec2:DescribeSecurityGroups",
-        "ec2:DescribeSubnets",
-        "ec2:DescribeVpcAttribute",
-        "ec2:DescribeVpcs",
-        "ec2:DetachInternetGateway",
-        "ec2:DetachNetworkInterface",
-        "ec2:DisassociateAddress",
-        "ec2:DisassociateRouteTable",
-        "ec2:DisassociateVpcCidrBlock",
-        "ec2:ModifyNetworkInterfaceAttribute",
-        "ec2:ModifyVpcAttribute",
-        "ec2:ReleaseAddress",
-        "ec2:ReplaceRoute",
-        "ec2:RevokeSecurityGroupEgress",
-        "ec2:RevokeSecurityGroupIngress",
-        "ec2:RunInstances",
+        "ec2:*",
         "ecr-public:GetAuthorizationToken",
         "eks:*",
         "elasticfilesystem:CreateFileSystem",

From e9726df2c31773344dc48e26c9b0318dfc8be402 Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Wed, 11 Dec 2024 12:08:36 -0600
Subject: [PATCH 60/96] Add EFS mount (hardcoded)

---
 .github/scripts/launch-ec2.sh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh
index 77fec0a1..0673a5ab 100755
--- a/.github/scripts/launch-ec2.sh
+++ b/.github/scripts/launch-ec2.sh
@@ -19,8 +19,10 @@ KEY_NAME="dandihub-gh-actions"
 SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e"
 SUBNET_ID="subnet-0f544cca61ccd2804"
 AMI_ID="ami-088d38b423bff245f"
+EFS_ID="fs-02aac16c4c6c2dc27"
 LOCAL_SCRIPTS_DIR=".github/scripts"
 REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts"
+MOUNT_POINT="/mnt/efs"
 ENV_FILE=".ec2-session.env"
 
 # Ensure the environment file is writable
@@ -100,6 +102,16 @@ else
   exit 1
 fi
 
+# Mount
EFS on the EC2 instance +echo "Mounting EFS on the EC2 instance..." +ssh -i $EC2_SSH_KEY -o "StrictHostKeyChecking=no" ec2-user@$PUBLIC_IP < Date: Wed, 11 Dec 2024 12:16:28 -0600 Subject: [PATCH 61/96] No pager for termination --- .github/scripts/cleanup-ec2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/cleanup-ec2.sh b/.github/scripts/cleanup-ec2.sh index e8f732b8..a790d927 100755 --- a/.github/scripts/cleanup-ec2.sh +++ b/.github/scripts/cleanup-ec2.sh @@ -35,7 +35,7 @@ fi # Terminate EC2 instance echo "Terminating EC2 instance with ID: $INSTANCE_ID..." -if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID"; then +if aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" --no-cli-pager; then echo "Instance termination initiated. Waiting for the instance to terminate..." if aws ec2 wait instance-terminated --instance-ids "$INSTANCE_ID"; then echo "Instance $INSTANCE_ID has been successfully terminated." From 17d77cdedde45fe44553626c2ae820cc5b3ad2f1 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 11 Dec 2024 12:21:07 -0600 Subject: [PATCH 62/96] force pseudo-terminal, otherwise hangs after yum install --- .github/scripts/launch-ec2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 0673a5ab..a21c9c4d 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -91,7 +91,7 @@ echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE # Upload scripts to EC2 instance echo "Uploading scripts to EC2 instance..." -scp -i $EC2_SSH_KEY -o "StrictHostKeyChecking=no" \ +scp -t -i $EC2_SSH_KEY -o "StrictHostKeyChecking=no" \ $LOCAL_SCRIPTS_DIR/produce-report.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ ec2-user@$PUBLIC_IP:$REMOTE_SCRIPTS_DIR/ From 2246af53a84b5636fc8a27ee648409366969bca8 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 11 Dec 2024 12:37:54 -0600 Subject: [PATCH 63/96] Add doublequotes to variable usage for proper expansion --- .github/scripts/launch-ec2.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index a21c9c4d..d98a8140 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -51,7 +51,7 @@ echo "export INSTANCE_ID=$INSTANCE_ID" >> $ENV_FILE # Wait for instance to initialize echo "Waiting for instance to reach status OK..." -aws ec2 wait instance-status-ok --instance-ids $INSTANCE_ID +aws ec2 wait instance-status-ok --instance-ids "$INSTANCE_ID" # Allocate Elastic IP echo "Allocating Elastic IP..." @@ -70,8 +70,8 @@ echo "export ALLOC_ID=$ALLOC_ID" >> $ENV_FILE # Associate Elastic IP with instance echo "Associating Elastic IP with instance..." export EIP_ASSOC=$(aws ec2 associate-address \ - --instance-id $INSTANCE_ID \ - --allocation-id $ALLOC_ID \ + --instance-id "$INSTANCE_ID" \ + --allocation-id "$ALLOC_ID" \ --query 'AssociationId' \ --output text) @@ -82,7 +82,7 @@ fi # Get Elastic IP address export PUBLIC_IP=$(aws ec2 describe-addresses \ - --allocation-ids $ALLOC_ID \ + --allocation-ids "$ALLOC_ID" \ --query 'Addresses[0].PublicIp' \ --output text) @@ -91,9 +91,9 @@ echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE # Upload scripts to EC2 instance echo "Uploading scripts to EC2 instance..." 
-scp -t -i $EC2_SSH_KEY -o "StrictHostKeyChecking=no" \ +scp -t -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \ $LOCAL_SCRIPTS_DIR/produce-report.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ - ec2-user@$PUBLIC_IP:$REMOTE_SCRIPTS_DIR/ + ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/" if [ $? -eq 0 ]; then echo "Scripts uploaded successfully to $REMOTE_SCRIPTS_DIR on the instance." @@ -104,7 +104,7 @@ fi # Mount EFS on the EC2 instance echo "Mounting EFS on the EC2 instance..." -ssh -i $EC2_SSH_KEY -o "StrictHostKeyChecking=no" ec2-user@$PUBLIC_IP < Date: Wed, 11 Dec 2024 12:51:30 -0600 Subject: [PATCH 64/96] Fixup -t goes on ssh, not scp --- .github/scripts/launch-ec2.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index d98a8140..412559f7 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -91,7 +91,7 @@ echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE # Upload scripts to EC2 instance echo "Uploading scripts to EC2 instance..." -scp -t -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \ +scp -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \ $LOCAL_SCRIPTS_DIR/produce-report.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/" @@ -104,7 +104,7 @@ fi # Mount EFS on the EC2 instance echo "Mounting EFS on the EC2 instance..." -ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@$PUBLIC_IP < Date: Wed, 11 Dec 2024 13:06:18 -0600 Subject: [PATCH 65/96] Mount as a single command, since we dont have access to pty --- .github/scripts/launch-ec2.sh | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 412559f7..27b74392 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -104,13 +104,12 @@ fi # Mount EFS on the EC2 instance echo "Mounting EFS on the EC2 instance..." -ssh -t -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@$PUBLIC_IP < Date: Wed, 11 Dec 2024 15:37:00 -0600 Subject: [PATCH 66/96] add todos for manual steps --- .github/scripts/launch-ec2.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 27b74392..249ea980 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -15,8 +15,12 @@ fi # Set variables AWS_REGION="us-east-2" +# TODO document that this key needs to be created KEY_NAME="dandihub-gh-actions" +# TODO create if DNE +# allow gh-actions to ssh into ec2 job instance from anywhere SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e" +# TODO retrieve subnet id (public, created by dandi-hub eks-dandihub-public-us-east-2a) SUBNET_ID="subnet-0f544cca61ccd2804" AMI_ID="ami-088d38b423bff245f" EFS_ID="fs-02aac16c4c6c2dc27" @@ -102,6 +106,16 @@ else exit 1 fi +# TODO automate +# eks-dandihub-efs sg is created by dandi-hub install +# this sg needs to accept incoming 2049 from the sg created for this ec2 +# sg-061d875722e569724 - eks-dandihub-efs +# aws ec2 authorize-security-group-ingress \ +# --group-id sg-061d875722e569724 \ +# --protocol tcp \ +# --port 2049 \ +# --source-group $SECURITY_GROUP_ID + # Mount EFS on the EC2 instance echo "Mounting EFS on the EC2 instance..." 
ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ From 63399248d5fc9cdeef8ac351ad90cf9ddfff0271 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Wed, 11 Dec 2024 15:50:28 -0600 Subject: [PATCH 67/96] Disable job for now Currently script must be used manually to provision ec2. ssh into the provided instance, and then execute 2 python scripts --- .github/workflows/report.yaml | 318 +++++++++++++++++----------------- 1 file changed, 159 insertions(+), 159 deletions(-) diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml index 67f4f6ef..84e1356d 100644 --- a/.github/workflows/report.yaml +++ b/.github/workflows/report.yaml @@ -1,183 +1,183 @@ ---- -name: Generate Data Usage Report - -on: - pull_request: - branches: - - main - - -jobs: - generate-jobs-usage-report: - runs-on: ubuntu-latest - - steps: - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v3 - with: - aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} - aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - aws-region: us-east-2 - - - name: Launch EC2 Instance - id: launch_ec2 - run: | - INSTANCE_ID=$(aws ec2 run-instances \ - --image-id ami-088d38b423bff245f \ - --count 1 \ - --instance-type t3.micro \ - --key-name dandihub-gh-actions \ - --security-group-ids sg-0bf2dc1c2ff9c122e \ - --subnet-id subnet-0f544cca61ccd2804 \ - --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \ - --query 'Instances[0].InstanceId' --output text) - - echo "INSTANCE_ID=${INSTANCE_ID}" >> $GITHUB_ENV - - - name: Wait for EC2 to Initialize - run: | - aws ec2 wait instance-status-ok --instance-ids ${{ env.INSTANCE_ID }} - - - name: Allocate Elastic IP - id: allocate_eip - run: | - ALLOC_ID=$(aws ec2 allocate-address --query 'AllocationId' --output text) - echo "ALLOC_ID=${ALLOC_ID}" >> $GITHUB_ENV - - - name: Associate Elastic IP with EC2 Instance - id: associate_eip - run: | - EIP=$(aws ec2 associate-address \ - --instance-id ${{ env.INSTANCE_ID }} \ - --allocation-id ${{ env.ALLOC_ID }} \ - --query 'AssociationId' --output text) - echo "EIP=${EIP}" >> $GITHUB_ENV - - - name: Retrieve Elastic IP Address - id: get_ip - run: | - PUBLIC_IP=$(aws ec2 describe-addresses \ - --allocation-ids ${{ env.ALLOC_ID }} \ - --query 'Addresses[0].PublicIp' --output text) - echo "PUBLIC_IP=${PUBLIC_IP}" - echo "PUBLIC_IP=${PUBLIC_IP}" >> $GITHUB_ENV - - - name: Execute df Command on EC2 - uses: appleboy/ssh-action@v0.1.6 - with: - host: ${{ env.PUBLIC_IP }} - username: ec2-user - key: ${{ secrets.EC2_SSH_KEY }} - script: | - echo "Running df command on EC2 instance..." - df -h - echo "Command completed." 
- continue-on-error: true # Allow the workflow to continue even if this step fails - - - - name: Terminate EC2 Instance - run: | - aws ec2 terminate-instances --instance-ids ${{ env.INSTANCE_ID }} - aws ec2 wait instance-terminated --instance-ids ${{ env.INSTANCE_ID }} - continue-on-error: true # Allow the workflow to continue even if this step fails - - - name: Release Elastic IP - run: | - aws ec2 release-address --allocation-id ${{ env.ALLOC_ID }} - continue-on-error: true # Allow the workflow to continue even if this step fails - -# jobs: -# generate_data_usage_report: -# runs-on: ubuntu-latest +# --- +# name: Generate Data Usage Report # -# steps: +# on: +# pull_request: +# branches: +# - main # -# - name: Log in to DockerHub -# uses: docker/login-action@v2 -# with: -# username: ${{ secrets.DOCKERHUB_USERNAME }} -# password: ${{ secrets.DOCKERHUB_TOKEN }} # -# - name: Build and push Docker image -# uses: docker/build-push-action@v3 -# with: -# context: . -# file: images/Dockerfile.dandihub_report_generator -# push: true -# tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest +# jobs: +# generate-jobs-usage-report: +# runs-on: ubuntu-latest # -# - name: Configure AWS credentials +# steps: +# - name: Configure AWS Credentials # uses: aws-actions/configure-aws-credentials@v3 # with: # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} # aws-region: us-east-2 # -# - name: Assume ProvisioningRole +# - name: Launch EC2 Instance +# id: launch_ec2 # run: | -# CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") -# export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') -# export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') -# export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') +# INSTANCE_ID=$(aws ec2 run-instances \ +# --image-id ami-088d38b423bff245f \ +# --count 1 \ +# --instance-type t3.micro \ +# --key-name dandihub-gh-actions \ +# --security-group-ids sg-0bf2dc1c2ff9c122e \ +# --subnet-id subnet-0f544cca61ccd2804 \ +# --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \ +# --query 'Instances[0].InstanceId' --output text) +# +# echo "INSTANCE_ID=${INSTANCE_ID}" >> $GITHUB_ENV # -# - name: Configure kubectl with AWS EKS +# - name: Wait for EC2 to Initialize # run: | -# aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} +# aws ec2 wait instance-status-ok --instance-ids ${{ env.INSTANCE_ID }} # -# # TODO remove -# - name: Sanity check +# - name: Allocate Elastic IP +# id: allocate_eip # run: | -# kubectl get pods -n jupyterhub +# ALLOC_ID=$(aws ec2 allocate-address --query 'AllocationId' --output text) +# echo "ALLOC_ID=${ALLOC_ID}" >> $GITHUB_ENV # -# # - name: Deploy Hello World Pod -# # run: | -# # kubectl apply -f .github/manifests/hello-world-pod.yaml -# # -# # - name: Wait for Hello World Pod to complete -# # run: | -# # kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes -# # continue-on-error: true # Allow the workflow to continue even if this step fails -# # -# # - name: Get Hello World Pod logs -# # run: | -# # kubectl logs hello-world-pod -# # if: ${{ success() }} # Only run this step if the previous step was successful -# # -# # - name: Delete Hello World Pod -# # run: | -# # kubectl delete pod hello-world-pod -# # if: ${{ 
always() }} # Always run this step, even if other steps fail -# # -# - name: Replace image placeholder in manifest +# - name: Associate Elastic IP with EC2 Instance +# id: associate_eip # run: | -# sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml +# EIP=$(aws ec2 associate-address \ +# --instance-id ${{ env.INSTANCE_ID }} \ +# --allocation-id ${{ env.ALLOC_ID }} \ +# --query 'AssociationId' --output text) +# echo "EIP=${EIP}" >> $GITHUB_ENV # -# - name: Deploy Disk Usage Report Job +# - name: Retrieve Elastic IP Address +# id: get_ip # run: | -# kubectl apply -f .github/manifests/disk-usage-report-job.yaml +# PUBLIC_IP=$(aws ec2 describe-addresses \ +# --allocation-ids ${{ env.ALLOC_ID }} \ +# --query 'Addresses[0].PublicIp' --output text) +# echo "PUBLIC_IP=${PUBLIC_IP}" +# echo "PUBLIC_IP=${PUBLIC_IP}" >> $GITHUB_ENV +# +# - name: Execute df Command on EC2 +# uses: appleboy/ssh-action@v0.1.6 +# with: +# host: ${{ env.PUBLIC_IP }} +# username: ec2-user +# key: ${{ secrets.EC2_SSH_KEY }} +# script: | +# echo "Running df command on EC2 instance..." +# df -h +# echo "Command completed." +# continue-on-error: true # Allow the workflow to continue even if this step fails +# # -# # TODO should timeout be longer? -# - name: Wait for Disk Usage Report Job to complete +# - name: Terminate EC2 Instance # run: | -# kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub -# continue-on-error: true +# aws ec2 terminate-instances --instance-ids ${{ env.INSTANCE_ID }} +# aws ec2 wait instance-terminated --instance-ids ${{ env.INSTANCE_ID }} +# continue-on-error: true # Allow the workflow to continue even if this step fails # -# # continue-on-error for previous steps so we delete the job -# - name: Delete Disk Usage Report Job +# - name: Release Elastic IP # run: | -# kubectl delete job disk-usage-report-job -n jupyterhub +# aws ec2 release-address --allocation-id ${{ env.ALLOC_ID }} +# continue-on-error: true # Allow the workflow to continue even if this step fails # -# # - name: Clone dandi-hub-usage-reports repository -# # run: | -# # git clone https://github.com/dandi/dandi-hub-usage-reports.git -# # -# # - name: Copy report file to repository, commit and push report -# # run: | -# # cd dandi-hub-usage-reports -# # DATE=$(date +'%Y-%m-%d') -# # mv ../du_report.json $DATE_du_report.json -# # git config --global user.name "GitHub Actions" -# # git config --global user.email "actions@github.com" -# # git add $DATE_du_report.json -# # git commit -m "Add disk usage report for $DATE" -# # git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git +# # jobs: +# # generate_data_usage_report: +# # runs-on: ubuntu-latest +# # +# # steps: +# # +# # - name: Log in to DockerHub +# # uses: docker/login-action@v2 +# # with: +# # username: ${{ secrets.DOCKERHUB_USERNAME }} +# # password: ${{ secrets.DOCKERHUB_TOKEN }} +# # +# # - name: Build and push Docker image +# # uses: docker/build-push-action@v3 +# # with: +# # context: . 
+# # file: images/Dockerfile.dandihub_report_generator +# # push: true +# # tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest +# # +# # - name: Configure AWS credentials +# # uses: aws-actions/configure-aws-credentials@v3 +# # with: +# # aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} +# # aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} +# # aws-region: us-east-2 +# # +# # - name: Assume ProvisioningRole +# # run: | +# # CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession") +# # export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId') +# # export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey') +# # export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken') +# # +# # - name: Configure kubectl with AWS EKS +# # run: | +# # aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} +# # +# # # TODO remove +# # - name: Sanity check +# # run: | +# # kubectl get pods -n jupyterhub +# # +# # # - name: Deploy Hello World Pod +# # # run: | +# # # kubectl apply -f .github/manifests/hello-world-pod.yaml +# # # +# # # - name: Wait for Hello World Pod to complete +# # # run: | +# # # kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s # 5 minutes +# # # continue-on-error: true # Allow the workflow to continue even if this step fails +# # # +# # # - name: Get Hello World Pod logs +# # # run: | +# # # kubectl logs hello-world-pod +# # # if: ${{ success() }} # Only run this step if the previous step was successful +# # # +# # # - name: Delete Hello World Pod +# # # run: | +# # # kubectl delete pod hello-world-pod +# # # if: ${{ always() }} # Always run this step, even if other steps fail +# # # +# # - name: Replace image placeholder in manifest +# # run: | +# # sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml +# # +# # - name: Deploy Disk Usage Report Job +# # run: | +# # kubectl apply -f .github/manifests/disk-usage-report-job.yaml +# # +# # # TODO should timeout be longer? 
+# # - name: Wait for Disk Usage Report Job to complete
+# #   run: |
+# #     kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub
+# #   continue-on-error: true
+# #
+# # # continue-on-error for previous steps so we delete the job
+# # - name: Delete Disk Usage Report Job
+# #   run: |
+# #     kubectl delete job disk-usage-report-job -n jupyterhub
+# #
+# # # - name: Clone dandi-hub-usage-reports repository
+# # #   run: |
+# # #     git clone https://github.com/dandi/dandi-hub-usage-reports.git
+# # #
+# # # - name: Copy report file to repository, commit and push report
+# # #   run: |
+# # #     cd dandi-hub-usage-reports
+# # #     DATE=$(date +'%Y-%m-%d')
+# # #     mv ../du_report.json $DATE_du_report.json
+# # #     git config --global user.name "GitHub Actions"
+# # #     git config --global user.email "actions@github.com"
+# # #     git add $DATE_du_report.json
+# # #     git commit -m "Add disk usage report for $DATE"
+# # #     git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git

From 17130ef88fdf692bc35ea9306db4fa962aa6578b Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Thu, 12 Dec 2024 16:45:51 -0600
Subject: [PATCH 68/96] Update AMI to Ubuntu

Use Ubuntu for a more modern Python: 3.12, vs 3.7 on the previous AMI or
3.9 on another Amazon Linux image.
---
 .github/scripts/launch-ec2.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh
index 249ea980..e374d42c 100755
--- a/.github/scripts/launch-ec2.sh
+++ b/.github/scripts/launch-ec2.sh
@@ -22,12 +22,13 @@ KEY_NAME="dandihub-gh-actions"
 SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e"
 # TODO retrieve subnet id (public, created by dandi-hub eks-dandihub-public-us-east-2a)
 SUBNET_ID="subnet-0f544cca61ccd2804"
-AMI_ID="ami-088d38b423bff245f"
+AMI_ID="ami-036841078a4b68e14"
 EFS_ID="fs-02aac16c4c6c2dc27"
 LOCAL_SCRIPTS_DIR=".github/scripts"
 REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts"
 MOUNT_POINT="/mnt/efs"
 ENV_FILE=".ec2-session.env"
+EC2_USER="ubuntu"
 
 # Ensure the environment file is writable
 echo "# Environment variables for EC2 session" > $ENV_FILE
@@ -97,7 +98,7 @@ echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE
 echo "Uploading scripts to EC2 instance..."
 scp -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \
   $LOCAL_SCRIPTS_DIR/produce-report.py $LOCAL_SCRIPTS_DIR/create-file-index.py \
-  ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/"
+  $EC2_USER@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/"
 
 if [ $? -eq 0 ]; then
   echo "Scripts uploaded successfully to $REMOTE_SCRIPTS_DIR on the instance."
@@ -118,7 +119,7 @@ fi
 
 # Mount EFS on the EC2 instance
 echo "Mounting EFS on the EC2 instance..."
-ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \
+ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" $EC2_USER@"$PUBLIC_IP" \
   "sudo yum install -y amazon-efs-utils && \
   sudo mkdir -p $MOUNT_POINT && \
   sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \

From cc29df52b34fa12706065d5c79ae271badcbd000 Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Thu, 12 Dec 2024 16:58:54 -0600
Subject: [PATCH 69/96] Roll back to AL 2023

Python 3.9 is new enough. Ubuntu was not ideal because mounting EFS requires
amazon-efs-utils, which is available via the package manager on Amazon Linux
but must be built from source on Ubuntu.
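
For the record, the Ubuntu route would have meant building the package from
source, roughly following the upstream aws/efs-utils README (an untested
sketch; the exact build-dependency list is an assumption):

    # Hypothetical Ubuntu build of amazon-efs-utils from source
    sudo apt-get update
    sudo apt-get install -y git binutils rustc cargo pkg-config libssl-dev
    git clone https://github.com/aws/efs-utils
    cd efs-utils
    ./build-deb.sh
    sudo apt-get install -y ./build/amazon-efs-utils*.deb

On Amazon Linux the same thing is a single "sudo yum install -y
amazon-efs-utils".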
--- .github/scripts/launch-ec2.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index e374d42c..1c3cd615 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -22,13 +22,12 @@ KEY_NAME="dandihub-gh-actions" SECURITY_GROUP_ID="sg-0bf2dc1c2ff9c122e" # TODO retrieve subnet id (public, created by dandi-hub eks-dandihub-public-us-east-2a) SUBNET_ID="subnet-0f544cca61ccd2804" -AMI_ID="ami-036841078a4b68e14" +AMI_ID="ami-0c80e2b6ccb9ad6d1" EFS_ID="fs-02aac16c4c6c2dc27" LOCAL_SCRIPTS_DIR=".github/scripts" REMOTE_SCRIPTS_DIR="/home/ec2-user/scripts" MOUNT_POINT="/mnt/efs" ENV_FILE=".ec2-session.env" -EC2_USER="ubuntu" # Ensure the environment file is writable echo "# Environment variables for EC2 session" > $ENV_FILE @@ -98,7 +97,7 @@ echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE echo "Uploading scripts to EC2 instance..." scp -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \ $LOCAL_SCRIPTS_DIR/produce-report.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ - $EC2_USER@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/" + ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/" if [ $? -eq 0 ]; then echo "Scripts uploaded successfully to $REMOTE_SCRIPTS_DIR on the instance." @@ -119,7 +118,7 @@ fi # Mount EFS on the EC2 instance echo "Mounting EFS on the EC2 instance..." -ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" $EC2_USER@"$PUBLIC_IP" \ +ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ "sudo yum install -y amazon-efs-utils && \ sudo mkdir -p $MOUNT_POINT && \ sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \ From 295361cf4d4ea299ad1470fec4c4e32d3b458eca Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 11:18:17 -0600 Subject: [PATCH 70/96] drop gzip, just write json gzip will only be needed if we want to upload to s3 --- .github/scripts/create-file-index.py | 20 +++++++++----------- .github/scripts/produce-report.py | 12 ++++++------ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index 77fe0cd0..67fa339c 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -4,7 +4,6 @@ import time import json import sys -import gzip from datetime import datetime def list_files_with_metadata(directory, output_file): @@ -43,24 +42,23 @@ def list_files_with_metadata(directory, output_file): "files": files_metadata } - # Compress and write the output data to a .json.gz file - with gzip.open(output_file, "wt", encoding="utf-8") as gz_file: - json.dump(output_data, gz_file, indent=4) + # Write the output data to a .json file + with open(output_file, "w", encoding="utf-8") as json_file: + json.dump(output_data, json_file, indent=4) print(f"Indexing completed. 
Compressed results written to {output_file}") # Ensure the script is called with the required arguments if __name__ == "__main__": if len(sys.argv) != 3: - print("Usage: python script.py ") + print("Usage: python script.py ") sys.exit(1) directory_to_index = sys.argv[1] - output_json_gz_file = sys.argv[2] + output_json_file = sys.argv[2] - # Ensure the output filename ends with .json.gz for clarity - if not output_json_gz_file.endswith(".json.gz"): - output_json_gz_file += ".json.gz" - - list_files_with_metadata(directory_to_index, output_json_gz_file) + # Ensure the output filename ends with .json for clarity + if not output_json_file.endswith(".json"): + output_json_file += ".json" + list_files_with_metadata(directory_to_index, output_json_file) diff --git a/.github/scripts/produce-report.py b/.github/scripts/produce-report.py index 49debfe0..9daca426 100755 --- a/.github/scripts/produce-report.py +++ b/.github/scripts/produce-report.py @@ -12,8 +12,8 @@ def generate_statistics(input_file): # Load the JSON data from the compressed file - with gzip.open(input_file, 'rt', encoding='utf-8') as gz_file: - data = json.load(gz_file) + with open(input_file, 'r', encoding='utf-8') as json_file: + data = json.load(json_file) # Dictionary to hold statistics per leading directory stats = { @@ -44,12 +44,12 @@ def bytes_to_human_readable(size_in_bytes): if __name__ == "__main__": if len(sys.argv) != 2: - print("Usage: python script.py ") + print("Usage: python script.py ") sys.exit(1) - input_json_gz_file = sys.argv[1] - username = input_json_gz_file.split(".")[0] - stats = generate_statistics(input_json_gz_file) + input_json_file = sys.argv[1] + username = input_json_file.split(".")[0] + stats = generate_statistics(input_json_file) human_readable_total = bytes_to_human_readable(stats["total"]["total_size"]) if stats["total"]["total_size"] < USER_QUOTA: From a667c047830e229fbf15921f7ddf371d0696ab37 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 11:20:55 -0600 Subject: [PATCH 71/96] include target dir in relative paths --- .github/scripts/create-file-index.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index 67fa339c..9c480f9f 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -18,11 +18,9 @@ def list_files_with_metadata(directory, output_file): for root, dirs, files in os.walk(directory): for name in files: filepath = os.path.join(root, name) - relative_path = os.path.relpath(filepath, directory) - try: metadata = { - "path": relative_path, + "path": filepath, "size": os.path.getsize(filepath), "modified": time.ctime(os.path.getmtime(filepath)), "created": time.ctime(os.path.getctime(filepath)) From a91beb0b6da0aa7cd76a1387ebf2b3b65bca6e48 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 11:32:59 -0600 Subject: [PATCH 72/96] Second script will not produce user report, but directory stats json --- .../scripts/{produce-report.py => calculate-directory-stats.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/scripts/{produce-report.py => calculate-directory-stats.py} (100%) diff --git a/.github/scripts/produce-report.py b/.github/scripts/calculate-directory-stats.py similarity index 100% rename from .github/scripts/produce-report.py rename to .github/scripts/calculate-directory-stats.py From 9371982d92cb1c0b9a2681c24789816e8a0e4b34 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 
Dec 2024 15:41:22 -0600
Subject: [PATCH 73/96] Initial algorithm hackout
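
The core assumption: the index lists all of a directory's files before moving
on, so a directory is finished as soon as the walk reaches a path that is not
inside it, and its file count can then be rolled up into every ancestor below
the closest shared directory. A toy illustration of the path primitives this
leans on (illustrative only, not part of this commit's code):

    import os

    # The walk finishes a/b/c when it arrives at a/d; the closest shared
    # ancestor bounds how far the finished count must be propagated.
    print(os.path.commonpath(["a/d", "a/b/c"]))              # -> a
    # Intermediate dirs that still need a/b/c's count (deepest excluded):
    print(os.path.relpath("a/b/c", "a").split(os.sep)[:-1])  # -> ['b']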
---
 .github/scripts/calculate-directory-stats.py | 86 ++++++++++++--------
 1 file changed, 52 insertions(+), 34 deletions(-)

diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py
index 9daca426..1fab7e93 100755
--- a/.github/scripts/calculate-directory-stats.py
+++ b/.github/scripts/calculate-directory-stats.py
@@ -10,37 +10,66 @@
 
 # TODO trash files
 
+
+def propagate_dir(stats, highest_common, dir_list, prev_dir):
+    while dir_list:
+        working_dir = os.path.join(highest_common, *dir_list)
+        stats[working_dir]['file_count'] += stats[prev_dir]['file_count']
+        dir_list.pop()
+        prev_dir = working_dir
+    stats[highest_common]['file_count'] += stats[prev_dir]['file_count']
+
 def generate_statistics(input_file):
     # Load the JSON data from the compressed file
     with open(input_file, 'r', encoding='utf-8') as json_file:
         data = json.load(json_file)
 
     # Dictionary to hold statistics per leading directory
-    stats = {
-        "directories": defaultdict(lambda: {"total_size": 0, "file_count": 0}),
-        "total": {"total_size": 0, "file_count": 0},
-    }
+    stats = defaultdict(lambda: {"total_size": 0, "file_count": 0})
 
-    # Process each file's metadata
+    # Assumes dirs are listed depth first (files are listed prior to directories)
+    previous_parent = ""
     for file_metadata in data["files"]:
-        # Get the leading directory (first part of the relative path)
-        leading_dir = file_metadata["path"].split(os.sep)[0]
-        # TODO trash files
-        # if file_metadata["path"] matches trashglob
-        #   stats["caches"][""] increment file and totalsize count
-        # Update statistics for this leading directory
-        stats["directories"][leading_dir]["file_count"] += 1
-        stats["total"]["file_count"] += 1
-        stats["directories"][leading_dir]["total_size"] += file_metadata["size"]
-        stats["total"]["total_size"] += file_metadata["size"]
+        print(f"Calculating {file_metadata['path']}")
+        this_parent = os.path.dirname(file_metadata["path"])
+        stats[this_parent]["file_count"] += 1
+
+        if previous_parent == this_parent:
+            continue
+        # going deeper
+        # TODO account for going multiple levels deeper
+        elif not previous_parent or previous_parent == os.path.dirname(this_parent):
+            previous_parent = this_parent
+            continue
+        else:
+            # previous dir done, possibly ancestors done too
+            highest_common_dir = os.path.commonpath([this_parent, previous_parent])
+
+            path_to_propagate = os.path.relpath(previous_parent, highest_common_dir)
+            dir_list_to_propagate = path_to_propagate.split(os.sep)[:-1]
+
+            print(f"{previous_parent} done, propegating to ancestors")
+            print(f"Highest common: {highest_common_dir}")
+            print(f"dir list to prop: {dir_list_to_propagate}")
+            propagate_dir(stats, highest_common_dir, dir_list_to_propagate, previous_parent)
+            previous_parent = this_parent
+
+    leading_dir = previous_parent.split(os.sep)[0]
+    highest_common_dir = os.path.commonpath([leading_dir, previous_parent])
+    path_to_propagate = os.path.relpath(previous_parent, highest_common_dir)
+    dir_list_to_propagate = path_to_propagate.split(os.sep)[:-1]
+    print(f"a is currently {stats['a']['file_count']}")
+    print(f"FINAL {previous_parent} done, propegating to ancestors")
+    print(f"Highest common: {highest_common_dir}")
+    print(f"dir list to prop: {dir_list_to_propagate}")
+    propagate_dir(stats, highest_common_dir, dir_list_to_propagate, previous_parent)
+    # propagate_dir(stats, highest_common_dir)
+    # for each in dir_list_to_propagate:
+    #     highest_common_dir = os.path.join(highest_common_dir, each)
+    #     propagate_dir(stats, highest_common_dir)
 
     return stats
 
-def bytes_to_human_readable(size_in_bytes):
-    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
-        if size_in_bytes < 1024:
-            return f"{size_in_bytes:.2f} {unit}"
-        size_in_bytes /= 1024
 
 if __name__ == "__main__":
     if len(sys.argv) != 2:
@@ -50,17 +79,6 @@ def bytes_to_human_readable(size_in_bytes):
     input_json_file = sys.argv[1]
     username = input_json_file.split(".")[0]
     stats = generate_statistics(input_json_file)
-    human_readable_total = bytes_to_human_readable(stats["total"]["total_size"])
-
-    if stats["total"]["total_size"] < USER_QUOTA:
-        print(f"All ok, user {username} is below quota, consuming {human_readable_total}")
-        sys.exit(0)
-
-    human_readable_quota = bytes_to_human_readable(USER_QUOTA)
-    with open(f"{username}-usage-report.txt", "w") as report:
-        report.write(f"Total usage: {human_readable_total} exceeds quota amount: {human_readable_quota}\n\n")
-        for directory, stat in stats["directories"].items():
-            report.write(f"Directory: {directory}\n")
-            report.write(f"  Total files: {stat['file_count']}\n")
-            report.write(f"  Total size: {bytes_to_human_readable(stat['total_size'])}\n")
-            report.write("\n")
+    for directory, stat in stats.items():
+        print(f"{directory}: {stat['file_count']}")
+

From 8cead5ac4cf12c515621050537d08332ebacdedc Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Fri, 13 Dec 2024 16:07:39 -0600
Subject: [PATCH 74/96] Clean up and refactor for simplicity

---
 .github/scripts/calculate-directory-stats.py | 75 +++++++-------------
 1 file changed, 27 insertions(+), 48 deletions(-)

diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py
index 1fab7e93..439020d4 100755
--- a/.github/scripts/calculate-directory-stats.py
+++ b/.github/scripts/calculate-directory-stats.py
@@ -4,31 +4,25 @@
 import gzip
 import json
 import sys
+import unittest
 from collections import defaultdict
 
-USER_QUOTA = 8_000_000_000
-
-# TODO trash files
-
-
-def propagate_dir(stats, highest_common, dir_list, prev_dir):
-    while dir_list:
-        working_dir = os.path.join(highest_common, *dir_list)
-        stats[working_dir]['file_count'] += stats[prev_dir]['file_count']
-        dir_list.pop()
-        prev_dir = working_dir
-    stats[highest_common]['file_count'] += stats[prev_dir]['file_count']
-
-def generate_statistics(input_file):
-    # Load the JSON data from the compressed file
-    with open(input_file, 'r', encoding='utf-8') as json_file:
-        data = json.load(json_file)
+def propagate_dir(stats, current_parent, previous_parent):
+    highest_common = os.path.commonpath([current_parent, previous_parent])
+    path_to_propagate = os.path.relpath(previous_parent, highest_common)
+    nested_dir_list = path_to_propagate.split(os.sep)[:-1]
+    # Add each dir count to all ancestors up to highest common dir
+    while nested_dir_list:
+        working_dir = os.path.join(highest_common, *nested_dir_list)
+        stats[working_dir]['file_count'] += stats[previous_parent]['file_count']
+        nested_dir_list.pop()
+        previous_parent = working_dir
+    stats[highest_common]['file_count'] += stats[previous_parent]['file_count']
+
+def generate_directory_statistics(data):
+    # Assumes dirs are listed depth first (files are listed prior to directories)
 
-    # Dictionary to hold statistics per leading directory
     stats = defaultdict(lambda: {"total_size": 0, "file_count": 0})
-
-
-    # Assumes dirs are listed depth first (files are listed prior to directories)
+    previous_parent = ""
     for file_metadata in data["files"]:
-        print(f"Calculating 
{file_metadata['path']}") @@ -38,47 +32,32 @@ def generate_statistics(input_file): if previous_parent == this_parent: continue # going deeper - # TODO account for going multiple levels deeper elif not previous_parent or previous_parent == os.path.dirname(this_parent): previous_parent = this_parent continue - else: - # previous dir done, possibly ancestors done too - highest_common_dir = os.path.commonpath([this_parent, previous_parent]) - - path_to_propagate = os.path.relpath(previous_parent, highest_common_dir) - dir_list_to_propagate = path_to_propagate.split(os.sep)[:-1] - - print(f"{previous_parent} done, propegating to ancestors") - print(f"Highest common: {highest_common_dir}") - print(f"dir list to prop: {dir_list_to_propagate}") - propagate_dir(stats, highest_common_dir, dir_list_to_propagate, previous_parent) + else: # previous dir done + propagate_dir(stats, this_parent, previous_parent) previous_parent = this_parent + # Run a final time with the root directory as this parent leading_dir = previous_parent.split(os.sep)[0] - highest_common_dir = os.path.commonpath([leading_dir, previous_parent]) - path_to_propagate = os.path.relpath(previous_parent, highest_common_dir) - dir_list_to_propagate = path_to_propagate.split(os.sep)[:-1] - print(f"a is currently {stats['a']['file_count']}") - print(f"FINAL {previous_parent} done, propegating to ancestors") - print(f"Highest common: {highest_common_dir}") - print(f"dir list to prop: {dir_list_to_propagate}") - propagate_dir(stats, highest_common_dir, dir_list_to_propagate, previous_parent) - # propagate_dir(stats, highest_common_dir) - # for each in dir_list_to_propagate: - # highest_common_dir = os.path.join(highest_common_dir, each) - # propagate_dir(stats, highest_common_dir) + propagate_dir(stats, leading_dir, previous_parent) return stats - -if __name__ == "__main__": +def main(): if len(sys.argv) != 2: print("Usage: python script.py ") sys.exit(1) input_json_file = sys.argv[1] username = input_json_file.split(".")[0] - stats = generate_statistics(input_json_file) + with open(input_json_file, 'r', encoding='utf-8') as json_file: + data = json.load(json_file) + + stats = generate_directory_statistics(data) for directory, stat in stats.items(): print(f"{directory}: {stat['file_count']}") +if __name__ == "__main__": + main() + From 86a7c7243e2a9d1afc8f81d33e4c0011087e14e4 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 16:49:25 -0600 Subject: [PATCH 75/96] Add basic tests --- .github/scripts/calculate-directory-stats.py | 33 ++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index 439020d4..bdd3a92c 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -58,6 +58,35 @@ def main(): for directory, stat in stats.items(): print(f"{directory}: {stat['file_count']}") -if __name__ == "__main__": - main() +class TestDirectoryStatistics(unittest.TestCase): + def test_propagate_dir(self): + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + stats["a/b/c"] = {"total_size": 0, "file_count": 3} + stats["a/b"] = {"total_size": 0, "file_count": 0} + stats["a"] = {"total_size": 0, "file_count": 0} + + propagate_dir(stats, "a", "a/b/c") + self.assertEqual(stats["a"]["file_count"], 3) + self.assertEqual(stats["a/b"]["file_count"], 3) + + def test_generate_directory_statistics(self): + sample_data = { + "files": [ + {"path": 
"a/b/file3.txt"}, + {"path": "a/b/c/file1.txt"}, + {"path": "a/b/c/file2.txt"}, + {"path": "a/b/c/d/file4.txt"} + ] + } + stats = generate_directory_statistics(sample_data) + self.assertEqual(stats["a/b/c/d"]["file_count"], 1) + self.assertEqual(stats["a/b/c"]["file_count"], 3) + self.assertEqual(stats["a/b"]["file_count"], 4) + self.assertEqual(stats["a"]["file_count"], 4) + +if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "test": + unittest.main(argv=sys.argv[:1]) # Run tests if "test" is provided as an argument + else: + main() From fc1cab1b62259d3b18f8ffe2e28356d2a98d2aa8 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 16:53:48 -0600 Subject: [PATCH 76/96] test multiple directories in root --- .github/scripts/calculate-directory-stats.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index bdd3a92c..e1dcd8cc 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -25,7 +25,6 @@ def generate_directory_statistics(data): stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) previous_parent = "" for file_metadata in data["files"]: - print(f"Calculating {file_metadata['path']}") this_parent = os.path.dirname(file_metadata["path"]) stats[this_parent]["file_count"] += 1 @@ -70,20 +69,29 @@ def test_propagate_dir(self): self.assertEqual(stats["a"]["file_count"], 3) self.assertEqual(stats["a/b"]["file_count"], 3) + def test_generate_directory_statistics(self): sample_data = { "files": [ {"path": "a/b/file3.txt"}, {"path": "a/b/c/file1.txt"}, {"path": "a/b/c/file2.txt"}, - {"path": "a/b/c/d/file4.txt"} + {"path": "a/b/c/d/file4.txt"}, + {"path": "a/e/file3.txt"}, + {"path": "a/e/f/file1.txt"}, + {"path": "a/e/f/file2.txt"}, + {"path": "a/e/f/g/file4.txt"} ] } stats = generate_directory_statistics(sample_data) self.assertEqual(stats["a/b/c/d"]["file_count"], 1) self.assertEqual(stats["a/b/c"]["file_count"], 3) self.assertEqual(stats["a/b"]["file_count"], 4) - self.assertEqual(stats["a"]["file_count"], 4) + self.assertEqual(stats["a/e/f/g"]["file_count"], 1) + self.assertEqual(stats["a/e/f"]["file_count"], 3) + self.assertEqual(stats["a/e"]["file_count"], 4) + self.assertEqual(stats["a"]["file_count"], 8) + if __name__ == "__main__": if len(sys.argv) > 1 and sys.argv[1] == "test": From 2308aed1b153e8789c0ba051bc7fb89004995702 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 16:55:37 -0600 Subject: [PATCH 77/96] comment about [:-1] --- .github/scripts/calculate-directory-stats.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index e1dcd8cc..87b4b8c7 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -10,6 +10,7 @@ def propagate_dir(stats, current_parent, previous_parent): highest_common = os.path.commonpath([current_parent, previous_parent]) path_to_propagate = os.path.relpath(previous_parent, highest_common) + # leaves off last to avoid propagating to the path we are propagating from nested_dir_list = path_to_propagate.split(os.sep)[:-1] # Add each dir count to all ancestors up to highest common dir while nested_dir_list: From 84754fe0c20e45b2dc188fdefdfcf5695c796c38 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 20:27:39 -0600 Subject: [PATCH 78/96] support 
abspaths --- .github/scripts/calculate-directory-stats.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index 87b4b8c7..085145f2 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -8,7 +8,11 @@ from collections import defaultdict def propagate_dir(stats, current_parent, previous_parent): + assert os.path.isabs(current_parent) == os.path.isabs(previous_parent), \ + "current_parent and previous_parent must both be abspath or both be relpath" highest_common = os.path.commonpath([current_parent, previous_parent]) + assert highest_common, "highest_common must either be a target directory or /" + path_to_propagate = os.path.relpath(previous_parent, highest_common) # leaves off last to avoid propagating to the path we are propagating from nested_dir_list = path_to_propagate.split(os.sep)[:-1] @@ -40,7 +44,9 @@ def generate_directory_statistics(data): previous_parent = this_parent # Run a final time with the root directory as this parent - leading_dir = previous_parent.split(os.sep)[0] + # During final run, leading dir cannot be empty string, propagate_dir requires + # both to be abspath or both to be relpath + leading_dir = previous_parent.split(os.sep)[0] or "/" propagate_dir(stats, leading_dir, previous_parent) return stats @@ -70,6 +76,15 @@ def test_propagate_dir(self): self.assertEqual(stats["a"]["file_count"], 3) self.assertEqual(stats["a/b"]["file_count"], 3) + def test_propagate_dir_abs_path(self): + stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) + stats["/a/b/c"] = {"total_size": 0, "file_count": 3} + stats["/a/b"] = {"total_size": 0, "file_count": 0} + stats["/a"] = {"total_size": 0, "file_count": 0} + + propagate_dir(stats, "/a", "/a/b/c") + self.assertEqual(stats["/a"]["file_count"], 3) + self.assertEqual(stats["/a/b"]["file_count"], 3) def test_generate_directory_statistics(self): sample_data = { From a1427acedafa7880fd223d88ad0d0083bf8c0f66 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 20:33:51 -0600 Subject: [PATCH 79/96] [DATALAD RUNCMD] blacken === Do not change lines below === { "chain": [], "cmd": "black .github/scripts/calculate-directory-stats.py", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [ ".github/scripts/calculate-directory-stats.py" ], "pwd": "." 
} ^^^ Do not change lines above ^^^ --- .github/scripts/calculate-directory-stats.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index 085145f2..9ff2bd6f 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -7,9 +7,11 @@ import unittest from collections import defaultdict + def propagate_dir(stats, current_parent, previous_parent): - assert os.path.isabs(current_parent) == os.path.isabs(previous_parent), \ - "current_parent and previous_parent must both be abspath or both be relpath" + assert os.path.isabs(current_parent) == os.path.isabs( + previous_parent + ), "current_parent and previous_parent must both be abspath or both be relpath" highest_common = os.path.commonpath([current_parent, previous_parent]) assert highest_common, "highest_common must either be a target directory or /" @@ -19,10 +21,11 @@ def propagate_dir(stats, current_parent, previous_parent): # Add each dir count to all ancestors up to highest common dir while nested_dir_list: working_dir = os.path.join(highest_common, *nested_dir_list) - stats[working_dir]['file_count'] += stats[previous_parent]['file_count'] + stats[working_dir]["file_count"] += stats[previous_parent]["file_count"] nested_dir_list.pop() previous_parent = working_dir - stats[highest_common]['file_count'] += stats[previous_parent]['file_count'] + stats[highest_common]["file_count"] += stats[previous_parent]["file_count"] + def generate_directory_statistics(data): # Assumes dirs are listed depth first (files are listed prior to directories) @@ -50,6 +53,7 @@ def generate_directory_statistics(data): propagate_dir(stats, leading_dir, previous_parent) return stats + def main(): if len(sys.argv) != 2: print("Usage: python script.py ") @@ -57,7 +61,7 @@ def main(): input_json_file = sys.argv[1] username = input_json_file.split(".")[0] - with open(input_json_file, 'r', encoding='utf-8') as json_file: + with open(input_json_file, "r", encoding="utf-8") as json_file: data = json.load(json_file) stats = generate_directory_statistics(data) @@ -96,7 +100,7 @@ def test_generate_directory_statistics(self): {"path": "a/e/file3.txt"}, {"path": "a/e/f/file1.txt"}, {"path": "a/e/f/file2.txt"}, - {"path": "a/e/f/g/file4.txt"} + {"path": "a/e/f/g/file4.txt"}, ] } stats = generate_directory_statistics(sample_data) @@ -111,6 +115,8 @@ def test_generate_directory_statistics(self): if __name__ == "__main__": if len(sys.argv) > 1 and sys.argv[1] == "test": - unittest.main(argv=sys.argv[:1]) # Run tests if "test" is provided as an argument + unittest.main( + argv=sys.argv[:1] + ) # Run tests if "test" is provided as an argument else: main() From 16e48904684b92f5d0bdf5432fe52748242cad94 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Fri, 13 Dec 2024 20:55:43 -0600 Subject: [PATCH 80/96] test propagation with files in all dirs --- .github/scripts/calculate-directory-stats.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index 9ff2bd6f..e56815b8 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -90,6 +90,17 @@ def test_propagate_dir_abs_path(self): self.assertEqual(stats["/a"]["file_count"], 3) self.assertEqual(stats["/a/b"]["file_count"], 3) + def test_propagate_dir_files_in_all(self): + stats = defaultdict(lambda: 
{"total_size": 0, "file_count": 0}) + stats["a/b/c"] = {"total_size": 0, "file_count": 3} + stats["a/b"] = {"total_size": 0, "file_count": 2} + stats["a"] = {"total_size": 0, "file_count": 1} + + propagate_dir(stats, "a", "a/b/c") + self.assertEqual(stats["a"]["file_count"], 6) + self.assertEqual(stats["a/b"]["file_count"], 5) + + def test_generate_directory_statistics(self): sample_data = { "files": [ From 528833d811ac1b66d6cc68421898446a3950a507 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sat, 14 Dec 2024 22:14:47 -0600 Subject: [PATCH 81/96] Write files to disk as they are inspected Holding the entire file index in memory would be risky, especially with zarr files --- .github/scripts/create-file-index.py | 110 +++++++++++++++++---------- 1 file changed, 70 insertions(+), 40 deletions(-) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index 9c480f9f..065ffc20 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -1,62 +1,92 @@ #!/usr/bin/env python3 import os +import csv import time -import json import sys from datetime import datetime +from pathlib import Path -def list_files_with_metadata(directory, output_file): - # Record the start time - start_time = time.time() - # Get the current date and time for indexing - index_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") +class MetadataWriter: + def __init__(self, output_path): + self.output_path = Path(output_path) + self.start_time = None + self.end_time = None + self.meta = { + "index_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "duration": None, + "total_files": 0, + } + self.file = None + self.writer = None - files_metadata = [] + def start(self): + """Initialize the metadata and open the file for writing.""" + self.start_time = time.time() + self.file = self.output_path.open(mode="w", newline="", encoding="utf-8") + self.writer = csv.writer(self.file, delimiter="\t") + self.writer.writerow(["file_name", "file_size", "file_type", "custom_metadata"]) + def write_row(self, file_name, file_size, created, modified, error): + """Write data for a file.""" + if not self.writer: + raise RuntimeError("Writer not initialized.") + if error is not None: + self.writer.writerow([file_name, error]) + else: + self.writer.writerow([file_name, file_size, created, modified]) + + self.meta["total_files"] += 1 + + def finish(self): + """Finalize metadata, write it to the file, and close the file.""" + if not self.writer: + raise RuntimeError("Writer not initialized.") + self.end_time = time.time() + self.meta["duration"] = self.end_time - self.start_time + + self.file.write("\n# Execution Metadata\n") + for key, value in self.meta.items(): + self.file.write(f"# {key}: {value}\n") + + self.file.close() + + def get_meta(self): + """Return the meta-metadata dictionary.""" + return self.meta + + +def directory_index(directory): for root, dirs, files in os.walk(directory): for name in files: filepath = os.path.join(root, name) try: - metadata = { - "path": filepath, - "size": os.path.getsize(filepath), - "modified": time.ctime(os.path.getmtime(filepath)), - "created": time.ctime(os.path.getctime(filepath)) - } - files_metadata.append(metadata) + stat_result = os.stat(filepath, follow_symlinks=False) except (FileNotFoundError, PermissionError) as e: - print(f"Skipping {filepath}: {e}") - - # Record the end time and calculate the duration - end_time = time.time() - duration = end_time - start_time - - # Prepare the output data with additional 
metadata - output_data = { - "index_timestamp": index_timestamp, - "duration_seconds": duration, - "files": files_metadata - } - - # Write the output data to a .json file - with open(output_file, "w", encoding="utf-8") as json_file: - json.dump(output_data, json_file, indent=4) - - print(f"Indexing completed. Compressed results written to {output_file}") + size = modified = created = None + error = str(e) + else: + size = stat_result.st_size + modified = time.ctime(stat_result.st_mtime) + created = time.ctime(stat_result.st_ctime) + error = None + yield filepath, size, modified, created, error # Ensure the script is called with the required arguments if __name__ == "__main__": if len(sys.argv) != 3: - print("Usage: python script.py ") + print("Usage: python script.py ") sys.exit(1) - directory_to_index = sys.argv[1] - output_json_file = sys.argv[2] - - # Ensure the output filename ends with .json for clarity - if not output_json_file.endswith(".json"): - output_json_file += ".json" + directory = sys.argv[1] + output_file = sys.argv[2] - list_files_with_metadata(directory_to_index, output_json_file) + # Ensure the output filename ends with .tsv for clarity + if not output_file.endswith(".tsv"): + output_file += ".tsv" + file_index = MetadataWriter(output_file) + file_index.start() + for filename, size, created, modified, error in directory_index(directory): + file_index.write_row(filename, size, created, modified, error) + file_index.finish() From 3c0e7f757f1c67e5ac1776cf016ae3fbf6f71f8c Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sat, 14 Dec 2024 22:48:43 -0600 Subject: [PATCH 82/96] Comment out column headers in output --- .github/scripts/create-file-index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index 065ffc20..354b3f44 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -26,7 +26,7 @@ def start(self): self.start_time = time.time() self.file = self.output_path.open(mode="w", newline="", encoding="utf-8") self.writer = csv.writer(self.file, delimiter="\t") - self.writer.writerow(["file_name", "file_size", "file_type", "custom_metadata"]) + self.writer.writerow(["#file_name", "file_size", "file_type", "custom_metadata"]) def write_row(self, file_name, file_size, created, modified, error): """Write data for a file.""" From 260c69d4166d4541d3c5dae0afebca343e8cbbee Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sat, 14 Dec 2024 22:49:03 -0600 Subject: [PATCH 83/96] Write all fields for every file Whether there is an error or not, write all fields for each file. This allows us to read the tsv file and unpack the same number of values for each line. 
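
For example, a consumer can now unpack every data row into the same five
fields (an illustrative sketch; the index filename is hypothetical):

    import csv

    with open("user-index.tsv", newline="", encoding="utf-8") as f:
        for row in csv.reader(f, delimiter="\t"):
            # The header and trailing execution-metadata lines start with '#'
            if not row or row[0].startswith("#"):
                continue
            file_name, file_size, created, modified, status = row
            if status != "OK":
                continue  # stat() failed; the size and time fields hold "-"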
--- .github/scripts/create-file-index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index 354b3f44..cfacffd8 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -33,9 +33,9 @@ def write_row(self, file_name, file_size, created, modified, error): if not self.writer: raise RuntimeError("Writer not initialized.") if error is not None: - self.writer.writerow([file_name, error]) + self.writer.writerow([file_name, "-", "-", "-", error]) else: - self.writer.writerow([file_name, file_size, created, modified]) + self.writer.writerow([file_name, file_size, created, modified, "OK"]) self.meta["total_files"] += 1 From 87dd8ca219ab5cbe82c32f3ab53bdcfdb44dcd21 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sat, 14 Dec 2024 22:51:08 -0600 Subject: [PATCH 84/96] Convert to reading tsv --- .github/scripts/calculate-directory-stats.py | 33 ++++++++++++++------ 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index e56815b8..0fbbd0ec 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 import os -import gzip +import csv import json import sys import unittest from collections import defaultdict +from pathlib import Path +from typing import Iterable def propagate_dir(stats, current_parent, previous_parent): @@ -27,13 +29,14 @@ def propagate_dir(stats, current_parent, previous_parent): stats[highest_common]["file_count"] += stats[previous_parent]["file_count"] -def generate_directory_statistics(data): +def generate_directory_statistics(data: Iterable[str]): # Assumes dirs are listed depth first (files are listed prior to directories) stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) previous_parent = "" - for file_metadata in data["files"]: - this_parent = os.path.dirname(file_metadata["path"]) + for filepath, size, modified, created, error in data: + # TODO if error is not None: + this_parent = os.path.dirname(filepath) stats[this_parent]["file_count"] += 1 if previous_parent == this_parent: @@ -54,16 +57,29 @@ def generate_directory_statistics(data): return stats +def iter_file_metadata(file_path): + """ + Reads a tsv and returns an iterable that yields one row of file metadata at + a time, excluding comments. 
+ """ + file_path = Path(file_path) + with file_path.open(mode="r", newline="", encoding="utf-8") as file: + reader = csv.reader(file, delimiter="\t") + for row in reader: + # Skip empty lines or lines starting with '#' + if not row or row[0].startswith("#"): + continue + yield row + def main(): if len(sys.argv) != 2: print("Usage: python script.py ") sys.exit(1) - input_json_file = sys.argv[1] - username = input_json_file.split(".")[0] - with open(input_json_file, "r", encoding="utf-8") as json_file: - data = json.load(json_file) + input_tsv_file = sys.argv[1] + username = input_tsv_file.split(".")[0] + data = iter_file_metadata(input_tsv_file) stats = generate_directory_statistics(data) for directory, stat in stats.items(): print(f"{directory}: {stat['file_count']}") @@ -100,7 +116,6 @@ def test_propagate_dir_files_in_all(self): self.assertEqual(stats["a"]["file_count"], 6) self.assertEqual(stats["a/b"]["file_count"], 5) - def test_generate_directory_statistics(self): sample_data = { "files": [ From e0e0a32b05c407ccb177e37d50ccfbad7d365364 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sat, 14 Dec 2024 23:01:34 -0600 Subject: [PATCH 85/96] Fixup: update test to match tsv-read data --- .github/scripts/calculate-directory-stats.py | 22 +++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index 0fbbd0ec..14dff676 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -117,18 +117,16 @@ def test_propagate_dir_files_in_all(self): self.assertEqual(stats["a/b"]["file_count"], 5) def test_generate_directory_statistics(self): - sample_data = { - "files": [ - {"path": "a/b/file3.txt"}, - {"path": "a/b/c/file1.txt"}, - {"path": "a/b/c/file2.txt"}, - {"path": "a/b/c/d/file4.txt"}, - {"path": "a/e/file3.txt"}, - {"path": "a/e/f/file1.txt"}, - {"path": "a/e/f/file2.txt"}, - {"path": "a/e/f/g/file4.txt"}, - ] - } + sample_data = [ + ("a/b/file3.txt", 3456, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/file1.txt", 1234, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/file2.txt", 2345, "2024-12-01", "2024-12-02", "OK"), + ("a/b/c/d/file4.txt", 4567, "2024-12-01", "2024-12-02", "OK"), + ("a/e/file3.txt", 5678, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/file1.txt", 6789, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/file2.txt", 7890, "2024-12-01", "2024-12-02", "OK"), + ("a/e/f/g/file4.txt", 8901, "2024-12-01", "2024-12-02", "OK"), + ] stats = generate_directory_statistics(sample_data) self.assertEqual(stats["a/b/c/d"]["file_count"], 1) self.assertEqual(stats["a/b/c"]["file_count"], 3) From 41aaa2aaa3d390fc16869bca98448ae37e3d2050 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sun, 15 Dec 2024 11:04:47 -0600 Subject: [PATCH 86/96] update for renamed script --- .github/scripts/launch-ec2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 1c3cd615..99e90755 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -96,7 +96,7 @@ echo "export PUBLIC_IP=$PUBLIC_IP" >> $ENV_FILE # Upload scripts to EC2 instance echo "Uploading scripts to EC2 instance..." 
scp -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" \ - $LOCAL_SCRIPTS_DIR/produce-report.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ + $LOCAL_SCRIPTS_DIR/calculate-directory-stats.py $LOCAL_SCRIPTS_DIR/create-file-index.py \ ec2-user@"$PUBLIC_IP":"$REMOTE_SCRIPTS_DIR/" if [ $? -eq 0 ]; then From 25e27ebb1040073fbc3889925c88546d4ec4d504 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sun, 15 Dec 2024 11:22:00 -0600 Subject: [PATCH 87/96] install pip --- .github/scripts/launch-ec2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 99e90755..9bd7e51b 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -119,7 +119,7 @@ fi # Mount EFS on the EC2 instance echo "Mounting EFS on the EC2 instance..." ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ - "sudo yum install -y amazon-efs-utils && \ + "sudo yum install -y amazon-efs-utils pip && \ sudo mkdir -p $MOUNT_POINT && \ sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \ echo '$EFS_ID:/ $MOUNT_POINT efs defaults,_netdev 0 0' | sudo tee -a /etc/fstab && \ From 204b70ec731bacc03daaf71ab3cb658c98404737 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sun, 15 Dec 2024 11:22:22 -0600 Subject: [PATCH 88/96] install parallel --- .github/scripts/launch-ec2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index 9bd7e51b..dc70e4d2 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -119,7 +119,7 @@ fi # Mount EFS on the EC2 instance echo "Mounting EFS on the EC2 instance..." ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ - "sudo yum install -y amazon-efs-utils pip && \ + "sudo yum install -y amazon-efs-utils pip parallel && \ sudo mkdir -p $MOUNT_POINT && \ sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \ echo '$EFS_ID:/ $MOUNT_POINT efs defaults,_netdev 0 0' | sudo tee -a /etc/fstab && \ From 64d653e23cfb34c52b5db161efa7243d150bb181 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sun, 15 Dec 2024 12:06:11 -0600 Subject: [PATCH 89/96] install dependencies in launch script --- .github/scripts/launch-ec2.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/scripts/launch-ec2.sh b/.github/scripts/launch-ec2.sh index dc70e4d2..0c8ab676 100755 --- a/.github/scripts/launch-ec2.sh +++ b/.github/scripts/launch-ec2.sh @@ -116,11 +116,15 @@ fi # --port 2049 \ # --source-group $SECURITY_GROUP_ID +echo "Installing dependencies ..." +ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ + "sudo yum install -y amazon-efs-utils pip parallel && \ + pip install con-duct" + # Mount EFS on the EC2 instance echo "Mounting EFS on the EC2 instance..." 
ssh -i "$EC2_SSH_KEY" -o "StrictHostKeyChecking=no" ec2-user@"$PUBLIC_IP" \ - "sudo yum install -y amazon-efs-utils pip parallel && \ - sudo mkdir -p $MOUNT_POINT && \ + "sudo mkdir -p $MOUNT_POINT && \ sudo mount -t efs $EFS_ID:/ $MOUNT_POINT && \ echo '$EFS_ID:/ $MOUNT_POINT efs defaults,_netdev 0 0' | sudo tee -a /etc/fstab && \ echo 'EFS mounted at $MOUNT_POINT'" From 6475f11d740d766540b52821c0967be8b8ba0538 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sun, 15 Dec 2024 12:30:09 -0600 Subject: [PATCH 90/96] Output to tmp, accept only 1 arg, target dir --- .github/scripts/create-file-index.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index cfacffd8..f5345146 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -7,6 +7,7 @@ from datetime import datetime from pathlib import Path +OUTPUT_DIR = "/tmp/hub-user-indexes" class MetadataWriter: def __init__(self, output_path): @@ -75,16 +76,16 @@ def directory_index(directory): # Ensure the script is called with the required arguments if __name__ == "__main__": - if len(sys.argv) != 3: - print("Usage: python script.py ") + if len(sys.argv) != 2: + print("Usage: python script.py ") sys.exit(1) directory = sys.argv[1] - output_file = sys.argv[2] # Ensure the output filename ends with .tsv for clarity - if not output_file.endswith(".tsv"): - output_file += ".tsv" + + os.makedirs(OUTPUT_DIR) + output_file = f"{OUTPUT_DIR}/{directory}-index.tsv" file_index = MetadataWriter(output_file) file_index.start() for filename, size, created, modified, error in directory_index(directory): From b67c0638a3cb12823f62b30f1254c289b847c32f Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sun, 15 Dec 2024 18:10:23 -0600 Subject: [PATCH 91/96] add up sizes --- .github/scripts/calculate-directory-stats.py | 20 ++++++++++++++++---- .github/scripts/create-file-index.py | 2 ++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py index 14dff676..3f96a3d6 100755 --- a/.github/scripts/calculate-directory-stats.py +++ b/.github/scripts/calculate-directory-stats.py @@ -24,9 +24,11 @@ def propagate_dir(stats, current_parent, previous_parent): while nested_dir_list: working_dir = os.path.join(highest_common, *nested_dir_list) stats[working_dir]["file_count"] += stats[previous_parent]["file_count"] + stats[working_dir]["total_size"] += stats[previous_parent]["total_size"] nested_dir_list.pop() previous_parent = working_dir stats[highest_common]["file_count"] += stats[previous_parent]["file_count"] + stats[highest_common]["total_size"] += stats[previous_parent]["total_size"] def generate_directory_statistics(data: Iterable[str]): @@ -38,6 +40,7 @@ def generate_directory_statistics(data: Iterable[str]): # TODO if error is not None: this_parent = os.path.dirname(filepath) stats[this_parent]["file_count"] += 1 + stats[this_parent]["total_size"] += int(size) if previous_parent == this_parent: continue @@ -82,19 +85,28 @@ def main(): data = iter_file_metadata(input_tsv_file) stats = generate_directory_statistics(data) for directory, stat in stats.items(): - print(f"{directory}: {stat['file_count']}") + if stat['total_size'] > 10000000: + print(f"{directory}: File count: {stat['file_count']}, Total Size: {stat['total_size']}") + data2 = iter_file_metadata(input_tsv_file) + sanity_size = 0 + for filepath, size, modified, 
created, error in data2: + sanity_size += int(size) + print(f"SANITY SIZE {sanity_size}") + + class TestDirectoryStatistics(unittest.TestCase): def test_propagate_dir(self): stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) - stats["a/b/c"] = {"total_size": 0, "file_count": 3} - stats["a/b"] = {"total_size": 0, "file_count": 0} - stats["a"] = {"total_size": 0, "file_count": 0} + stats["a/b/c"] = {"total_size": 100, "file_count": 3} + stats["a/b"] = {"total_size": 10, "file_count": 0} + stats["a"] = {"total_size": 1, "file_count": 0} propagate_dir(stats, "a", "a/b/c") self.assertEqual(stats["a"]["file_count"], 3) self.assertEqual(stats["a/b"]["file_count"], 3) + self.assertEqual(stats["a"]["total_size"], 111) def test_propagate_dir_abs_path(self): stats = defaultdict(lambda: {"total_size": 0, "file_count": 0}) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index f5345146..97f3804b 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -18,6 +18,7 @@ def __init__(self, output_path): "index_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "duration": None, "total_files": 0, + "total_size": 0, } self.file = None self.writer = None @@ -39,6 +40,7 @@ def write_row(self, file_name, file_size, created, modified, error): self.writer.writerow([file_name, file_size, created, modified, "OK"]) self.meta["total_files"] += 1 + self.meta["total_size"] += file_size def finish(self): """Finalize metadata, write it to the file, and close the file.""" From 32414732d5418704a6bb96c036760001c412829a Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sun, 15 Dec 2024 18:10:50 -0600 Subject: [PATCH 92/96] print useful info as index is created --- .github/scripts/create-file-index.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index 97f3804b..c8ffc7f0 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -54,6 +54,7 @@ def finish(self): self.file.write(f"# {key}: {value}\n") self.file.close() + print(f"Directory {self.output_path} complete, Duration: {self.meta['duration']:.2f}, Total Files: {self.meta['total_files']}, Total Size: {self.meta['total_size']}") def get_meta(self): """Return the meta-metadata dictionary.""" From f4eb1018561ea8406ccc05381a43787d26e2a511 Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Sun, 15 Dec 2024 18:11:36 -0600 Subject: [PATCH 93/96] dont fail if output dir exists --- .github/scripts/create-file-index.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/scripts/create-file-index.py b/.github/scripts/create-file-index.py index c8ffc7f0..5f8cb661 100755 --- a/.github/scripts/create-file-index.py +++ b/.github/scripts/create-file-index.py @@ -85,12 +85,13 @@ def directory_index(directory): directory = sys.argv[1] - # Ensure the output filename ends with .tsv for clarity - - os.makedirs(OUTPUT_DIR) + os.makedirs(OUTPUT_DIR, exist_ok=True) output_file = f"{OUTPUT_DIR}/{directory}-index.tsv" + file_index = MetadataWriter(output_file) file_index.start() + for filename, size, created, modified, error in directory_index(directory): file_index.write_row(filename, size, created, modified, error) + file_index.finish() From 13e0e75761bff01f08c235a8b33961ed75d6b1ac Mon Sep 17 00:00:00 2001 From: Austin Macdonald Date: Mon, 16 Dec 2024 10:41:42 -0600 Subject: [PATCH 94/96] Create a report dict with only relevant stats --- 
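The rollup these patches build up is easier to see against a naive reference formulation: charge every ancestor directory for each file, rather than propagating counts depth-first the way propagate_dir() does. The sketch below is an illustration of the intended result, not the code the patches ship; the figures match the test data used earlier in the series.

import os
from collections import defaultdict

files = [
    ("a/b/file3.txt", 3456),
    ("a/b/c/file1.txt", 1234),
    ("a/b/c/file2.txt", 2345),
    ("a/b/c/d/file4.txt", 4567),
]

stats = defaultdict(lambda: {"total_size": 0, "file_count": 0})
for path, size in files:
    parent = os.path.dirname(path)
    # Credit every ancestor directory, not just the immediate parent.
    while parent:
        stats[parent]["file_count"] += 1
        stats[parent]["total_size"] += size
        parent = os.path.dirname(parent)

assert stats["a/b/c"]["file_count"] == 3
assert stats["a"]["total_size"] == 3456 + 1234 + 2345 + 4567

This walks the ancestors of every file, so it does O(depth) work per file; the script's depth-first propagation instead relies on files being listed before the directories that contain them, which is the assumption its tests exercise.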
From 13e0e75761bff01f08c235a8b33961ed75d6b1ac Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Mon, 16 Dec 2024 10:41:42 -0600
Subject: [PATCH 94/96] Create a report dict with only relevant stats

---
 .github/scripts/calculate-directory-stats.py | 42 +++++++++++++++-----
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py
index 3f96a3d6..668e2324 100755
--- a/.github/scripts/calculate-directory-stats.py
+++ b/.github/scripts/calculate-directory-stats.py
@@ -7,6 +7,7 @@
 import unittest
 from collections import defaultdict
 from pathlib import Path
+from pprint import pprint
 from typing import Iterable
 
 
 def propagate_dir(stats, current_parent, previous_parent):
@@ -74,26 +75,47 @@ def iter_file_metadata(file_path):
             continue
         yield row
 
+def update_stats(stats, directory, stat):
+    stats["total_size"] += stat["total_size"]
+    stats["file_count"] += stat["file_count"]
+
+    # Caches track directories, but not report as a whole
+    if stats.get("directories") is not None:
+        stats["directories"].append(directory)
+
 def main():
     if len(sys.argv) != 2:
         print("Usage: python script.py ")
         sys.exit(1)
 
     input_tsv_file = sys.argv[1]
-    username = input_tsv_file.split(".")[0]
+    username = input_tsv_file.split("-index.tsv")[0]
     data = iter_file_metadata(input_tsv_file)
     stats = generate_directory_statistics(data)
+    cache_types = ["pycache", "user_cache", "yarn_cache", "pip_cache", "nwb_cache"]
+    report_stats = {
+        "total_size": 0,
+        "file_count": 0,
+        "caches": {
+            cache_type: {"total_size": 0, "file_count": 0, "directories": []}
+            for cache_type in cache_types
+        }
+    }
+    # print(f"{directory}: File count: {stat['file_count']}, Total Size: {stat['total_size']}")
     for directory, stat in stats.items():
-        if stat['total_size'] > 10000000:
-            print(f"{directory}: File count: {stat['file_count']}, Total Size: {stat['total_size']}")
-    data2 = iter_file_metadata(input_tsv_file)
-    sanity_size = 0
-    for filepath, size, modified, created, error in data2:
-        sanity_size += int(size)
-    print(f"SANITY SIZE {sanity_size}")
-
-
+        if directory.endswith("__pycache__"):
+            update_stats(report_stats["caches"]["pycache"], directory, stat)
+        elif directory.endswith(f"{username}/.cache"):
+            update_stats(report_stats["caches"]["user_cache"], directory, stat)
+        elif directory.endswith(".cache/yarn"):
+            update_stats(report_stats["caches"]["yarn_cache"], directory, stat)
+        elif directory.endswith(".cache/pip"):
+            update_stats(report_stats["caches"]["pip_cache"], directory, stat)
+        elif directory == username:
+            update_stats(report_stats, username, stat)
+
+    pprint(report_stats)
 
 class TestDirectoryStatistics(unittest.TestCase):

From a7e6991c15e328f1ec5b706b7e4350b34a9543ee Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Fri, 20 Dec 2024 13:28:06 -0600
Subject: [PATCH 95/96] output data reports

---
 .github/scripts/calculate-directory-stats.py | 17 +++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/calculate-directory-stats.py b/.github/scripts/calculate-directory-stats.py
index 668e2324..c523123a 100755
--- a/.github/scripts/calculate-directory-stats.py
+++ b/.github/scripts/calculate-directory-stats.py
@@ -102,6 +102,7 @@ def main():
             for cache_type in cache_types
         }
     }
+
     # print(f"{directory}: File count: {stat['file_count']}, Total Size: {stat['total_size']}")
     for directory, stat in stats.items():
         if directory.endswith("__pycache__"):
@@ -115,7 +116,14 @@ def main():
         elif directory == username:
             update_stats(report_stats, username, stat)
 
-    pprint(report_stats)
+    OUTPUT_DIR = "/home/austin/hub-user-reports/"
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    with open(f"{OUTPUT_DIR}{username}-report.json", "w") as out:
+        json.dump(report_stats, out)
+
+
+    sorted_dirs = sorted(stats.items(), key=lambda x: x[1]['total_size'], reverse=True)
+    print(f"Finished {username} with Total {report_stats['total_size']}")
 
 
 class TestDirectoryStatistics(unittest.TestCase):
@@ -177,4 +185,9 @@ def test_generate_directory_statistics(self):
         argv=sys.argv[:1]
     )  # Run tests if "test" is provided as an argument
 else:
-    main()
+    try:
+        main()
+    except Exception as e:
+        # print(f"FAILED ------------------------------ {sys.argv[1]}")
+        # raise(e)
+        pass

From 845df003b5fe5c7988adb8724a756532622d7d25 Mon Sep 17 00:00:00 2001
From: Austin Macdonald
Date: Fri, 17 Jan 2025 12:17:56 -0600
Subject: [PATCH 96/96] Remove unused

---
 .github/manifests/disk-usage-report-job.yaml |  35 ----
 .github/manifests/hello-world-pod.yaml       |  20 --
 .github/workflows/report.yaml                | 183 -------------------
 3 files changed, 238 deletions(-)
 delete mode 100644 .github/manifests/disk-usage-report-job.yaml
 delete mode 100644 .github/manifests/hello-world-pod.yaml
 delete mode 100644 .github/workflows/report.yaml

diff --git a/.github/manifests/disk-usage-report-job.yaml b/.github/manifests/disk-usage-report-job.yaml
deleted file mode 100644
index 161f1778..00000000
--- a/.github/manifests/disk-usage-report-job.yaml
+++ /dev/null
@@ -1,35 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: disk-usage-report-job
-  namespace: jupyterhub
-spec:
-  backoffLimit: 0  # No retry on failure
-  template:
-    metadata:
-      labels:
-        app: disk-usage-report
-    spec:
-      containers:
-      - name: disk-usage-report
-        image: dandiarchive/dandihub-report-generator:latest
-        args:
-        - "/home/"
-        volumeMounts:
-        - name: persistent-storage
-          mountPath: "/home"
-          subPath: "home"
-      restartPolicy: Never
-      nodeSelector:
-        NodeGroupType: default
-        NodePool: default
-        hub.jupyter.org/node-purpose: user
-      tolerations:
-      - key: "hub.jupyter.org/dedicated"
-        operator: "Equal"
-        value: "user"
-        effect: "NoSchedule"
-      volumes:
-      - name: persistent-storage
-        persistentVolumeClaim:
-          claimName: efs-persist
diff --git a/.github/manifests/hello-world-pod.yaml b/.github/manifests/hello-world-pod.yaml
deleted file mode 100644
index 1977f336..00000000
--- a/.github/manifests/hello-world-pod.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-# manifests/hello-world-pod.yaml
-apiVersion: v1
-kind: Pod
-metadata:
-  name: hello-world-pod
-spec:
-  containers:
-  - name: hello
-    image: busybox
-    command: ['sh', '-c', 'echo Hello, World! && sleep 30']
-  nodeSelector:
-    NodeGroupType: default
-    NodePool: default
-    hub.jupyter.org/node-purpose: user
-  tolerations:
-  - key: "hub.jupyter.org/dedicated"
-    operator: "Equal"
-    value: "user"
-    effect: "NoSchedule"
-
diff --git a/.github/workflows/report.yaml b/.github/workflows/report.yaml
deleted file mode 100644
index 84e1356d..00000000
--- a/.github/workflows/report.yaml
+++ /dev/null
@@ -1,183 +0,0 @@
-# ---
-# name: Generate Data Usage Report
-#
-# on:
-#   pull_request:
-#     branches:
-#       - main
-#
-#
-# jobs:
-#   generate-jobs-usage-report:
-#     runs-on: ubuntu-latest
-#
-#     steps:
-#       - name: Configure AWS Credentials
-#         uses: aws-actions/configure-aws-credentials@v3
-#         with:
-#           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-#           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-#           aws-region: us-east-2
-#
-#       - name: Launch EC2 Instance
-#         id: launch_ec2
-#         run: |
-#           INSTANCE_ID=$(aws ec2 run-instances \
-#             --image-id ami-088d38b423bff245f \
-#             --count 1 \
-#             --instance-type t3.micro \
-#             --key-name dandihub-gh-actions \
-#             --security-group-ids sg-0bf2dc1c2ff9c122e \
-#             --subnet-id subnet-0f544cca61ccd2804 \
-#             --tag-specifications "ResourceType=instance,Tags=[{Key=Name,Value=dandihub-gh-actions}]" \
-#             --query 'Instances[0].InstanceId' --output text)
-#
-#           echo "INSTANCE_ID=${INSTANCE_ID}" >> $GITHUB_ENV
-#
-#       - name: Wait for EC2 to Initialize
-#         run: |
-#           aws ec2 wait instance-status-ok --instance-ids ${{ env.INSTANCE_ID }}
-#
-#       - name: Allocate Elastic IP
-#         id: allocate_eip
-#         run: |
-#           ALLOC_ID=$(aws ec2 allocate-address --query 'AllocationId' --output text)
-#           echo "ALLOC_ID=${ALLOC_ID}" >> $GITHUB_ENV
-#
-#       - name: Associate Elastic IP with EC2 Instance
-#         id: associate_eip
-#         run: |
-#           EIP=$(aws ec2 associate-address \
-#             --instance-id ${{ env.INSTANCE_ID }} \
-#             --allocation-id ${{ env.ALLOC_ID }} \
-#             --query 'AssociationId' --output text)
-#           echo "EIP=${EIP}" >> $GITHUB_ENV
-#
-#       - name: Retrieve Elastic IP Address
-#         id: get_ip
-#         run: |
-#           PUBLIC_IP=$(aws ec2 describe-addresses \
-#             --allocation-ids ${{ env.ALLOC_ID }} \
-#             --query 'Addresses[0].PublicIp' --output text)
-#           echo "PUBLIC_IP=${PUBLIC_IP}"
-#           echo "PUBLIC_IP=${PUBLIC_IP}" >> $GITHUB_ENV
-#
-#       - name: Execute df Command on EC2
-#         uses: appleboy/ssh-action@v0.1.6
-#         with:
-#           host: ${{ env.PUBLIC_IP }}
-#           username: ec2-user
-#           key: ${{ secrets.EC2_SSH_KEY }}
-#           script: |
-#             echo "Running df command on EC2 instance..."
-#             df -h
-#             echo "Command completed."
-#         continue-on-error: true  # Allow the workflow to continue even if this step fails
-#
-#
-#       - name: Terminate EC2 Instance
-#         run: |
-#           aws ec2 terminate-instances --instance-ids ${{ env.INSTANCE_ID }}
-#           aws ec2 wait instance-terminated --instance-ids ${{ env.INSTANCE_ID }}
-#         continue-on-error: true  # Allow the workflow to continue even if this step fails
-#
-#       - name: Release Elastic IP
-#         run: |
-#           aws ec2 release-address --allocation-id ${{ env.ALLOC_ID }}
-#         continue-on-error: true  # Allow the workflow to continue even if this step fails
-#
-# # jobs:
-# #   generate_data_usage_report:
-# #     runs-on: ubuntu-latest
-# #
-# #     steps:
-# #
-# #       - name: Log in to DockerHub
-# #         uses: docker/login-action@v2
-# #         with:
-# #           username: ${{ secrets.DOCKERHUB_USERNAME }}
-# #           password: ${{ secrets.DOCKERHUB_TOKEN }}
-# #
-# #       - name: Build and push Docker image
-# #         uses: docker/build-push-action@v3
-# #         with:
-# #           context: .
-# #           file: images/Dockerfile.dandihub_report_generator
-# #           push: true
-# #           tags: ${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest
-# #
-# #       - name: Configure AWS credentials
-# #         uses: aws-actions/configure-aws-credentials@v3
-# #         with:
-# #           aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-# #           aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-# #           aws-region: us-east-2
-# #
-# #       - name: Assume ProvisioningRole
-# #         run: |
-# #           CREDS=$(aws sts assume-role --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }} --role-session-name "GitHubActionsSession")
-# #           export AWS_ACCESS_KEY_ID=$(echo $CREDS | jq -r '.Credentials.AccessKeyId')
-# #           export AWS_SECRET_ACCESS_KEY=$(echo $CREDS | jq -r '.Credentials.SecretAccessKey')
-# #           export AWS_SESSION_TOKEN=$(echo $CREDS | jq -r '.Credentials.SessionToken')
-# #
-# #       - name: Configure kubectl with AWS EKS
-# #         run: |
-# #           aws eks update-kubeconfig --name eks-dandihub --region us-east-2 --role-arn ${{ secrets.AWS_PROVISIONING_ROLE_ARN }}
-# #
-# #       # TODO remove
-# #       - name: Sanity check
-# #         run: |
-# #           kubectl get pods -n jupyterhub
-# #
-# #       # - name: Deploy Hello World Pod
-# #       #   run: |
-# #       #     kubectl apply -f .github/manifests/hello-world-pod.yaml
-# #       #
-# #       # - name: Wait for Hello World Pod to complete
-# #       #   run: |
-# #       #     kubectl wait --for=condition=Ready pod/hello-world-pod --timeout=300s  # 5 minutes
-# #       #   continue-on-error: true  # Allow the workflow to continue even if this step fails
-# #       #
-# #       # - name: Get Hello World Pod logs
-# #       #   run: |
-# #       #     kubectl logs hello-world-pod
-# #       #   if: ${{ success() }}  # Only run this step if the previous step was successful
-# #       #
-# #       # - name: Delete Hello World Pod
-# #       #   run: |
-# #       #     kubectl delete pod hello-world-pod
-# #       #   if: ${{ always() }}  # Always run this step, even if other steps fail
-# #       #
-# #       - name: Replace image placeholder in manifest
-# #         run: |
-# #           sed -i 's|IMAGE_PLACEHOLDER|'"${{ secrets.DOCKERHUB_USERNAME }}/dandihub-report-generator:latest"'|' .github/manifests/disk-usage-report-job.yaml
-# #
-# #       - name: Deploy Disk Usage Report Job
-# #         run: |
-# #           kubectl apply -f .github/manifests/disk-usage-report-job.yaml
-# #
-# #       # TODO should timeout be longer?
-# #       - name: Wait for Disk Usage Report Job to complete
-# #         run: |
-# #           kubectl wait --for=condition=complete job/disk-usage-report-job --timeout=360s -n jupyterhub
-# #         continue-on-error: true
-# #
-# #       # continue-on-error for previous steps so we delete the job
-# #       - name: Delete Disk Usage Report Job
-# #         run: |
-# #           kubectl delete job disk-usage-report-job -n jupyterhub
-# #
-# #       # - name: Clone dandi-hub-usage-reports repository
-# #       #   run: |
-# #       #     git clone https://github.com/dandi/dandi-hub-usage-reports.git
-# #       #
-# #       # - name: Copy report file to repository, commit and push report
-# #       #   run: |
-# #       #     cd dandi-hub-usage-reports
-# #       #     DATE=$(date +'%Y-%m-%d')
-# #       #     mv ../du_report.json $DATE_du_report.json
-# #       #     git config --global user.name "GitHub Actions"
-# #       #     git config --global user.email "actions@github.com"
-# #       #     git add $DATE_du_report.json
-# #       #     git commit -m "Add disk usage report for $DATE"
-# #       #     git push https://${{ secrets.GITHUB_TOKEN }}@github.com/dandi/dandi-hub-usage-reports.git
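The series ends with the Kubernetes job and workflow removed, and with patch 95 leaving one {username}-report.json per user behind; nothing in these patches consumes those files yet. A hypothetical downstream reader that ranks users by cache footprint is sketched below. The report directory mirrors patch 95's OUTPUT_DIR and the report shape comes from patch 94; the ranking itself is an assumption about how the reports might be used.

import json
from pathlib import Path

REPORT_DIR = Path("/home/austin/hub-user-reports")  # OUTPUT_DIR from patch 95

rows = []
for report_path in sorted(REPORT_DIR.glob("*-report.json")):
    username = report_path.name.removesuffix("-report.json")
    report = json.loads(report_path.read_text())
    # Shape from patch 94: {"total_size", "file_count", "caches": {name: {...}}}
    cache_size = sum(c["total_size"] for c in report["caches"].values())
    rows.append((username, report["total_size"], cache_size))

# Biggest cache consumers first.
for username, total, cache in sorted(rows, key=lambda r: r[2], reverse=True):
    pct = 100 * cache / total if total else 0.0
    print(f"{username}\ttotal={total}\tcache={cache} ({pct:.1f}%)")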