From 40fe28823048a34ee25ac527b26c8d3715dd5d29 Mon Sep 17 00:00:00 2001
From: Christian Krakau-Louis <christianlouis@gmail.com>
Date: Fri, 14 Feb 2025 12:00:44 +0100
Subject: [PATCH] Added text detection in upload_to_s3

---
 app/tasks/upload_to_s3.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/app/tasks/upload_to_s3.py b/app/tasks/upload_to_s3.py
index 8f97f6e..435331b 100644
--- a/app/tasks/upload_to_s3.py
+++ b/app/tasks/upload_to_s3.py
@@ -4,9 +4,11 @@
 import uuid
 import boto3
 import shutil
+import fitz  # PyMuPDF for checking embedded text
 from app.config import settings
 from app.tasks.retry_config import BaseTaskWithRetry
 from app.tasks.process_with_textract import process_with_textract
+from app.tasks.extract_metadata_with_gpt import extract_metadata_with_gpt
 
 # Import the shared Celery instance
 from app.celery_app import celery
@@ -22,8 +24,9 @@
 @celery.task(base=BaseTaskWithRetry)
 def upload_to_s3(original_local_file: str):
     """
-    Uploads a file to S3 with a UUID-based filename and triggers Textract processing.
-    Instead of moving the file, this version copies the file locally.
+    Triggers text processing for a file; uploads it to S3 under a UUID-based filename only when OCR is needed.
+    - If the PDF already contains embedded text, skip Textract and extract text locally.
+    - Otherwise, upload to S3 and process with Textract.
     """
     bucket_name = settings.s3_bucket_name
     if not bucket_name:
@@ -49,15 +52,35 @@ def upload_to_s3(original_local_file: str):
     # Copy the file instead of moving it
     shutil.copy(original_local_file, new_local_path)
 
+    # Check for embedded text
+    pdf_doc = fitz.open(new_local_path)
+    has_text = any(page.get_text() for page in pdf_doc)
+    pdf_doc.close()
+
+    if has_text:
+        print(f"[INFO] PDF {original_local_file} contains embedded text. Skipping Textract.")
+
+        # Extract text locally
+        extracted_text = ""
+        pdf_doc = fitz.open(new_local_path)
+        for page in pdf_doc:
+            extracted_text += page.get_text("text") + "\n"
+        pdf_doc.close()
+
+        # Queue metadata extraction right away, bypassing the S3 upload and Textract
+        extract_metadata_with_gpt.delay(new_filename, extracted_text)
+
+        return {"file": new_local_path, "status": "Text extracted locally"}
+
     try:
         print(f"[INFO] Uploading {new_local_path} to s3://{bucket_name}/{new_filename}...")
         s3_client.upload_file(new_local_path, bucket_name, new_filename)
         print(f"[INFO] File uploaded successfully: {new_filename}")
 
-        # Trigger Textract processing using the new filename (S3 key)
+        # Trigger Textract processing if no embedded text was found
         process_with_textract.delay(new_filename)
 
-        return {"file": new_local_path, "s3_key": new_filename, "status": "Uploaded"}
+        return {"file": new_local_path, "s3_key": new_filename, "status": "Uploaded to S3 for OCR"}
 
     except Exception as e:
         print(f"[ERROR] Failed to upload {new_local_path} to S3: {e}")