From 40fe28823048a34ee25ac527b26c8d3715dd5d29 Mon Sep 17 00:00:00 2001 From: Christian Krakau-Louis Date: Fri, 14 Feb 2025 12:00:44 +0100 Subject: [PATCH] Added text detection in upload_to_s3 --- app/tasks/upload_to_s3.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/app/tasks/upload_to_s3.py b/app/tasks/upload_to_s3.py index 8f97f6e..435331b 100644 --- a/app/tasks/upload_to_s3.py +++ b/app/tasks/upload_to_s3.py @@ -4,9 +4,11 @@ import uuid import boto3 import shutil +import fitz # PyMuPDF for checking embedded text from app.config import settings from app.tasks.retry_config import BaseTaskWithRetry from app.tasks.process_with_textract import process_with_textract +from app.tasks.extract_metadata_with_gpt import extract_metadata_with_gpt # Import the shared Celery instance from app.celery_app import celery @@ -22,8 +24,9 @@ @celery.task(base=BaseTaskWithRetry) def upload_to_s3(original_local_file: str): """ - Uploads a file to S3 with a UUID-based filename and triggers Textract processing. - Instead of moving the file, this version copies the file locally. + Uploads a file to S3 with a UUID-based filename and triggers processing. + - If the PDF already contains embedded text, skip Textract and extract text locally. + - Otherwise, upload to S3 and process with Textract. """ bucket_name = settings.s3_bucket_name if not bucket_name: @@ -49,15 +52,35 @@ def upload_to_s3(original_local_file: str): # Copy the file instead of moving it shutil.copy(original_local_file, new_local_path) + # Check for embedded text + pdf_doc = fitz.open(new_local_path) + has_text = any(page.get_text() for page in pdf_doc) + pdf_doc.close() + + if has_text: + print(f"[INFO] PDF {original_local_file} contains embedded text. Skipping Textract.") + + # Extract text locally + extracted_text = "" + pdf_doc = fitz.open(new_local_path) + for page in pdf_doc: + extracted_text += page.get_text("text") + "\n" + pdf_doc.close() + + # Call metadata extraction directly + extract_metadata_with_gpt.delay(new_filename, extracted_text) + + return {"file": new_local_path, "status": "Text extracted locally"} + try: print(f"[INFO] Uploading {new_local_path} to s3://{bucket_name}/{new_filename}...") s3_client.upload_file(new_local_path, bucket_name, new_filename) print(f"[INFO] File uploaded successfully: {new_filename}") - # Trigger Textract processing using the new filename (S3 key) + # Trigger Textract processing if no embedded text was found process_with_textract.delay(new_filename) - return {"file": new_local_path, "s3_key": new_filename, "status": "Uploaded"} + return {"file": new_local_path, "s3_key": new_filename, "status": "Uploaded to S3 for OCR"} except Exception as e: print(f"[ERROR] Failed to upload {new_local_path} to S3: {e}")