Skip to content

Commit

Permalink
Added text detection in upload_to_s3
Browse files: browse the repository at this point in the history
  • Loading branch information
christianlouis committed Feb 14, 2025
1 parent e0365e5 commit 40fe288
Showing 1 changed file with 27 additions and 4 deletions.
31 changes: 27 additions & 4 deletions app/tasks/upload_to_s3.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -4,9 +4,11 @@
import uuid
import boto3
import shutil
import fitz # PyMuPDF for checking embedded text
from app.config import settings
from app.tasks.retry_config import BaseTaskWithRetry
from app.tasks.process_with_textract import process_with_textract
from app.tasks.extract_metadata_with_gpt import extract_metadata_with_gpt

# Import the shared Celery instance
from app.celery_app import celery
Expand All @@ -22,8 +24,9 @@
@celery.task(base=BaseTaskWithRetry)
def upload_to_s3(original_local_file: str):
"""
Uploads a file to S3 with a UUID-based filename and triggers Textract processing.
Instead of moving the file, this version copies the file locally.
Uploads a file to S3 with a UUID-based filename and triggers processing.
- If the PDF already contains embedded text, skip Textract and extract text locally.
- Otherwise, upload to S3 and process with Textract.
"""
bucket_name = settings.s3_bucket_name
if not bucket_name:
Expand All @@ -49,15 +52,35 @@ def upload_to_s3(original_local_file: str):
# Copy the file instead of moving it
shutil.copy(original_local_file, new_local_path)

# Check for embedded text
pdf_doc = fitz.open(new_local_path)
has_text = any(page.get_text() for page in pdf_doc)
pdf_doc.close()

if has_text:
print(f"[INFO] PDF {original_local_file} contains embedded text. Skipping Textract.")

# Extract text locally
extracted_text = ""
pdf_doc = fitz.open(new_local_path)
for page in pdf_doc:
extracted_text += page.get_text("text") + "\n"
pdf_doc.close()

# Call metadata extraction directly
extract_metadata_with_gpt.delay(new_filename, extracted_text)

return {"file": new_local_path, "status": "Text extracted locally"}

try:
print(f"[INFO] Uploading {new_local_path} to s3://{bucket_name}/{new_filename}...")
s3_client.upload_file(new_local_path, bucket_name, new_filename)
print(f"[INFO] File uploaded successfully: {new_filename}")

# Trigger Textract processing using the new filename (S3 key)
# Trigger Textract processing if no embedded text was found
process_with_textract.delay(new_filename)

return {"file": new_local_path, "s3_key": new_filename, "status": "Uploaded"}
return {"file": new_local_path, "s3_key": new_filename, "status": "Uploaded to S3 for OCR"}

except Exception as e:
print(f"[ERROR] Failed to upload {new_local_path} to S3: {e}")
Expand Down

0 comments on commit 40fe288

Please sign in to comment.