Commit c0c5ca3
christianlouis committed Feb 14, 2025
2 parents 17fc8fa + 9156c62
Showing 4 changed files with 47 additions and 103 deletions.
3 changes: 3 additions & 0 deletions .env.demo
@@ -3,6 +3,8 @@ DATABASE_URL=sqlite:///./app/database.db
 REDIS_URL=redis://redis:6379/0
 WORKDIR=/workdir
 AWS_REGION="eu-central-1"
+AZURE_REGION="eastus"
+AZURE_ENDPOINT="https://<yourendpoint>.cognitiveservices.azure.com/"
 S3_BUCKET_NAME=<your_bucket_name>
 NEXTCLOUD_UPLOAD_URL=https://nextcloud.example.com/remote.php/dav/files/<USERNAME>
 NEXTCLOUD_FOLDER="<NEXTCLOUD_FOLDER_PATH>"
@@ -17,6 +19,7 @@ PAPERLESS_NGX_API_TOKEN=<PAPERLESS_API_TOKEN>
 DROPBOX_APP_KEY=<DROPBOX_APP_KEY>
 DROPBOX_APP_SECRET=<DROPBOX_APP_SECRET>
 DROPBOX_REFRESH_TOKEN=<DROPBOX_REFRESH_TOKEN>
+AZURE_AI_KEY=<AZURE_AI_KEY>
 
 # **User Credentials**
 ADMIN_USERNAME=admin
3 changes: 3 additions & 0 deletions app/config.py
@@ -24,6 +24,9 @@ class Settings(BaseSettings):
     nextcloud_folder: str
     paperless_ngx_api_token: str
     paperless_host: str
+    azure_ai_key: str
+    azure_region: str
+    azure_endpoint: str
 
     # IMAP 1
     imap1_host: Optional[str] = None
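Note: the three new fields mirror the AZURE_* entries added to .env.demo above; pydantic's BaseSettings maps environment variables to fields by case-insensitive name. A minimal sketch of that loading behavior — assuming the project's Settings class follows the standard BaseSettings pattern (the pydantic v1 import is shown; on pydantic v2, BaseSettings lives in the pydantic-settings package):

    # Minimal sketch, not the project's full config class.
    # Assumption: pydantic v1; with v2 use `from pydantic_settings import BaseSettings`.
    from pydantic import BaseSettings

    class Settings(BaseSettings):
        # Environment variables map to fields case-insensitively:
        # AZURE_AI_KEY -> azure_ai_key, AZURE_REGION -> azure_region, ...
        azure_ai_key: str
        azure_region: str
        azure_endpoint: str

        class Config:
            env_file = ".env"  # assumption: values may also come from a .env file

    settings = Settings()  # raises a validation error if any variable is missing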
143 changes: 40 additions & 103 deletions app/tasks/process_with_textract.py
@@ -1,8 +1,8 @@
-import time
 import os
-import boto3
-import fitz  # PyMuPDF
 import logging
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult
 
 from app.config import settings
 from app.tasks.retry_config import BaseTaskWithRetry
@@ -11,120 +11,57 @@
 
 logger = logging.getLogger(__name__)
 
-# Initialize AWS clients using settings.
-s3_client = boto3.client(
-    "s3",
-    aws_access_key_id=settings.aws_access_key_id,
-    aws_secret_access_key=settings.aws_secret_access_key,
-    region_name=settings.aws_region,
+# Initialize Azure Document Intelligence client
+document_intelligence_client = DocumentIntelligenceClient(
+    endpoint=settings.azure_endpoint,
+    credential=AzureKeyCredential(settings.azure_ai_key)
 )
-textract_client = boto3.client(
-    "textract",
-    aws_access_key_id=settings.aws_access_key_id,
-    aws_secret_access_key=settings.aws_secret_access_key,
-    region_name=settings.aws_region,
-)
-
-BUCKET_NAME = settings.s3_bucket_name
-
-def create_searchable_pdf(tmp_file_path, extracted_pages):
-    """
-    Opens the PDF at tmp_file_path, overlays invisible OCR text using the
-    Textract bounding box data (extracted_pages), and overwrites the same file.
-    extracted_pages: list of pages; each page is a list of (text, bbox) tuples.
-    """
-    pdf_doc = fitz.open(tmp_file_path)
-    try:
-        for page_num, page in enumerate(pdf_doc):
-            if page_num < len(extracted_pages):
-                for line, bbox in extracted_pages[page_num]:
-                    # Convert relative bbox to absolute coordinates.
-                    rect = fitz.Rect(
-                        bbox['Left'] * page.rect.width,
-                        bbox['Top'] * page.rect.height,
-                        (bbox['Left'] + bbox['Width']) * page.rect.width,
-                        (bbox['Top'] + bbox['Height']) * page.rect.height,
-                    )
-                    page.insert_text(
-                        rect.bl,             # starting at the bottom-left of the bbox
-                        line,
-                        fontsize=12,         # adjust as needed
-                        fontname="helv",     # Helvetica
-                        color=(1, 1, 1, 0),  # transparent
-                        render_mode=3        # invisible but searchable text
-                    )
-        # Overwrite the same file.
-        pdf_doc.save(tmp_file_path, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
-        logger.info(f"Overwritten tmp file with OCR overlay: {tmp_file_path}")
-    finally:
-        pdf_doc.close()
 
 @celery.task(base=BaseTaskWithRetry)
 def process_with_textract(s3_filename: str):
     """
-    Processes a PDF document using Textract and overlays invisible OCR text onto
-    the local temporary file (already stored under <workdir>/tmp).
+    Processes a PDF document using Azure Document Intelligence and overlays OCR text onto
+    the local temporary file (stored under <workdir>/tmp).
     Steps:
-      1. Start a Textract text detection job.
-      2. Poll until the job succeeds and organize the Textract Blocks into pages
-         (each page is a list of (text, bounding-box) tuples).
-      3. Use the local tmp file at <workdir>/tmp/<s3_filename> to add the OCR overlay.
-      4. Delete the S3 object.
-      5. Trigger downstream metadata extraction by calling extract_metadata_with_gpt.
+      1. Uploads the document for OCR using Azure Document Intelligence.
+      2. Retrieves the processed PDF with embedded text.
+      3. Saves the OCR-processed PDF locally in the same location as before.
+      4. Extracts the text content for metadata processing.
+      5. Triggers downstream metadata extraction by calling extract_metadata_with_gpt.
    """
    try:
-        logger.info(f"Starting Textract job for {s3_filename}")
-        response = textract_client.start_document_text_detection(
-            DocumentLocation={"S3Object": {"Bucket": BUCKET_NAME, "Name": s3_filename}}
-        )
-        job_id = response["JobId"]
-        logger.info(f"Textract job started, JobId: {job_id}")
-
-        # Process Textract Blocks into pages.
-        extracted_pages = []
-        current_page_lines = []
-        while True:
-            result = textract_client.get_document_text_detection(JobId=job_id)
-            status = result["JobStatus"]
-            if status == "SUCCEEDED":
-                logger.info("Textract job succeeded.")
-                for block in result["Blocks"]:
-                    if block["BlockType"] == "PAGE":
-                        if current_page_lines:
-                            extracted_pages.append(current_page_lines)
-                        current_page_lines = []
-                    elif block["BlockType"] == "LINE":
-                        bbox = block["Geometry"]["BoundingBox"]
-                        current_page_lines.append((block["Text"], bbox))
-                if current_page_lines:
-                    extracted_pages.append(current_page_lines)
-                break
-            elif status in ["FAILED", "PARTIAL_SUCCESS"]:
-                logger.error("Textract job failed.")
-                raise Exception("Textract job failed")
-            time.sleep(3)
-
         # Use the local tmp file located under the workdir configuration.
         tmp_file_path = os.path.join(settings.workdir, "tmp", s3_filename)
         if not os.path.exists(tmp_file_path):
-            raise Exception(f"Local file not found: {tmp_file_path}")
-        logger.info(f"Processing local file {tmp_file_path} with OCR overlay.")
+            raise FileNotFoundError(f"Local file not found: {tmp_file_path}")
 
-        # Overwrite the tmp file with the added OCR overlay.
-        create_searchable_pdf(tmp_file_path, extracted_pages)
+        logger.info(f"Processing {s3_filename} with Azure Document Intelligence OCR.")
 
+        # Open and send the document for processing
+        with open(tmp_file_path, "rb") as f:
+            poller = document_intelligence_client.begin_analyze_document(
+                "prebuilt-read", body=f, output=[AnalyzeOutputOption.PDF]
+            )
+        result: AnalyzeResult = poller.result()
+        operation_id = poller.details["operation_id"]
+
+        # Retrieve the processed searchable PDF
+        response = document_intelligence_client.get_analyze_result_pdf(
+            model_id=result.model_id, result_id=operation_id
+        )
+        searchable_pdf_path = tmp_file_path  # Overwrite the original PDF location
+        with open(searchable_pdf_path, "wb") as writer:
+            writer.writelines(response)
+        logger.info(f"Searchable PDF saved at: {searchable_pdf_path}")
 
-        # Delete the S3 object.
-        logger.info(f"Deleting {s3_filename} from S3")
-        s3_client.delete_object(Bucket=BUCKET_NAME, Key=s3_filename)
+        # Extract raw text content from the result
+        extracted_text = result.content if result.content else ""
+        logger.info(f"Extracted text for {s3_filename}: {len(extracted_text)} characters")
 
-        # Concatenate extracted text.
-        cleaned_text = " ".join([line for page in extracted_pages for line, _ in page])
-        # Trigger downstream metadata extraction.
-        extract_metadata_with_gpt.delay(s3_filename, cleaned_text)
+        # Trigger downstream metadata extraction
+        extract_metadata_with_gpt.delay(s3_filename, extracted_text)
 
-        return {"s3_file": s3_filename, "searchable_pdf": tmp_file_path, "cleaned_text": cleaned_text}
+        return {"s3_file": s3_filename, "searchable_pdf": searchable_pdf_path, "cleaned_text": extracted_text}
     except Exception as e:
-        logger.error(f"Error processing {s3_filename}: {e}")
+        logger.error(f"Error processing {s3_filename} with Azure Document Intelligence: {e}")
         raise
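For context, the replacement follows the azure-ai-documentintelligence flow for the prebuilt-read model with PDF output: a single begin_analyze_document call starts the long-running operation, the poller blocks until it completes, and get_analyze_result_pdf downloads the text-embedded PDF. A standalone sketch of that flow — endpoint, key, and file paths are placeholders, and AnalyzeOutputOption/get_analyze_result_pdf assume a recent azure-ai-documentintelligence release (they are not in the earliest betas):

    # Standalone sketch of the prebuilt-read -> searchable-PDF flow used above.
    # Placeholders: endpoint, key, and file paths.
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult

    client = DocumentIntelligenceClient(
        endpoint="https://<yourendpoint>.cognitiveservices.azure.com/",  # placeholder
        credential=AzureKeyCredential("<AZURE_AI_KEY>"),                 # placeholder
    )

    with open("scan.pdf", "rb") as f:  # placeholder input path
        poller = client.begin_analyze_document(
            "prebuilt-read", body=f, output=[AnalyzeOutputOption.PDF]
        )
    result: AnalyzeResult = poller.result()        # blocks until OCR completes
    operation_id = poller.details["operation_id"]  # identifies the stored result

    # Download the PDF with the OCR text layer embedded by the service.
    pdf_chunks = client.get_analyze_result_pdf(
        model_id=result.model_id, result_id=operation_id
    )
    with open("scan.searchable.pdf", "wb") as out:  # placeholder output path
        out.writelines(pdf_chunks)

    print(f"Extracted {len(result.content or '')} characters of text")

Compared with the Textract path, this removes the manual polling loop and the PyMuPDF overlay entirely: the poller handles waiting, and the service renders the searchable PDF server-side.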
1 change: 1 addition & 0 deletions requirements.txt
@@ -12,3 +12,4 @@ openai
 pymupdf
 requests
 dropbox
+azure-ai-documentintelligence
