From 65beea89cf67b6739fcd414f52beccf6355e6b7f Mon Sep 17 00:00:00 2001 From: Christian Krakau-Louis Date: Fri, 14 Feb 2025 22:49:22 +0100 Subject: [PATCH 1/2] First try for Azure API --- .env.demo | 3 + app/config.py | 3 + app/tasks/process_with_textract.py | 143 ++++++++--------------------- requirements.txt | 2 + 4 files changed, 48 insertions(+), 103 deletions(-) diff --git a/.env.demo b/.env.demo index 5998410..ed072d7 100644 --- a/.env.demo +++ b/.env.demo @@ -3,6 +3,8 @@ DATABASE_URL=sqlite:///./app/database.db REDIS_URL=redis://redis:6379/0 WORKDIR=/workdir AWS_REGION="eu-central-1" +AZURE_REGION="eastus" +AZURE_ENDPOINT="https://.cognitiveservices.azure.com/" S3_BUCKET_NAME= NEXTCLOUD_UPLOAD_URL=https://nextcloud.example.com/remote.php/dav/files/ NEXTCLOUD_FOLDER="" @@ -17,6 +19,7 @@ PAPERLESS_NGX_API_TOKEN= DROPBOX_APP_KEY= DROPBOX_APP_SECRET= DROPBOX_REFRESH_TOKEN= +AZURE_AI_KEY= # **User Credentials** ADMIN_USERNAME=admin diff --git a/app/config.py b/app/config.py index d196b9b..146a757 100644 --- a/app/config.py +++ b/app/config.py @@ -24,6 +24,9 @@ class Settings(BaseSettings): nextcloud_folder: str paperless_ngx_api_token: str paperless_host: str + azure_ai_key: str + azure_region: str + azure_endpoint: str # IMAP 1 imap1_host: Optional[str] = None diff --git a/app/tasks/process_with_textract.py b/app/tasks/process_with_textract.py index 5e35376..ee100ee 100644 --- a/app/tasks/process_with_textract.py +++ b/app/tasks/process_with_textract.py @@ -1,8 +1,8 @@ -import time import os -import boto3 -import fitz # PyMuPDF import logging +from azure.core.credentials import AzureKeyCredential +from azure.ai.documentintelligence import DocumentIntelligenceClient +from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult from app.config import settings from app.tasks.retry_config import BaseTaskWithRetry @@ -11,120 +11,57 @@ logger = logging.getLogger(__name__) -# Initialize AWS clients using settings. 
-s3_client = boto3.client( - "s3", - aws_access_key_id=settings.aws_access_key_id, - aws_secret_access_key=settings.aws_secret_access_key, - region_name=settings.aws_region, +# Initialize Azure Document Intelligence client +document_intelligence_client = DocumentIntelligenceClient( + endpoint=settings.azure_endpoint, + credential=AzureKeyCredential(settings.azure_ai_key) ) -textract_client = boto3.client( - "textract", - aws_access_key_id=settings.aws_access_key_id, - aws_secret_access_key=settings.aws_secret_access_key, - region_name=settings.aws_region, -) - -BUCKET_NAME = settings.s3_bucket_name - -def create_searchable_pdf(tmp_file_path, extracted_pages): - """ - Opens the PDF at tmp_file_path, overlays invisible OCR text using the - Textract bounding box data (extracted_pages), and overwrites the same file. - - extracted_pages: list of pages; each page is a list of (text, bbox) tuples. - """ - pdf_doc = fitz.open(tmp_file_path) - try: - for page_num, page in enumerate(pdf_doc): - if page_num < len(extracted_pages): - for line, bbox in extracted_pages[page_num]: - # Convert relative bbox to absolute coordinates. - rect = fitz.Rect( - bbox['Left'] * page.rect.width, - bbox['Top'] * page.rect.height, - (bbox['Left'] + bbox['Width']) * page.rect.width, - (bbox['Top'] + bbox['Height']) * page.rect.height, - ) - page.insert_text( - rect.bl, # starting at the bottom-left of the bbox - line, - fontsize=12, # adjust as needed - fontname="helv", # Helvetica - color=(1, 1, 1, 0), # transparent - render_mode=3 # invisible but searchable text - ) - # Overwrite the same file. - pdf_doc.save(tmp_file_path, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP) - logger.info(f"Overwritten tmp file with OCR overlay: {tmp_file_path}") - finally: - pdf_doc.close() @celery.task(base=BaseTaskWithRetry) def process_with_textract(s3_filename: str): """ - Processes a PDF document using Textract and overlays invisible OCR text onto - the local temporary file (already stored under /tmp). 
+ Processes a PDF document using Azure Document Intelligence and overlays OCR text onto + the local temporary file (stored under /tmp). Steps: - 1. Start a Textract text detection job. - 2. Poll until the job succeeds and organize the Textract Blocks into pages - (each page is a list of (text, bounding-box) tuples). - 3. Use the local tmp file at /tmp/ to add the OCR overlay. - 4. Delete the S3 object. - 5. Trigger downstream metadata extraction by calling extract_metadata_with_gpt. + 1. Uploads the document for OCR using Azure Document Intelligence. + 2. Retrieves the processed PDF with embedded text. + 3. Saves the OCR-processed PDF locally in the same location as before. + 4. Extracts the text content for metadata processing. + 5. Triggers downstream metadata extraction by calling extract_metadata_with_gpt. """ try: - logger.info(f"Starting Textract job for {s3_filename}") - response = textract_client.start_document_text_detection( - DocumentLocation={"S3Object": {"Bucket": BUCKET_NAME, "Name": s3_filename}} - ) - job_id = response["JobId"] - logger.info(f"Textract job started, JobId: {job_id}") - - # Process Textract Blocks into pages. - extracted_pages = [] - current_page_lines = [] - while True: - result = textract_client.get_document_text_detection(JobId=job_id) - status = result["JobStatus"] - if status == "SUCCEEDED": - logger.info("Textract job succeeded.") - for block in result["Blocks"]: - if block["BlockType"] == "PAGE": - if current_page_lines: - extracted_pages.append(current_page_lines) - current_page_lines = [] - elif block["BlockType"] == "LINE": - bbox = block["Geometry"]["BoundingBox"] - current_page_lines.append((block["Text"], bbox)) - if current_page_lines: - extracted_pages.append(current_page_lines) - break - elif status in ["FAILED", "PARTIAL_SUCCESS"]: - logger.error("Textract job failed.") - raise Exception("Textract job failed") - time.sleep(3) - - # Use the local tmp file located under the workdir configuration. 
tmp_file_path = os.path.join(settings.workdir, "tmp", s3_filename) if not os.path.exists(tmp_file_path): - raise Exception(f"Local file not found: {tmp_file_path}") - logger.info(f"Processing local file {tmp_file_path} with OCR overlay.") + raise FileNotFoundError(f"Local file not found: {tmp_file_path}") - # Overwrite the tmp file with the added OCR overlay. - create_searchable_pdf(tmp_file_path, extracted_pages) + logger.info(f"Processing {s3_filename} with Azure Document Intelligence OCR.") + + # Open and send the document for processing + with open(tmp_file_path, "rb") as f: + poller = document_intelligence_client.begin_analyze_document( + "prebuilt-read", body=f, output=[AnalyzeOutputOption.PDF] + ) + result: AnalyzeResult = poller.result() + operation_id = poller.details["operation_id"] + + # Retrieve the processed searchable PDF + response = document_intelligence_client.get_analyze_result_pdf( + model_id=result.model_id, result_id=operation_id + ) + searchable_pdf_path = tmp_file_path # Overwrite the original PDF location + with open(searchable_pdf_path, "wb") as writer: + writer.writelines(response) + logger.info(f"Searchable PDF saved at: {searchable_pdf_path}") - # Delete the S3 object. - logger.info(f"Deleting {s3_filename} from S3") - s3_client.delete_object(Bucket=BUCKET_NAME, Key=s3_filename) + # Extract raw text content from the result + extracted_text = result.content if result.content else "" + logger.info(f"Extracted text for {s3_filename}: {len(extracted_text)} characters") - # Concatenate extracted text. - cleaned_text = " ".join([line for page in extracted_pages for line, _ in page]) - # Trigger downstream metadata extraction. 
- extract_metadata_with_gpt.delay(s3_filename, cleaned_text) + # Trigger downstream metadata extraction + extract_metadata_with_gpt.delay(s3_filename, extracted_text) - return {"s3_file": s3_filename, "searchable_pdf": tmp_file_path, "cleaned_text": cleaned_text} + return {"s3_file": s3_filename, "searchable_pdf": searchable_pdf_path, "cleaned_text": extracted_text} except Exception as e: - logger.error(f"Error processing {s3_filename}: {e}") + logger.error(f"Error processing {s3_filename} with Azure Document Intelligence: {e}") raise diff --git a/requirements.txt b/requirements.txt index db69b66..79d0a3b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,5 @@ openai pymupdf requests dropbox +azure-core-credentials +azure-ai-documentintelligence \ No newline at end of file From 9ebac992e6ad7817b41a539d4cf81470330b38c9 Mon Sep 17 00:00:00 2001 From: Christian Krakau-Louis Date: Fri, 14 Feb 2025 22:53:03 +0100 Subject: [PATCH 2/2] changed requirements.txt --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 79d0a3b..ae4fac1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,4 @@ openai pymupdf requests dropbox -azure-core-credentials azure-ai-documentintelligence \ No newline at end of file