Commit c0c5ca3
christianlouis committed Feb 14, 2025
2 parents 17fc8fa + 9156c62
Showing 4 changed files with 47 additions and 103 deletions.
3 changes: 3 additions & 0 deletions .env.demo
@@ -3,6 +3,8 @@ DATABASE_URL=sqlite:///./app/database.db
 REDIS_URL=redis://redis:6379/0
 WORKDIR=/workdir
 AWS_REGION="eu-central-1"
+AZURE_REGION="eastus"
+AZURE_ENDPOINT="https://<yourendpoint>.cognitiveservices.azure.com/"
 S3_BUCKET_NAME=<your_bucket_name>
 NEXTCLOUD_UPLOAD_URL=https://nextcloud.example.com/remote.php/dav/files/<USERNAME>
 NEXTCLOUD_FOLDER="<NEXTCLOUD_FOLDER_PATH>"
@@ -17,6 +19,7 @@ PAPERLESS_NGX_API_TOKEN=<PAPERLESS_API_TOKEN>
 DROPBOX_APP_KEY=<DROPBOX_APP_KEY>
 DROPBOX_APP_SECRET=<DROPBOX_APP_SECRET>
 DROPBOX_REFRESH_TOKEN=<DROPBOX_REFRESH_TOKEN>
+AZURE_AI_KEY=<AZURE_AI_KEY>
 
 # **User Credentials**
 ADMIN_USERNAME=admin
3 changes: 3 additions & 0 deletions app/config.py
@@ -24,6 +24,9 @@ class Settings(BaseSettings):
     nextcloud_folder: str
     paperless_ngx_api_token: str
     paperless_host: str
+    azure_ai_key: str
+    azure_region: str
+    azure_endpoint: str
 
     # IMAP 1
     imap1_host: Optional[str] = None
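Note: the three new fields mirror the AZURE_* entries added to .env.demo above; pydantic's BaseSettings maps environment variables to fields by case-insensitive name. A minimal sketch of that loading behavior — assuming the project's Settings class follows the standard BaseSettings pattern (the pydantic v1 import is shown; on pydantic v2, BaseSettings lives in the pydantic-settings package):

    # Minimal sketch, not the project's full config class.
    # Assumption: pydantic v1; with v2 use `from pydantic_settings import BaseSettings`.
    from pydantic import BaseSettings

    class Settings(BaseSettings):
        # Environment variables map to fields case-insensitively:
        # AZURE_AI_KEY -> azure_ai_key, AZURE_REGION -> azure_region, ...
        azure_ai_key: str
        azure_region: str
        azure_endpoint: str

        class Config:
            env_file = ".env"  # assumption: values may also come from a .env file

    settings = Settings()  # raises a validation error if any variable is missing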
143 changes: 40 additions & 103 deletions app/tasks/process_with_textract.py
@@ -1,8 +1,8 @@
-import time
 import os
-import boto3
-import fitz  # PyMuPDF
 import logging
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.documentintelligence import DocumentIntelligenceClient
+from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult
 
 from app.config import settings
 from app.tasks.retry_config import BaseTaskWithRetry
@@ -11,120 +11,57 @@
 
 logger = logging.getLogger(__name__)
 
-# Initialize AWS clients using settings.
-s3_client = boto3.client(
-    "s3",
-    aws_access_key_id=settings.aws_access_key_id,
-    aws_secret_access_key=settings.aws_secret_access_key,
-    region_name=settings.aws_region,
+# Initialize Azure Document Intelligence client
+document_intelligence_client = DocumentIntelligenceClient(
+    endpoint=settings.azure_endpoint,
+    credential=AzureKeyCredential(settings.azure_ai_key)
 )
-textract_client = boto3.client(
-    "textract",
-    aws_access_key_id=settings.aws_access_key_id,
-    aws_secret_access_key=settings.aws_secret_access_key,
-    region_name=settings.aws_region,
-)
-
-BUCKET_NAME = settings.s3_bucket_name
-
-def create_searchable_pdf(tmp_file_path, extracted_pages):
-    """
-    Opens the PDF at tmp_file_path, overlays invisible OCR text using the
-    Textract bounding box data (extracted_pages), and overwrites the same file.
-    extracted_pages: list of pages; each page is a list of (text, bbox) tuples.
-    """
-    pdf_doc = fitz.open(tmp_file_path)
-    try:
-        for page_num, page in enumerate(pdf_doc):
-            if page_num < len(extracted_pages):
-                for line, bbox in extracted_pages[page_num]:
-                    # Convert relative bbox to absolute coordinates.
-                    rect = fitz.Rect(
-                        bbox['Left'] * page.rect.width,
-                        bbox['Top'] * page.rect.height,
-                        (bbox['Left'] + bbox['Width']) * page.rect.width,
-                        (bbox['Top'] + bbox['Height']) * page.rect.height,
-                    )
-                    page.insert_text(
-                        rect.bl,             # starting at the bottom-left of the bbox
-                        line,
-                        fontsize=12,         # adjust as needed
-                        fontname="helv",     # Helvetica
-                        color=(1, 1, 1, 0),  # transparent
-                        render_mode=3        # invisible but searchable text
-                    )
-        # Overwrite the same file.
-        pdf_doc.save(tmp_file_path, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP)
-        logger.info(f"Overwritten tmp file with OCR overlay: {tmp_file_path}")
-    finally:
-        pdf_doc.close()
 
 @celery.task(base=BaseTaskWithRetry)
 def process_with_textract(s3_filename: str):
     """
-    Processes a PDF document using Textract and overlays invisible OCR text onto
-    the local temporary file (already stored under <workdir>/tmp).
+    Processes a PDF document using Azure Document Intelligence and overlays OCR text onto
+    the local temporary file (stored under <workdir>/tmp).
     Steps:
-      1. Start a Textract text detection job.
-      2. Poll until the job succeeds and organize the Textract Blocks into pages
-         (each page is a list of (text, bounding-box) tuples).
-      3. Use the local tmp file at <workdir>/tmp/<s3_filename> to add the OCR overlay.
-      4. Delete the S3 object.
-      5. Trigger downstream metadata extraction by calling extract_metadata_with_gpt.
+      1. Uploads the document for OCR using Azure Document Intelligence.
+      2. Retrieves the processed PDF with embedded text.
+      3. Saves the OCR-processed PDF locally in the same location as before.
+      4. Extracts the text content for metadata processing.
+      5. Triggers downstream metadata extraction by calling extract_metadata_with_gpt.
    """
    try:
-        logger.info(f"Starting Textract job for {s3_filename}")
-        response = textract_client.start_document_text_detection(
-            DocumentLocation={"S3Object": {"Bucket": BUCKET_NAME, "Name": s3_filename}}
-        )
-        job_id = response["JobId"]
-        logger.info(f"Textract job started, JobId: {job_id}")
-
-        # Process Textract Blocks into pages.
-        extracted_pages = []
-        current_page_lines = []
-        while True:
-            result = textract_client.get_document_text_detection(JobId=job_id)
-            status = result["JobStatus"]
-            if status == "SUCCEEDED":
-                logger.info("Textract job succeeded.")
-                for block in result["Blocks"]:
-                    if block["BlockType"] == "PAGE":
-                        if current_page_lines:
-                            extracted_pages.append(current_page_lines)
-                        current_page_lines = []
-                    elif block["BlockType"] == "LINE":
-                        bbox = block["Geometry"]["BoundingBox"]
-                        current_page_lines.append((block["Text"], bbox))
-                if current_page_lines:
-                    extracted_pages.append(current_page_lines)
-                break
-            elif status in ["FAILED", "PARTIAL_SUCCESS"]:
-                logger.error("Textract job failed.")
-                raise Exception("Textract job failed")
-            time.sleep(3)
-
         # Use the local tmp file located under the workdir configuration.
         tmp_file_path = os.path.join(settings.workdir, "tmp", s3_filename)
         if not os.path.exists(tmp_file_path):
-            raise Exception(f"Local file not found: {tmp_file_path}")
-        logger.info(f"Processing local file {tmp_file_path} with OCR overlay.")
+            raise FileNotFoundError(f"Local file not found: {tmp_file_path}")
 
-        # Overwrite the tmp file with the added OCR overlay.
-        create_searchable_pdf(tmp_file_path, extracted_pages)
+        logger.info(f"Processing {s3_filename} with Azure Document Intelligence OCR.")
 
+        # Open and send the document for processing
+        with open(tmp_file_path, "rb") as f:
+            poller = document_intelligence_client.begin_analyze_document(
+                "prebuilt-read", body=f, output=[AnalyzeOutputOption.PDF]
+            )
+        result: AnalyzeResult = poller.result()
+        operation_id = poller.details["operation_id"]
+
+        # Retrieve the processed searchable PDF
+        response = document_intelligence_client.get_analyze_result_pdf(
+            model_id=result.model_id, result_id=operation_id
+        )
+        searchable_pdf_path = tmp_file_path  # Overwrite the original PDF location
+        with open(searchable_pdf_path, "wb") as writer:
+            writer.writelines(response)
+        logger.info(f"Searchable PDF saved at: {searchable_pdf_path}")
 
-        # Delete the S3 object.
-        logger.info(f"Deleting {s3_filename} from S3")
-        s3_client.delete_object(Bucket=BUCKET_NAME, Key=s3_filename)
+        # Extract raw text content from the result
+        extracted_text = result.content if result.content else ""
+        logger.info(f"Extracted text for {s3_filename}: {len(extracted_text)} characters")
 
-        # Concatenate extracted text.
-        cleaned_text = " ".join([line for page in extracted_pages for line, _ in page])
-        # Trigger downstream metadata extraction.
-        extract_metadata_with_gpt.delay(s3_filename, cleaned_text)
+        # Trigger downstream metadata extraction
+        extract_metadata_with_gpt.delay(s3_filename, extracted_text)
 
-        return {"s3_file": s3_filename, "searchable_pdf": tmp_file_path, "cleaned_text": cleaned_text}
+        return {"s3_file": s3_filename, "searchable_pdf": searchable_pdf_path, "cleaned_text": extracted_text}
     except Exception as e:
-        logger.error(f"Error processing {s3_filename}: {e}")
+        logger.error(f"Error processing {s3_filename} with Azure Document Intelligence: {e}")
         raise
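For context, the replacement follows the azure-ai-documentintelligence flow for the prebuilt-read model with PDF output: a single begin_analyze_document call starts the long-running operation, the poller blocks until it completes, and get_analyze_result_pdf downloads the text-embedded PDF. A standalone sketch of that flow — endpoint, key, and file paths are placeholders, and AnalyzeOutputOption/get_analyze_result_pdf assume a recent azure-ai-documentintelligence release (they are not in the earliest betas):

    # Standalone sketch of the prebuilt-read -> searchable-PDF flow used above.
    # Placeholders: endpoint, key, and file paths.
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult

    client = DocumentIntelligenceClient(
        endpoint="https://<yourendpoint>.cognitiveservices.azure.com/",  # placeholder
        credential=AzureKeyCredential("<AZURE_AI_KEY>"),                 # placeholder
    )

    with open("scan.pdf", "rb") as f:  # placeholder input path
        poller = client.begin_analyze_document(
            "prebuilt-read", body=f, output=[AnalyzeOutputOption.PDF]
        )
    result: AnalyzeResult = poller.result()        # blocks until OCR completes
    operation_id = poller.details["operation_id"]  # identifies the stored result

    # Download the PDF with the OCR text layer embedded by the service.
    pdf_chunks = client.get_analyze_result_pdf(
        model_id=result.model_id, result_id=operation_id
    )
    with open("scan.searchable.pdf", "wb") as out:  # placeholder output path
        out.writelines(pdf_chunks)

    print(f"Extracted {len(result.content or '')} characters of text")

Compared with the Textract path, this removes the manual polling loop and the PyMuPDF overlay entirely: the poller handles waiting, and the service renders the searchable PDF server-side.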
1 change: 1 addition & 0 deletions requirements.txt
@@ -12,3 +12,4 @@ openai
 pymupdf
 requests
 dropbox
+azure-ai-documentintelligence
