Skip to content

Commit

Permalink
Create tasks.py
Browse files Browse the repository at this point in the history
  • Loading branch information
christianlouis authored Feb 9, 2025
1 parent 6eac722 commit e16af90
Showing 1 changed file with 27 additions and 0 deletions.
27 changes: 27 additions & 0 deletions app/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/usr/bin/env python3

import boto3
from .celery_worker import celery
from .config import settings

# Module-level Textract client, built once when this module is imported,
# using the credentials and region read from the application settings.
textract_client = boto3.client(
    service_name="textract",
    aws_access_key_id=settings.aws_access_key_id,
    aws_secret_access_key=settings.aws_secret_access_key,
    region_name=settings.aws_region,
)

@celery.task
def process_document(file_path: str):
    """Run AWS Textract on a local document and return the recognised words.

    Args:
        file_path: Path of the document to read and send to Textract.

    Returns:
        A dict with two keys: ``"file"`` holds ``file_path`` as given, and
        ``"text"`` holds the ``Text`` of every ``WORD`` block in the
        Textract response, in response order, joined by single spaces.

    Raises:
        OSError: If the file cannot be opened or read.
        Any error raised by the Textract client call propagates unchanged.
    """
    # Read the payload first so the file handle is closed before the
    # network round-trip to Textract, instead of staying open across it.
    with open(file_path, "rb") as document:
        payload = document.read()

    response = textract_client.analyze_document(
        Document={"Bytes": payload},
        FeatureTypes=["TABLES", "FORMS"],
    )

    # Only WORD blocks contribute to the text; every other block type in
    # the response is skipped.
    words = [
        block["Text"]
        for block in response["Blocks"]
        if block["BlockType"] == "WORD"
    ]
    extracted_text = " ".join(words)

    return {"file": file_path, "text": extracted_text}

0 comments on commit e16af90

Please sign in to comment.