Skip to content

Commit

Permalink
added correct endpoints and file type detection
Browse files Browse the repository at this point in the history
  • Loading branch information
christianlouis committed Feb 19, 2025
1 parent c40e9f0 commit 8848fec
Showing 1 changed file with 40 additions and 7 deletions.
47 changes: 40 additions & 7 deletions app/tasks/convert_to_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import requests
import logging
import mimetypes
from celery import shared_task
from app.config import settings
from app.tasks.upload_to_s3 import upload_to_s3
Expand All @@ -12,25 +13,57 @@
def convert_to_pdf(file_path):
"""
Converts a file to PDF using Gotenberg's API.
Determines the appropriate Gotenberg endpoint based on the file's MIME type.
On success, saves the PDF locally and enqueues it for S3 upload.
"""
# Ensure that settings contain the Gotenberg URL (e.g., "http://gotenberg:3000")
gotenberg_url = getattr(settings, "gotenberg_url", None)
if not gotenberg_url:
logger.error("Gotenberg URL is not configured in settings.")
return

# Guess the MIME type from the file name's extension (mimetypes.guess_type does not inspect file content)
mime_type, encoding = mimetypes.guess_type(file_path)
logger.info(f"Guessed MIME type for '{file_path}' is: {mime_type}")

endpoint = None
form_key = "files" # Default form key for most endpoints

if mime_type:
if mime_type == "text/html":
endpoint = f"{gotenberg_url}/forms/chromium/convert/html"
# The Chromium HTML endpoint expects the HTML file to be provided under the key "index.html"
form_key = "index.html"
elif mime_type.startswith("image/"):
# For images, we use the LibreOffice endpoint (which supports image conversion)
endpoint = f"{gotenberg_url}/forms/libreoffice/convert"
elif mime_type.startswith("text/plain"):
endpoint = f"{gotenberg_url}/forms/libreoffice/convert"
elif mime_type in ["text/markdown", "text/x-markdown"]:
# Optionally, you could use the Chromium markdown endpoint if you have an HTML wrapper.
# For now, we fall back to LibreOffice.
endpoint = f"{gotenberg_url}/forms/libreoffice/convert"
else:
# For all other MIME types (e.g. Office documents), use the LibreOffice endpoint.
endpoint = f"{gotenberg_url}/forms/libreoffice/convert"
else:
# If MIME detection fails, fall back to checking the file extension directly.
ext = os.path.splitext(file_path)[1].lower()
if ext in [".html", ".htm"]:
endpoint = f"{gotenberg_url}/forms/chromium/convert/html"
form_key = "index.html"
else:
endpoint = f"{gotenberg_url}/forms/libreoffice/convert"

try:
with open(file_path, "rb") as f:
files = {"file": f}
# Adjust the endpoint path if needed.
response = requests.post(f"{gotenberg_url}/convert", files=files)
files = {form_key: f}
response = requests.post(endpoint, files=files)

if response.status_code == 200:
converted_file_path = os.path.splitext(file_path)[0] + ".pdf"
with open(converted_file_path, "wb") as f:
f.write(response.content)
with open(converted_file_path, "wb") as out_file:
out_file.write(response.content)
logger.info(f"Converted file saved as PDF: {converted_file_path}")
# Enqueue the upload of the converted PDF.
upload_to_s3.delay(converted_file_path)
return converted_file_path
else:
Expand Down

0 comments on commit 8848fec

Please sign in to comment.