Shorter pull period for mails, added a bit of different handling for paperless
christianlouis committed Feb 18, 2025
1 parent c0c5ca3 commit de7050a
Showing 3 changed files with 44 additions and 276 deletions.
43 changes: 27 additions & 16 deletions app/tasks/extract_metadata_with_gpt.py
@@ -31,21 +31,33 @@ def extract_json_from_text(text):

@celery.task(base=BaseTaskWithRetry)
def extract_metadata_with_gpt(s3_filename: str, cleaned_text: str):
"""Uses OpenAI GPT-4o to classify document metadata."""
"""Uses OpenAI GPT-4o-mini to classify document metadata."""

prompt = f"""
- You are an intelligent document classifier.
- Given the following extracted text from a document, analyze it and return a JSON object with the following fields:
- 1. "filename": A machine-readable filename in the format YYYY-MM-DD_DescriptiveTitle (use only letters, numbers, periods, and underscores).
- 2. "empfaenger": The recipient, or "Unknown" if not found.
- 3. "absender": The sender, or "Unknown" if not found.
- 4. "correspondent": A correspondent extracted from the document, or "Unknown".
- 5. "kommunikationsart": One of [Behoerdlicher_Brief, Rechnung, Kontoauszug, Vertrag, Quittung, Privater_Brief, Einladung, Gewerbliche_Korrespondenz, Newsletter, Werbung, Sonstiges].
- 6. "kommunikationskategorie": One of [Amtliche_Postbehoerdliche_Dokumente, Finanz_und_Vertragsdokumente, Geschaeftliche_Kommunikation, Private_Korrespondenz, Sonstige_Informationen].
- 7. "document_type": The document type, or "Unknown".
- 8. "tags": A list of additional keywords extracted from the document.
- 9. "language": The detected language code (e.g., "DE").
- 10. "title": A human-friendly title for the document.
+ You are a specialized document analyzer trained to extract structured metadata from documents.
+ Your task is to analyze the given text and return a well-structured JSON object.
+ Extract and return the following fields:
+ 1. **filename**: Machine-readable filename (YYYY-MM-DD_DescriptiveTitle, use only letters, numbers, periods, and underscores).
+ 2. **empfaenger**: The recipient, or "Unknown" if not found.
+ 3. **absender**: The sender, or "Unknown" if not found.
+ 4. **correspondent**: The entity or company that issued the document (shortest possible name, e.g., "Amazon" instead of "Amazon EU SARL, German branch").
+ 5. **kommunikationsart**: One of [Behoerdlicher_Brief, Rechnung, Kontoauszug, Vertrag, Quittung, Privater_Brief, Einladung, Gewerbliche_Korrespondenz, Newsletter, Werbung, Sonstiges].
+ 6. **kommunikationskategorie**: One of [Amtliche_Postbehoerdliche_Dokumente, Finanz_und_Vertragsdokumente, Geschaeftliche_Kommunikation, Private_Korrespondenz, Sonstige_Informationen].
+ 7. **document_type**: Precise classification (e.g., Invoice, Contract, Information, Unknown).
+ 8. **tags**: A list of up to 4 relevant thematic keywords.
+ 9. **language**: Detected document language (ISO 639-1 code, e.g., "de" or "en").
+ 10. **title**: A human-readable title summarizing the document content.
+ 11. **confidence_score**: A numeric value (0-100) indicating the confidence level of the extracted metadata.
+ 12. **reference_number**: Extracted invoice/order/reference number if available.
+ 13. **monetary_amounts**: A list of key monetary values detected in the document.
+ ### Important Rules:
+ - **OCR Correction**: Assume the text has been corrected for OCR errors.
+ - **Tagging**: Max 4 tags, avoiding generic or overly specific terms.
+ - **Title**: Concise, no addresses, and contains key identifying features.
+ - **Date Selection**: Use the most relevant date if multiple are found.
+ - **Output Language**: Maintain the document's original language.
Extracted text:
{cleaned_text}
@@ -56,7 +68,7 @@ def extract_metadata_with_gpt(s3_filename: str, cleaned_text: str):
try:
print(f"[DEBUG] Sending classification request for {s3_filename}...")
completion = client.chat.completions.create(
model="gpt-4o",
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are an intelligent document classifier."},
{"role": "user", "content": prompt}
@@ -83,4 +95,3 @@ def extract_metadata_with_gpt(s3_filename: str, cleaned_text: str):
except Exception as e:
print(f"[ERROR] OpenAI classification failed for {s3_filename}: {e}")
return {}
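
For orientation, here is a minimal sketch of the JSON object the revised prompt asks gpt-4o-mini to produce. The field names and allowed values are taken from the prompt above; the concrete values (names, numbers, amounts) are invented placeholders, not output from the real pipeline.

```python
# Hypothetical example of the metadata dict the updated prompt requests.
# Keys and allowed values mirror the prompt; the values below are made up.
example_metadata = {
    "filename": "2025-02-18_Amazon_Rechnung",
    "empfaenger": "Max Mustermann",
    "absender": "Amazon EU SARL",
    "correspondent": "Amazon",             # shortest possible issuer name
    "kommunikationsart": "Rechnung",
    "kommunikationskategorie": "Finanz_und_Vertragsdokumente",
    "document_type": "Invoice",
    "tags": ["rechnung", "bestellung", "elektronik"],  # at most 4 tags
    "language": "de",                      # ISO 639-1 code
    "title": "Amazon Rechnung vom 18.02.2025",
    "confidence_score": 92,                # 0-100
    "reference_number": "123-4567890-1234567",
    "monetary_amounts": [49.99],
}
```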

8 changes: 4 additions & 4 deletions app/tasks/imap_tasks.py
@@ -114,7 +114,7 @@ def pull_inbox(
delete_after_process: bool
):
"""
- Connects to the IMAP inbox, fetches new emails (last 7 days),
+ Connects to the IMAP inbox, fetches new emails (last 3 days),
and processes attachments while preserving the original unread status.
"""
logger.info(f"Connecting to {mailbox_key} at {host}:{port} (SSL={use_ssl})")
@@ -129,8 +129,8 @@ def pull_inbox(
mail.login(username, password)
mail.select("INBOX")

- # Fetch emails from the last 7 days
- since_date = (datetime.utcnow() - timedelta(days=7)).strftime("%d-%b-%Y")
+ # Fetch emails from the last 3 days
+ since_date = (datetime.utcnow() - timedelta(days=3)).strftime("%d-%b-%Y")
status, search_data = mail.search(None, f'(SINCE {since_date})')
if status != "OK":
logger.warning(f"Search failed on mailbox {mailbox_key}. Status={status}")
@@ -139,7 +139,7 @@ def pull_inbox(
return

msg_numbers = search_data[0].split()
logger.info(f"Found {len(msg_numbers)} emails from the last 7 days in {mailbox_key}.")
logger.info(f"Found {len(msg_numbers)} emails from the last 3 days in {mailbox_key}.")

for num in msg_numbers:
# Check if email is unread
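
For context, a minimal standalone sketch of the narrowed pull window, assuming a plain imaplib connection. The function name and parameters are placeholders; the real pull_inbox task additionally preserves unread flags, extracts attachments, and optionally deletes processed mails.

```python
import imaplib
from datetime import datetime, timedelta


def fetch_recent_message_ids(host: str, username: str, password: str, days: int = 3):
    """Return message sequence numbers for mails received in the last `days` days."""
    mail = imaplib.IMAP4_SSL(host)
    mail.login(username, password)
    mail.select("INBOX")
    # IMAP SINCE compares dates only (e.g. 15-Feb-2025) and ignores the time of day.
    # Note: %b is locale-dependent; IMAP expects English month abbreviations.
    since_date = (datetime.utcnow() - timedelta(days=days)).strftime("%d-%b-%Y")
    status, search_data = mail.search(None, f"(SINCE {since_date})")
    if status != "OK":
        return []
    return search_data[0].split()
```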
