Shorter pull period for mails, added a bit of different handling for paperless
christianlouis committed Feb 18, 2025
1 parent c0c5ca3 commit de7050a
Showing 3 changed files with 44 additions and 276 deletions.
43 changes: 27 additions & 16 deletions app/tasks/extract_metadata_with_gpt.py
@@ -31,21 +31,33 @@ def extract_json_from_text(text):

@celery.task(base=BaseTaskWithRetry)
def extract_metadata_with_gpt(s3_filename: str, cleaned_text: str):
"""Uses OpenAI GPT-4o to classify document metadata."""
"""Uses OpenAI GPT-4o-mini to classify document metadata."""

prompt = f"""
- You are an intelligent document classifier.
- Given the following extracted text from a document, analyze it and return a JSON object with the following fields:
- 1. "filename": A machine-readable filename in the format YYYY-MM-DD_DescriptiveTitle (use only letters, numbers, periods, and underscores).
- 2. "empfaenger": The recipient, or "Unknown" if not found.
- 3. "absender": The sender, or "Unknown" if not found.
- 4. "correspondent": A correspondent extracted from the document, or "Unknown".
- 5. "kommunikationsart": One of [Behoerdlicher_Brief, Rechnung, Kontoauszug, Vertrag, Quittung, Privater_Brief, Einladung, Gewerbliche_Korrespondenz, Newsletter, Werbung, Sonstiges].
- 6. "kommunikationskategorie": One of [Amtliche_Postbehoerdliche_Dokumente, Finanz_und_Vertragsdokumente, Geschaeftliche_Kommunikation, Private_Korrespondenz, Sonstige_Informationen].
- 7. "document_type": The document type, or "Unknown".
- 8. "tags": A list of additional keywords extracted from the document.
- 9. "language": The detected language code (e.g., "DE").
- 10. "title": A human-friendly title for the document.
+ You are a specialized document analyzer trained to extract structured metadata from documents.
+ Your task is to analyze the given text and return a well-structured JSON object.
+ Extract and return the following fields:
+ 1. **filename**: Machine-readable filename (YYYY-MM-DD_DescriptiveTitle, use only letters, numbers, periods, and underscores).
+ 2. **empfaenger**: The recipient, or "Unknown" if not found.
+ 3. **absender**: The sender, or "Unknown" if not found.
+ 4. **correspondent**: The entity or company that issued the document (shortest possible name, e.g., "Amazon" instead of "Amazon EU SARL, German branch").
+ 5. **kommunikationsart**: One of [Behoerdlicher_Brief, Rechnung, Kontoauszug, Vertrag, Quittung, Privater_Brief, Einladung, Gewerbliche_Korrespondenz, Newsletter, Werbung, Sonstiges].
+ 6. **kommunikationskategorie**: One of [Amtliche_Postbehoerdliche_Dokumente, Finanz_und_Vertragsdokumente, Geschaeftliche_Kommunikation, Private_Korrespondenz, Sonstige_Informationen].
+ 7. **document_type**: Precise classification (e.g., Invoice, Contract, Information, Unknown).
+ 8. **tags**: A list of up to 4 relevant thematic keywords.
+ 9. **language**: Detected document language (ISO 639-1 code, e.g., "de" or "en").
+ 10. **title**: A human-readable title summarizing the document content.
+ 11. **confidence_score**: A numeric value (0-100) indicating the confidence level of the extracted metadata.
+ 12. **reference_number**: Extracted invoice/order/reference number if available.
+ 13. **monetary_amounts**: A list of key monetary values detected in the document.
+ ### Important Rules:
+ - **OCR Correction**: Assume the text has been corrected for OCR errors.
+ - **Tagging**: Max 4 tags, avoiding generic or overly specific terms.
+ - **Title**: Concise, no addresses, and contains key identifying features.
+ - **Date Selection**: Use the most relevant date if multiple are found.
+ - **Output Language**: Maintain the document's original language.
Extracted text:
{cleaned_text}
@@ -56,7 +68,7 @@ def extract_metadata_with_gpt(s3_filename: str, cleaned_text: str):
try:
print(f"[DEBUG] Sending classification request for {s3_filename}...")
completion = client.chat.completions.create(
model="gpt-4o",
model="gpt-4o-mini",
messages=[
{"role": "system", "content": "You are an intelligent document classifier."},
{"role": "user", "content": prompt}
@@ -83,4 +95,3 @@ def extract_metadata_with_gpt(s3_filename: str, cleaned_text: str):
except Exception as e:
print(f"[ERROR] OpenAI classification failed for {s3_filename}: {e}")
return {}
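
For orientation, here is a minimal sketch of the JSON object the revised prompt asks gpt-4o-mini to produce. The field names and allowed values are taken from the prompt above; the concrete values (names, numbers, amounts) are invented placeholders, not output from the real pipeline.

```python
# Hypothetical example of the metadata dict the updated prompt requests.
# Keys and allowed values mirror the prompt; the values below are made up.
example_metadata = {
    "filename": "2025-02-18_Amazon_Rechnung",
    "empfaenger": "Max Mustermann",
    "absender": "Amazon EU SARL",
    "correspondent": "Amazon",             # shortest possible issuer name
    "kommunikationsart": "Rechnung",
    "kommunikationskategorie": "Finanz_und_Vertragsdokumente",
    "document_type": "Invoice",
    "tags": ["rechnung", "bestellung", "elektronik"],  # at most 4 tags
    "language": "de",                      # ISO 639-1 code
    "title": "Amazon Rechnung vom 18.02.2025",
    "confidence_score": 92,                # 0-100
    "reference_number": "123-4567890-1234567",
    "monetary_amounts": [49.99],
}
```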

8 changes: 4 additions & 4 deletions app/tasks/imap_tasks.py
@@ -114,7 +114,7 @@ def pull_inbox(
delete_after_process: bool
):
"""
- Connects to the IMAP inbox, fetches new emails (last 7 days),
+ Connects to the IMAP inbox, fetches new emails (last 3 days),
and processes attachments while preserving the original unread status.
"""
logger.info(f"Connecting to {mailbox_key} at {host}:{port} (SSL={use_ssl})")
@@ -129,8 +129,8 @@ def pull_inbox(
mail.login(username, password)
mail.select("INBOX")

- # Fetch emails from the last 7 days
- since_date = (datetime.utcnow() - timedelta(days=7)).strftime("%d-%b-%Y")
+ # Fetch emails from the last 3 days
+ since_date = (datetime.utcnow() - timedelta(days=3)).strftime("%d-%b-%Y")
status, search_data = mail.search(None, f'(SINCE {since_date})')
if status != "OK":
logger.warning(f"Search failed on mailbox {mailbox_key}. Status={status}")
@@ -139,7 +139,7 @@ def pull_inbox(
return

msg_numbers = search_data[0].split()
logger.info(f"Found {len(msg_numbers)} emails from the last 7 days in {mailbox_key}.")
logger.info(f"Found {len(msg_numbers)} emails from the last 3 days in {mailbox_key}.")

for num in msg_numbers:
# Check if email is unread
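
For context, a minimal standalone sketch of the narrowed pull window, assuming a plain imaplib connection. The function name and parameters are placeholders; the real pull_inbox task additionally preserves unread flags, extracts attachments, and optionally deletes processed mails.

```python
import imaplib
from datetime import datetime, timedelta


def fetch_recent_message_ids(host: str, username: str, password: str, days: int = 3):
    """Return message sequence numbers for mails received in the last `days` days."""
    mail = imaplib.IMAP4_SSL(host)
    mail.login(username, password)
    mail.select("INBOX")
    # IMAP SINCE compares dates only (e.g. 15-Feb-2025) and ignores the time of day.
    # Note: %b is locale-dependent; IMAP expects English month abbreviations.
    since_date = (datetime.utcnow() - timedelta(days=days)).strftime("%d-%b-%Y")
    status, search_data = mail.search(None, f"(SINCE {since_date})")
    if status != "OK":
        return []
    return search_data[0].split()
```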
