Skip to content

Improve DocSum file handling #1562

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 39 additions & 30 deletions DocSum/docsum.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import asyncio
import base64
import os
import subprocess
Expand Down Expand Up @@ -55,7 +54,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
return inputs


def read_pdf(file):
def read_pdf(file: str):
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(file)
Expand Down Expand Up @@ -101,29 +100,50 @@ def video2audio(
return audio_base64


def read_text_from_file(file, save_file_name):
async def read_text_from_file(file: UploadFile):
ctype = file.headers["content-type"]
valid = (
"text/plain",
"application/pdf",
"application/octet-stream",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)

file_content = None
if ctype not in valid:
return file_content

import aiofiles
import docx2txt
from langchain.text_splitter import CharacterTextSplitter

# read text file
if file.headers["content-type"] == "text/plain":
if ctype == "text/plain":
file.file.seek(0)
content = file.file.read().decode("utf-8")
# Split text
# Split text into multiple documents
text_splitter = CharacterTextSplitter()
texts = text_splitter.split_text(content)
# Create multiple documents
file_content = texts
# read pdf file
elif file.headers["content-type"] == "application/pdf":
documents = read_pdf(save_file_name)
file_content = [doc.page_content for doc in documents]
# read docx file
elif (
file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
or file.headers["content-type"] == "application/octet-stream"
):
file_content = docx2txt.process(save_file_name)
return text_splitter.split_text(content)

# the remaining formats (pdf, docx) are read from a path, so write the upload to a temp file first
async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
await tmp.write(await file.read())
await tmp.flush()

# read pdf file
if ctype == "application/pdf":
documents = read_pdf(tmp.name)
file_content = [doc.page_content for doc in documents]

# read docx file
if ctype in (
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/octet-stream",
):
file_content = docx2txt.process(tmp.name)

# remove temp file
await tmp.close()

return file_content

Expand Down Expand Up @@ -188,25 +208,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
file_summaries = []
if files:
for file in files:
# Fix concurrency issue with the same file name
# https://github.com/opea-project/GenAIExamples/issues/1279
uid = str(uuid.uuid4())
file_path = f"/tmp/{uid}"

if data_type is not None and data_type in ["audio", "video"]:
raise ValueError(
"Audio and Video file uploads are not supported in docsum with curl request, \
please use the UI or pass base64 string of the content directly."
)

else:
import aiofiles

async with aiofiles.open(file_path, "wb") as f:
await f.write(await file.read())

docs = read_text_from_file(file, file_path)
os.remove(file_path)
docs = await read_text_from_file(file)

if isinstance(docs, list):
file_summaries.extend(docs)
Expand Down