Skip to content

Commit 02628dc

Browse files
committed
Improve DocSum file handling
Use a temporary file only when necessary, and use aiofiles' own temporary-file functionality for it. Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
1 parent d4dcbd1 commit 02628dc

File tree

1 file changed

+39
-30
lines changed

1 file changed

+39
-30
lines changed

DocSum/docsum.py

+39-30
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
# Copyright (C) 2024 Intel Corporation
22
# SPDX-License-Identifier: Apache-2.0
33

4-
import asyncio
54
import base64
65
import os
76
import subprocess
@@ -55,7 +54,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
5554
return inputs
5655

5756

58-
def read_pdf(file):
57+
def read_pdf(file: str):
5958
from langchain.document_loaders import PyPDFLoader
6059

6160
loader = PyPDFLoader(file)
@@ -101,29 +100,50 @@ def video2audio(
101100
return audio_base64
102101

103102

104-
def read_text_from_file(file, save_file_name):
103+
async def read_text_from_file(file: UploadFile):
104+
ctype = file.headers["content-type"]
105+
valid = (
106+
"text/plain",
107+
"application/pdf",
108+
"application/octet-stream",
109+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
110+
)
111+
112+
file_content = None
113+
if ctype not in valid:
114+
return file_content
115+
116+
import aiofiles
105117
import docx2txt
106118
from langchain.text_splitter import CharacterTextSplitter
107119

108120
# read text file
109-
if file.headers["content-type"] == "text/plain":
121+
if ctype == "text/plain":
110122
file.file.seek(0)
111123
content = file.file.read().decode("utf-8")
112-
# Split text
124+
# Split text to multiple documents
113125
text_splitter = CharacterTextSplitter()
114-
texts = text_splitter.split_text(content)
115-
# Create multiple documents
116-
file_content = texts
117-
# read pdf file
118-
elif file.headers["content-type"] == "application/pdf":
119-
documents = read_pdf(save_file_name)
120-
file_content = [doc.page_content for doc in documents]
121-
# read docx file
122-
elif (
123-
file.headers["content-type"] == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
124-
or file.headers["content-type"] == "application/octet-stream"
125-
):
126-
file_content = docx2txt.process(save_file_name)
126+
return text_splitter.split_text(content)
127+
128+
# need a tmp file for rest
129+
async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
130+
await tmp.write(await file.read())
131+
await tmp.flush()
132+
133+
# read pdf file
134+
if ctype == "application/pdf":
135+
documents = read_pdf(tmp.name)
136+
file_content = [doc.page_content for doc in documents]
137+
138+
# read docx file
139+
if ctype in (
140+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
141+
"application/octet-stream",
142+
):
143+
file_content = docx2txt.process(tmp.name)
144+
145+
# remove temp file
146+
await tmp.close()
127147

128148
return file_content
129149

@@ -188,25 +208,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
188208
file_summaries = []
189209
if files:
190210
for file in files:
191-
# Fix concurrency issue with the same file name
192-
# https://github.com/opea-project/GenAIExamples/issues/1279
193-
uid = str(uuid.uuid4())
194-
file_path = f"/tmp/{uid}"
195-
196211
if data_type is not None and data_type in ["audio", "video"]:
197212
raise ValueError(
198213
"Audio and Video file uploads are not supported in docsum with curl request, \
199214
please use the UI or pass base64 string of the content directly."
200215
)
201216

202217
else:
203-
import aiofiles
204-
205-
async with aiofiles.open(file_path, "wb") as f:
206-
await f.write(await file.read())
207-
208-
docs = read_text_from_file(file, file_path)
209-
os.remove(file_path)
218+
docs = await read_text_from_file(file)
210219

211220
if isinstance(docs, list):
212221
file_summaries.extend(docs)

0 commit comments

Comments
 (0)