|
1 | 1 | # Copyright (C) 2024 Intel Corporation
|
2 | 2 | # SPDX-License-Identifier: Apache-2.0
|
3 | 3 |
|
4 |
| -import asyncio |
5 | 4 | import base64
|
6 | 5 | import os
|
7 | 6 | import subprocess
|
@@ -55,7 +54,7 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
|
55 | 54 | return inputs
|
56 | 55 |
|
57 | 56 |
|
58 |
| -def read_pdf(file): |
| 57 | +def read_pdf(file: str): |
59 | 58 | from langchain.document_loaders import PyPDFLoader
|
60 | 59 |
|
61 | 60 | loader = PyPDFLoader(file)
|
@@ -101,29 +100,50 @@ def video2audio(
|
101 | 100 | return audio_base64
|
102 | 101 |
|
103 | 102 |
|
async def read_text_from_file(file: UploadFile):
    """Extract text from an uploaded plain-text, PDF, or DOCX file.

    The parser is selected from the upload's ``content-type`` header. Any
    header parameters (such as ``; charset=utf-8``) and letter case are
    ignored when matching the media type.

    Args:
        file: The uploaded file to read.

    Returns:
        ``None`` when the content type is missing or unsupported.
        For ``text/plain``, the list of chunks produced by
        ``CharacterTextSplitter.split_text``.
        For ``application/pdf``, a list holding the ``page_content`` of each
        document returned by ``read_pdf``.
        For DOCX / ``application/octet-stream``, the value returned by
        ``docx2txt.process`` (not a list; callers check with ``isinstance``).

    Raises:
        UnicodeDecodeError: If a ``text/plain`` upload is not valid UTF-8.
    """
    docx_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/octet-stream",
    )
    valid = ("text/plain", "application/pdf", *docx_types)

    # A missing header must not raise; treat it like an unsupported type.
    raw_ctype = file.headers.get("content-type") or ""
    # Keep only the bare media type so "text/plain; charset=utf-8" matches.
    ctype = raw_ctype.split(";", 1)[0].strip().lower()
    if ctype not in valid:
        return None

    # Imported only once a supported type is confirmed, so rejected uploads
    # never pay for (or depend on) the parsing libraries.
    import aiofiles
    import docx2txt
    from langchain.text_splitter import CharacterTextSplitter

    # Plain text can be split straight from the in-memory upload.
    if ctype == "text/plain":
        file.file.seek(0)
        content = file.file.read().decode("utf-8")
        # Split text to multiple documents
        return CharacterTextSplitter().split_text(content)

    # The PDF and DOCX parsers take a filesystem path, so spool the upload
    # into a temporary file. Rewind first so an already-consumed stream is
    # still copied in full (the text branch above does the same).
    file.file.seek(0)
    async with aiofiles.tempfile.NamedTemporaryFile() as tmp:
        await tmp.write(await file.read())
        await tmp.flush()

        if ctype == "application/pdf":
            documents = read_pdf(tmp.name)
            return [doc.page_content for doc in documents]

        # Only the DOCX / octet-stream types remain at this point.
        # Leaving the ``async with`` block closes and removes the temp file,
        # so no explicit close is needed.
        return docx2txt.process(tmp.name)
|
129 | 149 |
|
@@ -188,25 +208,14 @@ async def handle_request(self, request: Request, files: List[UploadFile] = File(
|
188 | 208 | file_summaries = []
|
189 | 209 | if files:
|
190 | 210 | for file in files:
|
191 |
| - # Fix concurrency issue with the same file name |
192 |
| - # https://github.com/opea-project/GenAIExamples/issues/1279 |
193 |
| - uid = str(uuid.uuid4()) |
194 |
| - file_path = f"/tmp/{uid}" |
195 |
| - |
196 | 211 | if data_type is not None and data_type in ["audio", "video"]:
|
197 | 212 | raise ValueError(
|
198 | 213 | "Audio and Video file uploads are not supported in docsum with curl request, \
|
199 | 214 | please use the UI or pass base64 string of the content directly."
|
200 | 215 | )
|
201 | 216 |
|
202 | 217 | else:
|
203 |
| - import aiofiles |
204 |
| - |
205 |
| - async with aiofiles.open(file_path, "wb") as f: |
206 |
| - await f.write(await file.read()) |
207 |
| - |
208 |
| - docs = read_text_from_file(file, file_path) |
209 |
| - os.remove(file_path) |
| 218 | + docs = await read_text_from_file(file) |
210 | 219 |
|
211 | 220 | if isinstance(docs, list):
|
212 | 221 | file_summaries.extend(docs)
|
|
0 commit comments