From 95912f3d04f3eb02bfda089006dea0a9557acfd9 Mon Sep 17 00:00:00 2001
From: Stan Girard
Date: Sat, 27 Jan 2024 01:45:09 -0800
Subject: [PATCH] =?UTF-8?q?feat:=20=F0=9F=8E=B8=20files?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

link
---
 backend/llm/knowledge_brain_qa.py             | 11 ++++++++---
 backend/modules/chat/dto/chats.py             |  1 +
 backend/packages/files/parsers/audio.py       |  5 +----
 backend/packages/files/parsers/code_python.py |  3 ++-
 backend/packages/files/parsers/common.py      |  9 ++-------
 backend/packages/files/parsers/csv.py         |  6 ++----
 backend/packages/files/parsers/docx.py        |  3 ++-
 backend/packages/files/parsers/epub.py        |  3 ++-
 backend/packages/files/parsers/github.py      |  1 +
 backend/packages/files/parsers/html.py        |  3 ++-
 backend/packages/files/parsers/markdown.py    |  3 ++-
 backend/packages/files/parsers/notebook.py    |  3 ++-
 backend/packages/files/parsers/odt.py         |  3 ++-
 backend/packages/files/parsers/pdf.py         |  3 ++-
 backend/packages/files/parsers/powerpoint.py  |  3 ++-
 backend/packages/files/parsers/telegram.py    |  6 ++----
 backend/packages/files/parsers/txt.py         |  2 ++
 backend/packages/files/parsers/xlsx.py        |  6 ++----
 backend/packages/files/processors.py          |  1 +
 19 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/backend/llm/knowledge_brain_qa.py b/backend/llm/knowledge_brain_qa.py
index 4b8f0289a3e8..5e4b06a99387 100644
--- a/backend/llm/knowledge_brain_qa.py
+++ b/backend/llm/knowledge_brain_qa.py
@@ -11,6 +11,7 @@
 from llm.utils.format_chat_history import format_chat_history
 from llm.utils.get_prompt_to_use import get_prompt_to_use
 from llm.utils.get_prompt_to_use_id import get_prompt_to_use_id
+from repository.files.generate_file_signed_url import generate_file_signed_url
 from logger import get_logger
 from models import BrainSettings
 from modules.brain.service.brain_service import BrainService
@@ -81,7 +82,6 @@ def __init__(
         streaming: bool = False,
         prompt_id: Optional[UUID] = None,
         metadata: Optional[dict] = None,
-        **kwargs,
     ):
         super().__init__(
@@ -313,9 +313,14 @@ async def wrap_done(fn: Awaitable, event: asyncio.Event):
                             if "url" in doc.metadata
                             else doc.metadata["file_name"],
                             "type": "url" if "url" in doc.metadata else "file",
-                            "source_url": doc.metadata["url"]
-                            if "url" in doc.metadata
+                            "source_url": generate_file_signed_url(
+                                f"{brain.brain_id}/{doc.metadata['file_name']}"
+                            ).get("signedURL", "")
+                            if "url" not in doc.metadata
                             else "",
+                            "original_file_name": doc.metadata[
+                                "original_file_name"
+                            ],
                         }
                     )
                 )
diff --git a/backend/modules/chat/dto/chats.py b/backend/modules/chat/dto/chats.py
index 5a76de77f36f..6a46abff17c6 100644
--- a/backend/modules/chat/dto/chats.py
+++ b/backend/modules/chat/dto/chats.py
@@ -32,6 +32,7 @@ class Sources(BaseModel):
     name: str
     source_url: str
     type: str
+    original_file_name: str
 
     class Config:
         json_encoders = {
diff --git a/backend/packages/files/parsers/audio.py b/backend/packages/files/parsers/audio.py
index 6f210f383f52..fd6d8d578879 100644
--- a/backend/packages/files/parsers/audio.py
+++ b/backend/packages/files/parsers/audio.py
@@ -9,10 +9,7 @@
 from packages.files.file import compute_sha1_from_content
 
 
-async def process_audio(
-    file: File,
-    user,
-):
+async def process_audio(file: File, user, original_file_name):
     temp_filename = None
     file_sha = ""
     dateshort = time.strftime("%Y%m%d-%H%M%S")
diff --git a/backend/packages/files/parsers/code_python.py b/backend/packages/files/parsers/code_python.py
index 4806424f3da3..95dfcbd87444 100644
--- a/backend/packages/files/parsers/code_python.py
+++ b/backend/packages/files/parsers/code_python.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-async def process_python(file: File, brain_id):
+async def process_python(file: File, brain_id, original_file_name):
     return await process_file(
         file=file,
         loader_class=PythonLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/common.py b/backend/packages/files/parsers/common.py
index 681ec7d7d530..759189088cfc 100644
--- a/backend/packages/files/parsers/common.py
+++ b/backend/packages/files/parsers/common.py
@@ -2,7 +2,6 @@
 
 from logger import get_logger
 from models import File
-from models.settings import get_supabase_db
 from modules.brain.service.brain_vector_service import BrainVectorService
 from packages.embeddings.vectors import Neurons
 from repository.files.upload_file import DocumentSerializable
@@ -10,12 +9,7 @@
 logger = get_logger(__name__)
 
 
-async def process_file(
-    file: File,
-    loader_class,
-    brain_id,
-):
-    database = get_supabase_db()
+async def process_file(file: File, loader_class, brain_id, original_file_name):
     dateshort = time.strftime("%Y%m%d")
 
     neurons = Neurons()
@@ -28,6 +22,7 @@
         "chunk_size": file.chunk_size,
         "chunk_overlap": file.chunk_overlap,
         "date": dateshort,
+        "original_file_name": original_file_name or file.file_name,
     }
 
     docs = []
diff --git a/backend/packages/files/parsers/csv.py b/backend/packages/files/parsers/csv.py
index 470515b994e8..7e539fff11f2 100644
--- a/backend/packages/files/parsers/csv.py
+++ b/backend/packages/files/parsers/csv.py
@@ -4,12 +4,10 @@
 from .common import process_file
 
 
-def process_csv(
-    file: File,
-    brain_id,
-):
+def process_csv(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=CSVLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/docx.py b/backend/packages/files/parsers/docx.py
index 3b26c1a8fee1..2a98b0c50de9 100644
--- a/backend/packages/files/parsers/docx.py
+++ b/backend/packages/files/parsers/docx.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-def process_docx(file: File, brain_id):
+def process_docx(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=Docx2txtLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/epub.py b/backend/packages/files/parsers/epub.py
index 695212aae083..49b2e4e6b560 100644
--- a/backend/packages/files/parsers/epub.py
+++ b/backend/packages/files/parsers/epub.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-def process_epub(file: File, brain_id):
+def process_epub(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=UnstructuredEPubLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/github.py b/backend/packages/files/parsers/github.py
index 1c2071983a49..44d4ef577e29 100644
--- a/backend/packages/files/parsers/github.py
+++ b/backend/packages/files/parsers/github.py
@@ -52,6 +52,7 @@ async def process_github(
             "chunk_size": chunk_size,
             "chunk_overlap": chunk_overlap,
             "date": dateshort,
+            "original_file_name": doc.metadata["original_file_name"],
         }
 
         doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata)
diff --git a/backend/packages/files/parsers/html.py b/backend/packages/files/parsers/html.py
index 3e247cc9c5f8..620419f26a46 100644
--- a/backend/packages/files/parsers/html.py
+++ b/backend/packages/files/parsers/html.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-def process_html(file: File, brain_id):
+def process_html(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=UnstructuredHTMLLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/markdown.py b/backend/packages/files/parsers/markdown.py
index a10f5edbca2f..600da1e36c95 100644
--- a/backend/packages/files/parsers/markdown.py
+++ b/backend/packages/files/parsers/markdown.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-def process_markdown(file: File, brain_id):
+def process_markdown(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=UnstructuredMarkdownLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/notebook.py b/backend/packages/files/parsers/notebook.py
index a610b8f44554..7cbb1db58ed4 100644
--- a/backend/packages/files/parsers/notebook.py
+++ b/backend/packages/files/parsers/notebook.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-def process_ipnyb(file: File, brain_id):
+def process_ipnyb(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=NotebookLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/odt.py b/backend/packages/files/parsers/odt.py
index 5c57de39eb65..a13fdf254407 100644
--- a/backend/packages/files/parsers/odt.py
+++ b/backend/packages/files/parsers/odt.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-def process_odt(file: File, brain_id):
+def process_odt(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=UnstructuredPDFLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/pdf.py b/backend/packages/files/parsers/pdf.py
index fbc9e771f550..0b138a21484f 100644
--- a/backend/packages/files/parsers/pdf.py
+++ b/backend/packages/files/parsers/pdf.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-def process_pdf(file: File, brain_id):
+def process_pdf(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=UnstructuredPDFLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/powerpoint.py b/backend/packages/files/parsers/powerpoint.py
index 2f6c49a26fa4..3c02d1456ff3 100644
--- a/backend/packages/files/parsers/powerpoint.py
+++ b/backend/packages/files/parsers/powerpoint.py
@@ -4,9 +4,10 @@
 from .common import process_file
 
 
-def process_powerpoint(file: File, brain_id):
+def process_powerpoint(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=UnstructuredPowerPointLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/telegram.py b/backend/packages/files/parsers/telegram.py
index 071cc4c794f9..416a7b53d33c 100644
--- a/backend/packages/files/parsers/telegram.py
+++ b/backend/packages/files/parsers/telegram.py
@@ -4,12 +4,10 @@
 from .common import process_file
 
 
-def process_telegram(
-    file: File,
-    brain_id,
-):
+def process_telegram(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=TelegramChatFileLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/txt.py b/backend/packages/files/parsers/txt.py
index b38b9318c61e..3e31cd3be04f 100644
--- a/backend/packages/files/parsers/txt.py
+++ b/backend/packages/files/parsers/txt.py
@@ -7,9 +7,11 @@
 async def process_txt(
     file: File,
     brain_id,
+    original_file_name,
 ):
     return await process_file(
         file=file,
         loader_class=TextLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/parsers/xlsx.py b/backend/packages/files/parsers/xlsx.py
index c3c5d8f64952..e349bbd70d67 100644
--- a/backend/packages/files/parsers/xlsx.py
+++ b/backend/packages/files/parsers/xlsx.py
@@ -4,12 +4,10 @@
 from .common import process_file
 
 
-def process_xlsx(
-    file: File,
-    brain_id,
-):
+def process_xlsx(file: File, brain_id, original_file_name):
     return process_file(
         file=file,
         loader_class=UnstructuredExcelLoader,
         brain_id=brain_id,
+        original_file_name=original_file_name,
     )
diff --git a/backend/packages/files/processors.py b/backend/packages/files/processors.py
index 5a6a279f5aae..d7f903dd0901 100644
--- a/backend/packages/files/processors.py
+++ b/backend/packages/files/processors.py
@@ -86,6 +86,7 @@ async def filter_file(
         result = await file_processors[file.file_extension](
             file=file,
             brain_id=brain_id,
+            original_file_name=original_file_name,
         )
         if result is None or result == 0:
             return create_response(
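
Note (illustration, not part of the patch): the hunks above thread a new original_file_name argument from filter_file through every parser into process_file, which records it in each chunk's metadata (falling back to file.file_name), and knowledge_brain_qa.py then exposes it on the Sources payload, keeping the document URL for web sources and attaching a signed storage URL for uploaded files. Below is a minimal Python sketch of that flow under those assumptions; build_metadata and build_source are hypothetical stand-ins for the patched code paths, and the sample file names and URL are invented.

# Illustrative sketch only; simplified stand-ins for the patched functions.

def build_metadata(file_name: str, original_file_name: str = "") -> dict:
    # Mirrors the metadata block added in common.py: fall back to the
    # storage file name when no original name is supplied.
    return {
        "file_name": file_name,
        "original_file_name": original_file_name or file_name,
    }


def build_source(doc_metadata: dict, signed_url: str) -> dict:
    # Mirrors the Sources entry built in knowledge_brain_qa.py: web pages
    # keep their URL as the name, uploaded files get a signed storage URL.
    is_url = "url" in doc_metadata
    return {
        "name": doc_metadata["url"] if is_url else doc_metadata["file_name"],
        "type": "url" if is_url else "file",
        "source_url": "" if is_url else signed_url,
        "original_file_name": doc_metadata["original_file_name"],
    }


# Example: an uploaded PDF whose display name differs from its storage name.
meta = build_metadata("3f2a1c.pdf", "Quarterly report.pdf")
print(build_source(meta, signed_url="https://example.invalid/signed/3f2a1c.pdf"))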