diff --git a/backend/crawl/crawler.py b/backend/crawl/crawler.py
index d187aa6b5f94..1c31f60d5b12 100644
--- a/backend/crawl/crawler.py
+++ b/backend/crawl/crawler.py
@@ -1,10 +1,11 @@
-import requests
-from pydantic import BaseModel
-import requests
+import os
 import re
-import unicodedata
 import tempfile
-import os
+import unicodedata
+
+import requests
+from langchain.document_loaders import GitLoader
+from pydantic import BaseModel
 
 
 class CrawlWebsite(BaseModel):
@@ -23,6 +24,7 @@ def _crawl(self, url):
 
     def process(self):
         content = self._crawl(self.url)
+        ## Create a file
 
         file_name = slugify(self.url) + ".html"
         temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
@@ -34,6 +36,12 @@ def process(self):
             return temp_file_path, file_name
         else:
             return None
+    def checkGithub(self):
+        if "github.com" in self.url:
+            return True
+        else:
+            return False
+
 
 
 def slugify(text):
diff --git a/backend/main.py b/backend/main.py
index 3869a1fa1edb..1a516dc273e9 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -13,6 +13,7 @@
 from middlewares.cors import add_cors_middleware
 from models.chats import ChatMessage
 from models.users import User
+from parsers.github import process_github
 from utils.file import convert_bytes, get_file_size
 from utils.processors import filter_file
 from utils.vectors import (CommonsDep, create_user, similarity_search,
@@ -114,18 +115,42 @@ async def chat_endpoint(commons: CommonsDep, chat_message: ChatMessage, credenti
 
 @app.post("/crawl/", dependencies=[Depends(JWTBearer())])
 async def crawl_endpoint(commons: CommonsDep, crawl_website: CrawlWebsite, enable_summarization: bool = False, credentials: dict = Depends(JWTBearer())):
+    max_brain_size = os.getenv("MAX_BRAIN_SIZE")
+
     user = User(email=credentials.get('email', 'none'))
-    file_path, file_name = crawl_website.process()
+    user_vectors_response = commons['supabase'].table("vectors").select(
+        "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
+        .filter("user_id", "eq", user.email)\
+        .execute()
+    documents = user_vectors_response.data  # Access the data from the response
+    # Convert each dictionary to a tuple of items, then to a set to remove duplicates, and then back to a dictionary
+    user_unique_vectors = [dict(t) for t in set(tuple(d.items()) for d in documents)]
 
-    # Create a SpooledTemporaryFile from the file_path
-    spooled_file = SpooledTemporaryFile()
-    with open(file_path, 'rb') as f:
-        shutil.copyfileobj(f, spooled_file)
+    current_brain_size = sum(float(doc['size']) for doc in user_unique_vectors)
 
-    # Pass the SpooledTemporaryFile to UploadFile
-    file = UploadFile(file=spooled_file, filename=file_name)
-    message = await filter_file(file, enable_summarization, commons['supabase'], user=user)
-    return message
+    file_size = 1000000
+
+    remaining_free_space = float(max_brain_size) - (current_brain_size)
+
+    if remaining_free_space - file_size < 0:
+        message = {"message": f"❌ User's brain will exceed maximum capacity with this upload. Maximum file allowed is : {convert_bytes(remaining_free_space)}", "type": "error"}
+    else:
+        user = User(email=credentials.get('email', 'none'))
+        if not crawl_website.checkGithub():
+
+            file_path, file_name = crawl_website.process()
+
+            # Create a SpooledTemporaryFile from the file_path
+            spooled_file = SpooledTemporaryFile()
+            with open(file_path, 'rb') as f:
+                shutil.copyfileobj(f, spooled_file)
+
+            # Pass the SpooledTemporaryFile to UploadFile
+            file = UploadFile(file=spooled_file, filename=file_name)
+            message = await filter_file(file, enable_summarization, commons['supabase'], user=user)
+            return message
+        else:
+            return await process_github(crawl_website.url, "false", user=user, supabase=commons['supabase'])
 
 
 @app.get("/explore", dependencies=[Depends(JWTBearer())])
diff --git a/backend/parsers/common.py b/backend/parsers/common.py
index 860a92bca37c..637f5850c8b2 100644
--- a/backend/parsers/common.py
+++ b/backend/parsers/common.py
@@ -65,3 +65,9 @@ async def file_already_exists(supabase, file, user):
     response = supabase.table("vectors").select("id").filter("metadata->>file_sha1", "eq", file_sha1) \
         .filter("user_id", "eq", user.email).execute()
     return len(response.data) > 0
+
+async def file_already_exists_from_content(supabase, file_content, user):
+    file_sha1 = compute_sha1_from_content(file_content)
+    response = supabase.table("vectors").select("id").filter("metadata->>file_sha1", "eq", file_sha1) \
+        .filter("user_id", "eq", user.email).execute()
+    return len(response.data) > 0
\ No newline at end of file
diff --git a/backend/parsers/github.py b/backend/parsers/github.py
new file mode 100644
index 000000000000..e41881fcebf8
--- /dev/null
+++ b/backend/parsers/github.py
@@ -0,0 +1,53 @@
+import os
+import time
+
+from fastapi import UploadFile
+from langchain.document_loaders import GitLoader
+from langchain.schema import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from parsers.common import file_already_exists_from_content
+from utils.file import compute_sha1_from_content, compute_sha1_from_file
+from utils.vectors import create_summary, create_vector, documents_vector_store
+
+from .common import process_file
+
+
+async def process_github(repo, enable_summarization, user, supabase):
+    random_dir_name = os.urandom(16).hex()
+    dateshort = time.strftime("%Y%m%d")
+    loader = GitLoader(
+        clone_url=repo,
+        repo_path="/tmp/" + random_dir_name,
+    )
+    documents = loader.load()
+    os.system("rm -rf /tmp/" + random_dir_name)
+
+    chunk_size = 500
+    chunk_overlap = 0
+    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+        chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+
+    documents = text_splitter.split_documents(documents)
+    print(documents[:1])
+
+    for doc in documents:
+        if doc.metadata["file_type"] in [".pyc", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git"]:
+            continue
+        metadata = {
+            "file_sha1": compute_sha1_from_content(doc.page_content.encode("utf-8")),
+            "file_size": len(doc.page_content)*8,
+            "file_name": doc.metadata["file_name"],
+            "chunk_size": chunk_size,
+            "chunk_overlap": chunk_overlap,
+            "date": dateshort,
+            "summarization": "true" if enable_summarization else "false"
+        }
+        doc_with_metadata = Document(
+            page_content=doc.page_content, metadata=metadata)
+        exist = await file_already_exists_from_content(supabase, doc.page_content.encode("utf-8"), user)
+        if not exist:
+            create_vector(user.email, doc_with_metadata)
+            print("Created vector for ", doc.metadata["file_name"])
+
+
+    return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}