Skip to content

Commit

Permalink
feat(github): now github loader (#264)
Browse files Browse the repository at this point in the history
  • Loading branch information
StanGirard authored Jun 5, 2023
1 parent 000933f commit 963fb05
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 14 deletions.
18 changes: 13 additions & 5 deletions backend/crawl/crawler.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import requests
from pydantic import BaseModel
import requests
import os
import re
import unicodedata
import tempfile
import os
import unicodedata

import requests
from langchain.document_loaders import GitLoader
from pydantic import BaseModel


class CrawlWebsite(BaseModel):
Expand All @@ -23,6 +24,7 @@ def _crawl(self, url):

def process(self):
content = self._crawl(self.url)

## Create a file
file_name = slugify(self.url) + ".html"
temp_file_path = os.path.join(tempfile.gettempdir(), file_name)
Expand All @@ -34,6 +36,12 @@ def process(self):
return temp_file_path, file_name
else:
return None
def checkGithub(self):
    """Return True when ``self.url`` points at github.com or one of its subdomains.

    The decision is made on the URL's host rather than on a substring match,
    so a URL that merely mentions ``github.com`` in its path or query string
    (or a look-alike host such as ``notgithub.com``) is not treated as a
    GitHub repository.

    Returns:
        bool: True for a github.com host, False otherwise (including for
        empty or unparsable URLs).
    """
    from urllib.parse import urlparse

    candidate = (self.url or "").strip()
    # urlparse only populates the host when an authority marker ("//") is
    # present, so scheme-less inputs like "github.com/user/repo" need one.
    if "://" not in candidate:
        candidate = "//" + candidate
    try:
        host = urlparse(candidate).hostname or ""
    except ValueError:
        # Malformed authority section (e.g. an unbalanced IPv6 bracket).
        return False
    host = host.lower()
    return host == "github.com" or host.endswith(".github.com")



def slugify(text):
Expand Down
43 changes: 34 additions & 9 deletions backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from middlewares.cors import add_cors_middleware
from models.chats import ChatMessage
from models.users import User
from parsers.github import process_github
from utils.file import convert_bytes, get_file_size
from utils.processors import filter_file
from utils.vectors import (CommonsDep, create_user, similarity_search,
Expand Down Expand Up @@ -114,18 +115,42 @@ async def chat_endpoint(commons: CommonsDep, chat_message: ChatMessage, credenti

@app.post("/crawl/", dependencies=[Depends(JWTBearer())])
async def crawl_endpoint(commons: CommonsDep, crawl_website: CrawlWebsite, enable_summarization: bool = False, credentials: dict = Depends(JWTBearer())):
    """Crawl a URL (or ingest a GitHub repository) into the caller's brain.

    The user's current usage is computed from the ``vectors`` table and
    compared with the ``MAX_BRAIN_SIZE`` environment variable before any
    crawling happens. GitHub URLs are handed to ``process_github``; any other
    URL is crawled to a temporary file and pushed through ``filter_file``.

    Args:
        commons: Shared dependencies; ``commons['supabase']`` is the client
            used for the usage query and for storing results.
        crawl_website: The crawl request carrying the target URL.
        enable_summarization: Forwarded to ``filter_file`` for crawled pages.
        credentials: Decoded JWT payload; its ``email`` identifies the user.

    Returns:
        dict: A ``{"message": ..., "type": ...}`` payload on every path.
    """
    max_brain_size = os.getenv("MAX_BRAIN_SIZE")

    user = User(email=credentials.get('email', 'none'))
    user_vectors_response = commons['supabase'].table("vectors").select(
        "name:metadata->>file_name, size:metadata->>file_size", count="exact") \
        .filter("user_id", "eq", user.email)\
        .execute()
    documents = user_vectors_response.data  # Access the data from the response
    # Convert each dictionary to a tuple of items, then to a set to remove
    # duplicates, and then back to a dictionary.
    user_unique_vectors = [dict(t) for t in {tuple(d.items()) for d in documents}]

    current_brain_size = sum(float(doc['size']) for doc in user_unique_vectors)

    # The crawled size is not known up front, so a fixed estimate is used.
    file_size = 1000000

    remaining_free_space = float(max_brain_size) - current_brain_size

    if remaining_free_space - file_size < 0:
        return {"message": f"❌ User's brain will exceed maximum capacity with this upload. Maximum file allowed is : {convert_bytes(remaining_free_space)}", "type": "error"}

    if crawl_website.checkGithub():
        # Pass a real boolean: process_github tests the flag for truthiness,
        # and the string "false" would be truthy.
        return await process_github(crawl_website.url, False, user=user, supabase=commons['supabase'])

    crawl_result = crawl_website.process()
    if crawl_result is None:
        # process() returns None when the crawl produced no content.
        return {"message": "❌ Unable to crawl the given URL.", "type": "error"}
    file_path, file_name = crawl_result

    # Create a SpooledTemporaryFile from the file_path
    spooled_file = SpooledTemporaryFile()
    with open(file_path, 'rb') as f:
        shutil.copyfileobj(f, spooled_file)
    # NOTE(review): the stream position is left at the end of the copied
    # data — confirm that filter_file rewinds before reading.

    # Pass the SpooledTemporaryFile to UploadFile
    file = UploadFile(file=spooled_file, filename=file_name)
    return await filter_file(file, enable_summarization, commons['supabase'], user=user)


@app.get("/explore", dependencies=[Depends(JWTBearer())])
Expand Down
6 changes: 6 additions & 0 deletions backend/parsers/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,9 @@ async def file_already_exists(supabase, file, user):
response = supabase.table("vectors").select("id").filter("metadata->>file_sha1", "eq", file_sha1) \
.filter("user_id", "eq", user.email).execute()
return len(response.data) > 0

async def file_already_exists_from_content(supabase, file_content, user):
    """Tell whether ``user`` already owns a vector with this content's SHA-1.

    The digest of ``file_content`` is compared against the ``file_sha1``
    metadata field of rows in the ``vectors`` table that belong to
    ``user.email``.

    Returns:
        bool: True when at least one matching row exists.
    """
    content_digest = compute_sha1_from_content(file_content)

    # Build the lookup step by step: same table, same two filters.
    lookup = supabase.table("vectors").select("id")
    lookup = lookup.filter("metadata->>file_sha1", "eq", content_digest)
    lookup = lookup.filter("user_id", "eq", user.email)

    matching_rows = lookup.execute().data
    return len(matching_rows) > 0
53 changes: 53 additions & 0 deletions backend/parsers/github.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import time

from fastapi import UploadFile
from langchain.document_loaders import GitLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from parsers.common import file_already_exists_from_content
from utils.file import compute_sha1_from_content, compute_sha1_from_file
from utils.vectors import create_summary, create_vector, documents_vector_store

from .common import process_file


async def process_github(repo, enable_summarization, user, supabase):
    """Load a Git repository, split it into chunks and store new chunks as vectors.

    Args:
        repo: Clone URL handed to ``GitLoader``.
        enable_summarization: Flag recorded in each chunk's metadata. Booleans
            are used as-is; strings are interpreted by value, so ``"false"``
            is recorded as ``"false"`` rather than being treated as truthy.
        user: Owner of the vectors; ``user.email`` is used as the key.
        supabase: Client used to detect chunks that were already stored.

    Returns:
        dict: A success payload whose count is the number of chunks produced
        by the splitter (skipped and already-stored chunks included).
    """
    import shutil
    import tempfile

    if isinstance(enable_summarization, str):
        summarize = enable_summarization.strip().lower() in ("true", "1", "yes")
    else:
        summarize = bool(enable_summarization)
    summarization_flag = "true" if summarize else "false"

    random_dir_name = os.urandom(16).hex()
    dateshort = time.strftime("%Y%m%d")
    clone_dir = os.path.join(tempfile.gettempdir(), random_dir_name)
    loader = GitLoader(
        clone_url=repo,
        repo_path=clone_dir,
    )
    try:
        documents = loader.load()
    finally:
        # Remove the scratch directory even when loading fails, and do it
        # without spawning a shell.
        shutil.rmtree(clone_dir, ignore_errors=True)

    chunk_size = 500
    chunk_overlap = 0
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap)

    documents = text_splitter.split_documents(documents)

    # File types that are never stored as vectors.
    skipped_file_types = frozenset(
        (".pyc", ".env", ".lock", ".gitignore", ".gitmodules", ".gitattributes", ".gitkeep", ".git"))

    for doc in documents:
        if doc.metadata["file_type"] in skipped_file_types:
            continue
        encoded_content = doc.page_content.encode("utf-8")
        metadata = {
            "file_sha1": compute_sha1_from_content(encoded_content),
            # NOTE(review): character count times 8 is kept from the original;
            # confirm whether the byte length was intended here.
            "file_size": len(doc.page_content)*8,
            "file_name": doc.metadata["file_name"],
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "date": dateshort,
            "summarization": summarization_flag
        }
        doc_with_metadata = Document(
            page_content=doc.page_content, metadata=metadata)
        exist = await file_already_exists_from_content(supabase, encoded_content, user)
        if not exist:
            create_vector(user.email, doc_with_metadata)
            print("Created vector for ", doc.metadata["file_name"])

    return {"message": f"✅ Github with {len(documents)} files has been uploaded.", "type": "success"}

1 comment on commit 963fb05

@vercel
Copy link

@vercel vercel bot commented on 963fb05 Jun 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.