From 5380f2a3b457088817aa76298dfe431013447a9d Mon Sep 17 00:00:00 2001
From: Luis Guillen <81443577+LuisGuillen03@users.noreply.github.com>
Date: Tue, 10 Dec 2024 16:07:23 -0600
Subject: [PATCH] [python] New Hal9 App (#460)

---
 apps/hal9/.vscode/launch.json      |  21 --
 apps/hal9/app.py                   | 104 +++---
 apps/hal9/clients.py               |  15 +
 apps/hal9/data/__init__.py         |  13 +
 apps/hal9/{tools => data}/hal9.txt |   4 +-
 apps/hal9/requirements.txt         |   6 +-
 apps/hal9/tools/analyst.py         | 123 -------
 apps/hal9/tools/calculator.py      |  40 ++-
 apps/hal9/tools/csv.py             |  72 ----
 apps/hal9/tools/csv_agent.py       | 509 +++++++++++++++++++++++++++++
 apps/hal9/tools/document.py        | 226 -------------
 apps/hal9/tools/game.py            |  81 -----
 apps/hal9/tools/generic.py         |  53 +--
 apps/hal9/tools/hal9.py            |  52 +--
 apps/hal9/tools/image.py           |  23 --
 apps/hal9/tools/image_agent.py     | 377 +++++++++++++++++++++
 apps/hal9/tools/image_analyzer.py  |  33 --
 apps/hal9/tools/streamlit.py       | 119 +++++--
 apps/hal9/tools/text_agent.py      | 338 +++++++++++++++++++
 apps/hal9/tools/website.py         |  23 +-
 apps/hal9/utils.py                 | 329 +++++++++++++++++++
 21 files changed, 1830 insertions(+), 731 deletions(-)
 delete mode 100644 apps/hal9/.vscode/launch.json
 create mode 100644 apps/hal9/clients.py
 create mode 100644 apps/hal9/data/__init__.py
 rename apps/hal9/{tools => data}/hal9.txt (93%)
 delete mode 100644 apps/hal9/tools/analyst.py
 delete mode 100644 apps/hal9/tools/csv.py
 create mode 100644 apps/hal9/tools/csv_agent.py
 delete mode 100644 apps/hal9/tools/document.py
 delete mode 100644 apps/hal9/tools/game.py
 delete mode 100644 apps/hal9/tools/image.py
 create mode 100644 apps/hal9/tools/image_agent.py
 delete mode 100644 apps/hal9/tools/image_analyzer.py
 create mode 100644 apps/hal9/tools/text_agent.py
 create mode 100644 apps/hal9/utils.py

diff --git a/apps/hal9/.vscode/launch.json b/apps/hal9/.vscode/launch.json
deleted file mode 100644
index 59ad1159..00000000
--- a/apps/hal9/.vscode/launch.json
+++ /dev/null
@@ -1,21 +0,0 @@
-{
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
- // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Python Debugger: Current File", - "type": "debugpy", - "request": "launch", - "program": "${file}", - "console": "integratedTerminal", - "justMyCode": false, - "env": { - "CONNECTION": "", - "OPENAI_AZURE": "", - "GROQ_API_KEY": "" - } - } - ] -} \ No newline at end of file diff --git a/apps/hal9/app.py b/apps/hal9/app.py index 7c687510..34754d71 100644 --- a/apps/hal9/app.py +++ b/apps/hal9/app.py @@ -1,61 +1,43 @@ -from groq import Groq -import os -import hal9 as h9 -import json -import openai - -from tools.calculator import calculate -from tools.game import build_game -from tools.generic import generic_reply -from tools.hal9 import hal9_reply -from tools.website import build_website -from tools.streamlit import build_streamlit -from tools.image import create_image -from tools.document import document_reply -from tools.csv import csv_reply -from tools.image_analyzer import image_analyzer - -MODEL = "llama3-groq-70b-8192-tool-use-preview" -def run(messages, tools): - return Groq().chat.completions.create( - model = MODEL, - messages = messages, - temperature = 0, - seed = 1, - tools=tools, - tool_choice = "required",) - -prompt = input("") -h9.event('prompt', prompt) - -messages = h9.load("messages", []) -if len(messages) <= 0: - messages.append({"role": "system", "content": "You are Hal9, a helpful and highly capable AI assistant. Your primary responsibility is to analyze user questions and select the most appropriate tool to provide precise, relevant, and actionable responses. Always prioritize using the right tool to ensure efficiency and clarity in your answers."}) -messages.append({"role": "user", "content": prompt}) -h9.save("messages", messages, hidden=True) - -all_tools = [ - calculate, - build_game, - generic_reply, - hal9_reply, - build_website, - build_streamlit, - create_image, - document_reply, - csv_reply, - image_analyzer -] - -tools = h9.describe(all_tools, model = "llama") - -try: - completion = run(messages, tools) - h9.complete(completion, messages = messages, tools = all_tools, show = False, model = "llama") -except Exception as e: - h9.event('error', str(e)) - one_tool = h9.describe([generic_reply], model = "llama") - completion = run(messages, one_tool) - h9.complete(completion, messages = messages, tools = [generic_reply], show = False, model = "llama") - -h9.save("messages", messages, hidden=True) +from utils import generate_response, load_messages, insert_message, execute_function, save_messages, insert_tool_message, is_url, download_file, generate_text_embeddings_parquet +from tools.calculator import solve_math_problem_description, solve_math_problem +from tools.generic import answer_generic_question_description, answer_generic_question +from tools.csv_agent import analyze_csv_description, analyze_csv +from tools.image_agent import images_management_system, images_management_system_description, add_images_descriptions +from tools.hal9 import answer_hal9_questions_description, answer_hal9_questions +from tools.text_agent import analyze_text_file_description, analyze_text_file +from tools.streamlit import streamlit_generator, streamlit_generator_description +from tools.website import website_generator, website_generator_description + +# load messages +messages = load_messages() + +# load tools +tools_descriptions = [solve_math_problem_description, answer_generic_question_description, analyze_csv_description, 
images_management_system_description, answer_hal9_questions_description, analyze_text_file_description, streamlit_generator_description, website_generator_description]
+tools_functions = [solve_math_problem, answer_generic_question, analyze_csv, images_management_system, answer_hal9_questions, analyze_text_file, streamlit_generator, website_generator]
+
+if len(messages) < 1:
+    messages = insert_message(messages, "system", "You are Hal9, a helpful and highly capable AI assistant. Your primary responsibility is to analyze user questions and select the most appropriate tool to provide precise, relevant, and actionable responses. Always prioritize using the right tool to ensure efficiency and clarity in your answers.")
+
+user_input = input()
+if is_url(user_input):
+    filename = user_input.split("/")[-1]
+    file_extension = filename.split(".")[-1] if "." in filename else "No extension"
+    download_file(user_input)
+    messages = insert_message(messages, "system", f"Consider using the file available at path: './.storage/.{filename}' for the following questions.")
+    messages = insert_message(messages, "assistant", f"I'm ready to answer questions about your file: {filename}")
+    if file_extension.lower() == "pdf":
+        generate_text_embeddings_parquet(user_input)
+    if file_extension.lower() in ['jpg', 'jpeg', 'png', 'webp']:
+        add_images_descriptions(f"./.storage/.{filename}")
+    print(f"I'm ready to answer questions about your file: {filename}")
+else:
+    user_input = user_input.replace("\f", "\n")
+    messages = insert_message(messages, "user", user_input)
+
+    response = generate_response("openai", "gpt-4-turbo", messages, tools_descriptions, tool_choice = "required", parallel_tool_calls=False)
+
+    tool_result = execute_function(response, tools_functions)
+
+    insert_tool_message(messages, response, tool_result)
+
+save_messages(messages)
\ No newline at end of file
diff --git a/apps/hal9/clients.py b/apps/hal9/clients.py
new file mode 100644
index 00000000..05848ca8
--- /dev/null
+++ b/apps/hal9/clients.py
@@ -0,0 +1,15 @@
+from openai import AzureOpenAI, OpenAI
+import os
+
+# Azure - OpenAI (gpt-4)
+azure_openai_client = AzureOpenAI(
+    azure_endpoint = 'https://openai-hal9.openai.azure.com/',
+    api_key = os.environ['OPENAI_AZURE'],
+    api_version = '2024-10-01-preview',
+)
+
+# o1 Client
+openai_client = OpenAI(
+    base_url="https://api.hal9.com/proxy/server=https://api.openai.com/v1/",
+    api_key = "hal9"
+)
\ No newline at end of file
diff --git a/apps/hal9/data/__init__.py b/apps/hal9/data/__init__.py
new file mode 100644
index 00000000..2d62c77e
--- /dev/null
+++ b/apps/hal9/data/__init__.py
@@ -0,0 +1,13 @@
+import os
+
+def load_data():
+    data = {}
+    base_path = os.path.dirname(__file__)
+    for file_name in os.listdir(base_path):
+        if file_name.endswith(".txt"):
+            data_name = os.path.splitext(file_name)[0]
+            with open(os.path.join(base_path, file_name), 'r') as file:
+                data[data_name] = file.read()
+    return data
+
+DATA = load_data()
\ No newline at end of file
diff --git a/apps/hal9/tools/hal9.txt b/apps/hal9/data/hal9.txt
similarity index 93%
rename from apps/hal9/tools/hal9.txt
rename to apps/hal9/data/hal9.txt
index bab4f850..bf34b87e 100644
--- a/apps/hal9/tools/hal9.txt
+++ b/apps/hal9/data/hal9.txt
@@ -1,9 +1,7 @@
 You can reply in the following ways:
 - Reply with general knowledge answers
-- If they ask about capabilities of this chat you can mention you support doing math, building games, answering questions about Hal9, building websited.
-- For numerical problems that may involve calculation or files like Excel, indicate that you recomment them to ask instead the CSV builder chatbot, this chatbot can be found scrolling down in the Hal9 website.
-- If the user wants to provide reference documents like PDF files, recomment using the PDF chatbot, this chatbot can be found scrolling down in the Hal9 website.
+- If they ask about capabilities of this chat you can mention you support multiple tools such as solving math problems, CSV analytics, image generation, generic answers, website creation, game generation, and Streamlit dashboards
 - If the user is just curious about what Hal9 is, ask them questions about their work and creatievely explore how automating tasks with AI can help them find value. If they want custom demos or have questions that might require a team of experts to be involved encourage them to chat with our team using https://calendly.com/javierluraschi/meet
 
 Some more information about Hal9:
diff --git a/apps/hal9/requirements.txt b/apps/hal9/requirements.txt
index 8b6939da..2dd3e436 100644
--- a/apps/hal9/requirements.txt
+++ b/apps/hal9/requirements.txt
@@ -1,5 +1,5 @@
-replicate==1.0.3
-groq==0.12.0
 openai==1.55.3
 httpx==0.27.2
-typing-extensions>=4.11.0,<5.0.0
\ No newline at end of file
+typing-extensions>=4.11.0,<5.0.0
+kaleido==0.2.1
+replicate==1.0.3
\ No newline at end of file
diff --git a/apps/hal9/tools/analyst.py b/apps/hal9/tools/analyst.py
deleted file mode 100644
index 4f9c6802..00000000
--- a/apps/hal9/tools/analyst.py
+++ /dev/null
@@ -1,123 +0,0 @@
-import sys
-import json
-import os
-import requests
-import _tools_utils
-import pandas as pd
-import time
-
-data_url = sys.argv[1]
-prompt = sys.argv[2]
-memory_path = os.environ['PWD'] + f"/dist/{sys.argv[8]}.json"
-agentMemory = json.loads(sys.argv[9])
-
-if not prompt:
-    exit()
-
-def download_csv(url, filename):
-    response = requests.get(url)
-    if response.status_code == 200:
-        with open(filename, 'wb') as f:
-            f.write(response.content)
-        return True
-    else:
-        return False
-
-if len(agentMemory) == 0:
-    # Extract the name from the connection
-    csv_name=data_url.rsplit('/', 1)[-1]
-
-    # Temp download the csv file
-    download_csv(data_url,csv_name)
-
-    # Generate Openai FileObject
-    file = _tools_utils.client.files.create(file=open(csv_name, "rb"),purpose='assistants')
-
-    # Extract columns names and Dtypes
-    df = pd.read_csv(csv_name)
-    x = pd.DataFrame(df.dtypes).reset_index().rename({'index': 'column', 0: 'data type'}, axis='columns')
-    x['data type'] = x['data type'].apply(lambda x: str(x))
-    schema = x.to_json(orient='records', lines=True)
-
-    # Remove the file already uploaded to OpenAI and schema extracted
-    os.remove(csv_name)
-
-    # Create the assistant provided with the CSV
-    assistant = _tools_utils.client.beta.assistants.create(
-        name="Data Analyst",
-        description="""You are a experienced Data Analyst, equipped with the skills to process and analyze data from a CSV file using Python.
- You can interpret data, perform statistical analysis, and generate visual representations to provide insights saving plots as .PNG image""", - instructions= f"""Consult the CSV file provided and thorught code make the respective dana analysis, take this advices: - - Provide detailed insights and interpretations based on the analysis results - - Use Python and its libraries (e.g., pandas, matplotlib, seaborn) to execute data manipulation, statistical analysis, and data visualization - - Answer questions regarding the dataset, including summaries, correlations, trends, and anomalies""", - model="gpt-4", - tools=[{"type": "code_interpreter"}], - file_ids=[file.id],) - - # Generate a Thread for the chat - thread = _tools_utils.client.beta.threads.create() - - # Data stored in the memory - agentMemory = { - "assistant_id": assistant.id, - "thread_id": thread.id, - "file_id": file.id,} - - # Save the memory - with open(memory_path, "w") as json_file: - json.dump(agentMemory, json_file) - -else: - # Retrieve the agent based on the ID - assistant_retrieved =_tools_utils.client.beta.assistants.retrieve(agentMemory['assistant_id']) - -# Add the prompt to the thread -message = _tools_utils.client.beta.threads.messages.create( - thread_id= agentMemory['thread_id'], - role="user", - content=prompt,) - -#Get the amount of messages in the thread -messages = _tools_utils.client.beta.threads.messages.list(thread_id=agentMemory['thread_id']) -data = json.loads(messages.model_dump_json(indent=2)) # Load JSON data into a Python object -messages_count=len(data["data"]) - -# Execute the thread with the assistant -run = _tools_utils.client.beta.threads.runs.create( - thread_id=agentMemory['thread_id'], - assistant_id=agentMemory['assistant_id']) - -time.sleep(5) - -while True: - # Retrieve the status of the run - run = _tools_utils.client.beta.threads.runs.retrieve( - thread_id=agentMemory['thread_id'], - run_id=run.id, - ) - - status = run.status - - if status != "in_progress" and status != "queued": - break - - time.sleep(10) - -#Get all the messages -messages = _tools_utils.client.beta.threads.messages.list(thread_id=agentMemory['thread_id']) - -data = json.loads(messages.model_dump_json(indent=2)) -new_message_count=len(data["data"]) - messages_count - -for i in range(new_message_count): - content = data['data'][i].get('content', []) - for message in content: - ### Image Generated - if 'image_file' in message: - file_id=message['image_file']['file_id'] - content = _tools_utils.client.files.content(file_id) - image= content.write_to_file(f"image_{file_id}.png") - ### Text Generated - if 'text' in message: - print(message['text']['value']) \ No newline at end of file diff --git a/apps/hal9/tools/calculator.py b/apps/hal9/tools/calculator.py index 6a4e2ed1..34fd93b3 100644 --- a/apps/hal9/tools/calculator.py +++ b/apps/hal9/tools/calculator.py @@ -1,8 +1,32 @@ -def calculate(expression): - """Use this tool to provide a solution to mathematical problems with a expression using Python code -Parameters: - 'expression' = is the aritmetic operations to evaluate, needs conversion to proper Python syntax. 
- """ - result = eval(expression) - print(result) - return result \ No newline at end of file +from utils import stream_print + +def solve_math_problem(steps_explanation, code_solution): + stream_print("Steps:\n") + stream_print(steps_explanation) + stream_print("\n\nPython Code:\n") + exec(code_solution) + return f"Steps:\n{steps_explanation}\n\n\nPython Code: {code_solution}" + +solve_math_problem_description = { + "type": "function", + "function": { + "name": "solve_math_problem", + "description": "This function provides solutions to mathematical problems by offering both a step-by-step breakdown of the problem-solving process and a Python code implementation. It ensures clarity by explaining relevant concepts, formulas, and logic, while also demonstrating how the solution can be executed programmatically.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "steps_explanation": { + "type": "string", + "description": "A comprehensive, step-by-step description of how to solve the specified mathematical problem, including relevant formulas and concepts.", + }, + "code_solution": { + "type": "string", + "description": "A complete Python script with imports that executes the described solution, clearly demonstrating the implementation of each step and outputting the final answer with a print.", + }, + }, + "required": ["steps_explanation", "code_solution"], + "additionalProperties": False, + }, + } +} diff --git a/apps/hal9/tools/csv.py b/apps/hal9/tools/csv.py deleted file mode 100644 index 4db0a8cd..00000000 --- a/apps/hal9/tools/csv.py +++ /dev/null @@ -1,72 +0,0 @@ -import hal9 as h9 -import openai -import os -import sys -import pandas as pd -import json - -envvars = os.environ -client = openai.AzureOpenAI( - azure_endpoint = 'https://openai-hal9.openai.azure.com/', - api_key = os.environ['OPENAI_AZURE'], - api_version = '2023-05-15', -) - -def get_system_prompt(connection): - df = pd.read_csv(connection) - - x = pd.DataFrame(df.dtypes).reset_index().rename({'index': 'column', 0: 'data type'}, axis='columns') - x['data type'] = x['data type'].apply(lambda x: str(x)) - schema = x.to_json(orient='records', lines=True) - - def truncate_string(s, chars): - return s[:chars] + '...' if len(s) > chars else s - head = df.head().applymap(lambda x: truncate_string(x, 40) if isinstance(x, str) else x) - head = head.to_string() - - return f"""You are an assistant that writes python code for a streamlit app using plotly given a url to a CSV stored in '{connection}'. Your goal is to fullfill user requests. Load the file from the given URL. 
The CSV has columns and dtypes: - - {schema} - - The contents of the csv look like: - - {head} - - """ - -def csv_reply(prompt): - """ - Can understand links to CSVs and use them to reply to questions - 'prompt' one of two options: (1) A link to a new CSV or (2) A question about a previous CSV link - """ - - messages = h9.load("csv-messages", []) - if h9.is_url(prompt): - messages = [{ - "role": "system", - "content": get_system_prompt(prompt) - }, { - "role": "user", - "content": "Acknowledge you understand the CSV that was provided" - }] - else: - messages.append({"role": "user", "content": prompt}) - - completion = client.chat.completions.create( - model = "gpt-4", - messages = messages, - temperature = 0, - frequency_penalty=0.3, - ) - - response = completion.choices[0].message.content - messages.append({"role": "assistant", "content": response}) - h9.save("csv-messages", messages, hidden = True) - - code = h9.extract(markdown=response, language="python") - if code is None: - print(response) - else: - h9.save("app.py", code) - - return "You now have access to a CSV file and can use this tool again for follow up questions" diff --git a/apps/hal9/tools/csv_agent.py b/apps/hal9/tools/csv_agent.py new file mode 100644 index 00000000..289e6bb3 --- /dev/null +++ b/apps/hal9/tools/csv_agent.py @@ -0,0 +1,509 @@ +from utils import stream_print +import pandas as pd +from utils import generate_response, load_messages, insert_message, execute_function, save_messages, insert_tool_message +import traceback +import os +import shutil + +########################### Functions ########################## + +def load_data(file_path): + return pd.read_csv(file_path) + +# Data Overview +def data_overview(csv_path): + df = load_data(csv_path) + table_name = "Dataset Overview" + row_count = df.shape[0] + column_count = df.shape[1] + columns_info = "\n".join([f"- {col}: {df[col].dtype}" for col in df.columns]) + data_preview = df.head().to_string(index=False) + + return f""" + **Data Overview** + - Table Name: {table_name} + - Number of Rows: {row_count} + - Number of Columns: {column_count} + - Column Names and Data Types: + {columns_info} + + **Data Preview:** + {data_preview}""" + +# Numeric Columns Summary +def numeric_columns_summary(csv_path): + df = load_data(csv_path) + numeric_columns = df.select_dtypes(include=['number']).columns + numeric_stats = "" + + for col in numeric_columns: + stats = df[col].describe(percentiles=[.25, .5, .75]) + numeric_stats += f"\n- {col}:\n Mean: {stats['mean']}\n Median: {stats['50%']}\n"\ + f" Std Dev: {stats['std']}\n Min: {stats['min']}\n Max: {stats['max']}\n"\ + f" 25th, 50th, 75th percentiles: {stats['25%']}, {stats['50%']}, {stats['75%']}\n" + + return f""" + **Numeric Columns Summary** + + **Numeric Columns:** + {numeric_stats} + """ + +# Categorical Columns Summary +def categorical_summary(csv_path): + df = load_data(csv_path) + categorical_columns = df.select_dtypes(exclude=['number']).columns + categorical_stats = "" + + for col in categorical_columns: + value_counts = df[col].value_counts() + mode = value_counts.idxmax() + mode_freq = value_counts.max() + categorical_stats += f"\n- {col}:\n Unique values: {df[col].nunique()}\n"\ + f" Mode: {mode} (Frequency: {mode_freq})\n Frequencies:\n{value_counts.to_string()}\n" + + return f""" + **Categorical Columns Summary** + + **Categorical Columns:** + {categorical_stats} + """ + +# Missing Values Analysis +def missing_values_analysis(csv_path): + df = load_data(csv_path) + missing_values_info = df.isnull().sum() + 
missing_percentage = (df.isnull().mean() * 100).round(2)
+
+    missing_summary = "\n".join([f"- {col}: {missing_values_info[col]} ({missing_percentage[col]}%)"
+                                 for col in df.columns if missing_values_info[col] > 0])
+
+    return f"""
+    **Missing Values Analysis**
+
+    **Columns with Missing Values:**
+    {missing_summary}
+    """
+
+# Single Column Analysis
+def column_analysis(csv_path, column_name):
+    df = load_data(csv_path)
+    if column_name not in df.columns:
+        return f"Column '{column_name}' does not exist in the dataset."
+
+    col_data = df[column_name]
+    col_info = f"\n- Column: {column_name}\n  Data Type: {col_data.dtype}\n  Unique Values: {col_data.nunique()}"
+
+    if pd.api.types.is_numeric_dtype(col_data):
+        col_info += f"\n  Mean: {col_data.mean():.2f}\n  Std Dev: {col_data.std():.2f}\n"\
+                    f"  Min: {col_data.min()}\n  Max: {col_data.max()}\n"
+    else:
+        value_counts = col_data.value_counts()
+        mode = value_counts.idxmax()
+        mode_freq = value_counts.max()
+        col_info += f"\n  Mode: {mode} (Frequency: {mode_freq})\n  Frequencies:\n{value_counts.to_string()}\n"
+
+    return f"""
+    **Single Column Analysis: {column_name}**
+
+    {col_info}
+    """
+
+def generate_subdataframe(csv_path, code):
+    context = {}
+    context['csv_path'] = csv_path
+    try:
+        exec(code, context)
+        return context['result']
+    except Exception as e:
+        return f"Error executing the code: {e}"
+
+def generate_plot(csv_path, code):
+    context = {}
+    context['csv_path'] = csv_path
+    try:
+        exec(code, context)
+        return context['result']
+    except Exception as e:
+        return f"Error executing the code: {e}"
+
+def generate_dashboard(csv_path, code):
+    context = {}
+    context['csv_path'] = csv_path
+    directory = "./.storage"  # Target directory for the generated app.py
+    file_name = os.path.basename(csv_path)
+
+    try:
+        # Execute the provided Python code
+        exec(code, context)
+        code = code.replace(csv_path, file_name)
+        # Ensure the target directory exists
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+
+        # Write the code to the app.py file
+        python_file_path = os.path.join(directory, "app.py")
+        with open(python_file_path, 'w') as file:
+            file.write(code)
+
+        # Copy the CSV file to the same directory
+        csv_destination = os.path.join(directory, os.path.basename(csv_path))
+        shutil.copy(csv_path, csv_destination)
+
+        return "The app is running properly"
+    except Exception as e:
+        # Handle exceptions and provide a detailed traceback
+        tb = traceback.format_exc()
+        relevant_error_info = tb.splitlines()
+        last_line = relevant_error_info[-1]
+        return f"Unable to run the dashboard generated, an error has occurred -> {last_line} ... Complete traceback: {tb}"
+
+def generate_print_and_filter(csv_path, code):
+    context = {}
+    context['csv_path'] = csv_path
+    try:
+        exec(code, context)
+        return context['result']
+    except Exception as e:
+        return f"Error executing the code: {e}"
+
+def fix_python_code(csv_path, code):
+    context = {}
+    context['csv_path'] = csv_path
+    directory = "./.storage"  # Target directory for the generated app.py
+    file_name = os.path.basename(csv_path)
+    try:
+        # Execute the provided Python code
+        exec(code, context)
+        code = code.replace(csv_path, file_name)
+        # If the code contains "streamlit", handle folder creation and file operations
+        if "streamlit" in code:
+            # Ensure the target directory exists
+            if not os.path.exists(directory):
+                os.makedirs(directory)
+
+            # Write the code to the app.py file
+            python_file_path = os.path.join(directory, "app.py")
+            with open(python_file_path, 'w') as file:
+                file.write(code)
+
+            # Copy the CSV file to the same directory
+            csv_destination = os.path.join(directory, os.path.basename(csv_path))
+            shutil.copy(csv_path, csv_destination)
+
+            return "The app is running properly"
+
+        return "The code now works perfectly"
+    except Exception as e:
+        # Handle exceptions and provide a detailed traceback
+        tb = traceback.format_exc()
+        relevant_error_info = tb.splitlines()
+        last_line = relevant_error_info[-1]
+        return f"An error has occurred again -> {last_line} ... Complete traceback: {tb}"
+
+def final_response(final_message):
+    stream_print(final_message)
+    return final_message
+
+########################### Descriptions ##########################
+
+analyze_csv_description = {
+    "type": "function",
+    "function": {
+        "name": "analyze_csv",
+        "description": "Performs data analysis on the provided CSV file. It can answer questions related to the data, generate insights, summarize key metrics, and create visualizations such as plots to help the user better understand the content. 
This tool supports both descriptive statistics and data-driven insights.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The path to the CSV file that will be analyzed", + }, + "user_query": { + "type": "string", + "description": "A question or query related to the CSV data.", + }, + }, + "required": ["csv_path", "user_query"], + "additionalProperties": False, + }, + } +} + +data_overview_description = { + "type": "function", + "function": { + "name": "data_overview", + "description": "Provides an overview of the dataset, including the table name, row and column counts, data types of each column, and a preview of the first few rows, your first step to understand the data.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The path to the CSV file to be summarized.", + }, + }, + "required": ["csv_path"], + "additionalProperties": False, + }, + } +} + +numeric_columns_summary_description = { + "type": "function", + "function": { + "name": "numeric_columns_summary", + "description": "Generates a summary of all numeric columns in the provided DataFrame, use just in case that requires deeper information, including key statistics such as mean, median, standard deviation, and percentiles.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The path to the CSV file to be summarized.", + }, + }, + "required": ["csv_path"], + "additionalProperties": False, + }, + } +} + +categorical_summary_description = { + "type": "function", + "function": { + "name": "categorical_summary", + "description": "Summarizes categorical columns in the provided DataFrame, use just in case that requires deeper information, by reporting unique values, mode, frequency, and value counts for each categorical column.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The path to the CSV file to be summarized.", + }, + }, + "required": ["csv_path"], + "additionalProperties": False, + }, + } +} + +missing_values_analysis_description = { + "type": "function", + "function": { + "name": "missing_values_analysis", + "description": "Analyzes the DataFrame for missing values, providing counts and percentages of missing values for each column, use just in case that is been asked for missing values report.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The path to the CSV file to be summarized.", + }, + }, + "required": ["csv_path"], + "additionalProperties": False, + }, + } +} + +column_analysis_description = { + "type": "function", + "function": { + "name": "column_analysis", + "description": "Analyzes a specific column in a CSV dataset, use just in case that is been asked about and specific column, providing details such as data type, unique values, mean, standard deviation, minimum, and maximum values for numeric columns, or mode and frequency counts for categorical columns.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The file path to the CSV file that will be read into a DataFrame." + }, + "column_name": { + "type": "string", + "description": "The name of the column to analyze within the dataset." 
+ }, + }, + "required": ["csv_path", "column_name"], + "additionalProperties": False, + }, + } +} + +generate_subdataframe_description = { + "type": "function", + "function": { + "name": "generate_subdataframe", + "description": "Executes custom Python code to extract specific information from a CSV file. The code provided must load the CSV data and store the subdataframe in path './.storage/'", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The file path to the CSV file that will be loaded into a DataFrame for processing." + }, + "code": { + "type": "string", + "description": "A string containing the Python code to be executed. This code should be a complete script including imports, the dataframe must load the CSV data from the 'csv_path' parameter, perform necessary operations, and store the result in a new CSV into the path './.storage/'" + } + }, + "required": ["csv_path", "code"], + "additionalProperties": False + } + } +} + +generate_plot_description = { + "type": "function", + "function": { + "name": "generate_plot", + "description": "Executes custom Python code to process data from a CSV file and generate a plot from plotly Express and export a final image with an adapted size based on amount of data with Kaleido as JPEG into the path './.storage/, be sure of add numbers into the bars and lines of each plot.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The file path to the CSV file that will be read and used for generating a plot." + }, + "code": { + "type": "string", + "description": "A string containing Python code to execute. This code should be a complete script including imports, the dataframe must load the CSV data from the 'csv_path' parameter, perform any necessary analysis or transformations, and generate a great plot, with labels of the numbers in each line or bar, finally store the plot image with Kaleido and save the filename into a 'result' variable." + } + }, + "required": ["csv_path", "code"], + "additionalProperties": False + } + } +} + +generate_dashboard_description = { + "type": "function", + "function": { + "name": "generate_dashboard", + "description": "Generates a Streamlit app to present a dashboard with filters and plots from Plotly Express. The code must load the CSV data from the 'csv_path' parameter", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The path to the CSV file that will be used to generate the dashboard data." + }, + "code": { + "type": "string", + "description": "A string containing Python code to execute. This code should be a complete script including imports, any necessary calculations, streamlit components and finally a plot or table display" + } + }, + "required": ["csv_path", "code"], + "additionalProperties": False + } + } +} + +generate_print_and_filter_description = { + "type": "function", + "function": { + "name": "generate_print_and_filter", + "description": "Executes custom Python code to process data from a CSV file and generate a print statement that answer and specific request, the string to be printed is stored into a variable 'result'", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "csv_path": { + "type": "string", + "description": "The path to the CSV file that will be used to generate the response." 
+                },
+                "code": {
+                    "type": "string",
+                    "description": "A string containing Python code to execute. This code should be a complete script including imports. The code must load the CSV data from the 'csv_path' parameter, perform any necessary calculations, and finally store the string to be printed in a 'result' variable."
+                }
+            },
+            "required": ["csv_path", "code"],
+            "additionalProperties": False
+        }
+    }
+}
+
+fix_python_code_description = {
+    "type": "function",
+    "function": {
+        "name": "fix_python_code",
+        "description": "Carefully read the last Python code generated and the error returned, then rewrite the code to fix the error. If necessary, take a deeper look at the data that may be failing before using this tool.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "csv_path": {
+                    "type": "string",
+                    "description": "The path to the CSV file that will be used to generate the response."
+                },
+                "code": {
+                    "type": "string",
+                    "description": "A string containing the fixed Python code to execute. Again, this is a complete script including imports that resolves the error."
+                }
+            },
+            "required": ["csv_path", "code"],
+            "additionalProperties": False
+        }
+    }
+}
+
+final_response_description = {
+    "type": "function",
+    "function": {
+        "name": "final_response",
+        "description": "This function is called when all necessary information has been gathered through the tools, and the response is ready to be sent to the user. It finalizes the process and delivers the results in a clear and concise message.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "final_message": {
+                    "type": "string",
+                    "description": "A clear and concise message that, in simple terms, mentions all the tools called to obtain the necessary information. It explains how the information was gathered and what the final insight is.",
+                },
+            },
+            "required": ["final_message"],
+            "additionalProperties": False,
+        },
+    }
+}
+
+
+########################### Main Agent ############################
+
+def analyze_csv(csv_path, user_query):
+    tools_descriptions = [data_overview_description, numeric_columns_summary_description, categorical_summary_description, missing_values_analysis_description, column_analysis_description, generate_dashboard_description, generate_plot_description, generate_subdataframe_description, generate_print_and_filter_description, fix_python_code_description, final_response_description]
+    tools_functions = [data_overview, numeric_columns_summary, categorical_summary, missing_values_analysis, column_analysis, generate_dashboard, generate_plot, generate_subdataframe, generate_print_and_filter, fix_python_code, final_response]
+
+    # load messages
+    messages = load_messages(file_path="./.storage/.csv_messages.json")
+
+    if len(messages) < 1:
+        messages = insert_message(messages, "system", f"""You are a data analysis system specialized in carefully analyzing CSV files using a series of tools.
+        Approach this task with a step-by-step methodology, using the minimum necessary tools and
+        examining the data thoroughly (with the data_overview tool; reserve the numerical and categorical tools for specific questions)
+        before moving on to generate any code (dashboards, prints, plots, subdataframes). Avoid making assumptions about columns or data formats.
+        If a tool returns an error, use your fix_python_code tool.
+ Once you have collected all the required information to solve the user input generate a final response that starts with 'Final Response:'.""") + messages = insert_message(messages, "user", f"Answer this request -> {user_query} , with this CSV_path = '{csv_path}'") + + steps = 0 + max_steps = 10 + while steps < max_steps: + response = generate_response("openai", "gpt-4-turbo", messages, tools_descriptions, tool_choice = "required", parallel_tool_calls=False) + tool_result = execute_function(response, tools_functions) + insert_tool_message(messages, response, tool_result) + save_messages(messages, file_path="./.storage/.csv_messages.json") + response_message = response.choices[0].message + tool_calls = getattr(response_message, 'tool_calls', None) + if tool_calls[0].function.name == "final_response": + return tool_result + + return "I was unable to find a solution in time" \ No newline at end of file diff --git a/apps/hal9/tools/document.py b/apps/hal9/tools/document.py deleted file mode 100644 index bf56746c..00000000 --- a/apps/hal9/tools/document.py +++ /dev/null @@ -1,226 +0,0 @@ -import hal9 as h9 -import openai -import os -import json -import pandas as pd -import requests -import fitz - -import warnings -warnings.simplefilter(action='ignore') - -from langchain_openai import AzureOpenAIEmbeddings -from io import BytesIO -from langchain.text_splitter import RecursiveCharacterTextSplitter -from sklearn.metrics.pairwise import cosine_similarity -from google.cloud import bigquery -from google.oauth2 import service_account - -envvars = os.environ -connection = envvars['CONNECTION'] - -embeddings = AzureOpenAIEmbeddings( - client = None, - azure_deployment = "text-embedding-3-large", - azure_endpoint = 'https://openai-hal9.openai.azure.com/', - api_key = os.environ['OPENAI_AZURE'], - openai_api_type = 'azure', - chunk_size = 3000, -) - -client = openai.AzureOpenAI( - azure_endpoint = 'https://openai-hal9.openai.azure.com/', - api_key = os.environ['OPENAI_AZURE'], - api_version = '2023-05-15', -) - -def similarity(embedding1, embedding2): - return cosine_similarity([embedding1], [embedding2])[0][0] - -def similar_text(prompt, df, k=10): - embedded_prompt = embeddings.embed_query(prompt) - similarities = df['embeddings'].apply(lambda x: similarity(embedded_prompt, x)) - top_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:k] - return df.iloc[top_indices] - -def get_PDFresponse(request, set_verbose): - files_schema_df = h9.load("document-embeddings", []) - h9.save("document-embeddings", files_schema_df, hidden = True) - - relevant_passages_text = [] - files_consulted=[] - files_consulted2=[] - relevant_passages_rows = similar_text(request, files_schema_df, 5) - relevant_passages_text += relevant_passages_rows['chunk_content'].to_list() - files_consulted += relevant_passages_rows["filename"].to_list() - - files_schema_df = files_schema_df.drop(relevant_passages_rows.index.tolist()).reset_index(drop=True) - - for index, row in relevant_passages_rows.iterrows(): - related_passages = similar_text(row['chunk_content'], files_schema_df, 2) - files_schema_df = files_schema_df.drop(related_passages.index.tolist()).reset_index(drop=True) - relevant_passages_text += related_passages['chunk_content'].to_list() - files_consulted2 += related_passages["filename"].to_list() - - text_passages_list = str(relevant_passages_text) - return call_with_context(request, text_passages_list, set_verbose, files_consulted) - -def processPDF(connection): - #print("Processing text... 
please wait") - urls_list = connection.split(',') - text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=500) - - url_list_df = [] - filename_list = [] - chunk_content_list = [] - embedding_list = [] - - for url in urls_list: - filename = url.split("/")[-1] - response = requests.get(url) - pdf_document = fitz.open(stream=BytesIO(response.content)) - all_text = ''.join(page.get_text() for page in pdf_document.pages()) - pdf_document.close() - text_chunks = text_splitter.split_text(all_text) - chunks_strings = [(f"From {filename}: "+ str(chunk)) for chunk in text_chunks] - embedding_results = embeddings.embed_documents(chunks_strings) - - url_list_df.extend([url] * len(text_chunks)) - filename_list.extend([filename] * len(text_chunks)) - chunk_content_list.extend(chunks_strings) - embedding_list = embedding_list + embedding_results - - files_schema_df = pd.DataFrame({ - 'url': url_list_df, - 'filename': filename_list, - 'chunk_content': chunk_content_list, - 'embeddings': embedding_list - }) - h9.save("document-embeddings", files_schema_df, hidden = True) - h9.save("document-summary", get_PDFresponse("Write a very short summary about these documents", False), hidden = True) - - return files_schema_df - -def call_with_context(chat_input, added_context, verbose, files_consulted): - messages2 = h9.load("document-messages", [{ "role": "system", "content": "" }]) - messages2[0]["content"] = "You are operating a chatbot designed to answer questions based solely on specific text passages provided from a PDF file. Each response you generate should rely entirely on these passages, without inferring or assuming additional information. Ensure all answers are directly supported by the text content of the passages." - messages2.append({"role": "user", "content": f"""Request: {chat_input} - Text passages: {added_context} - """}) - completion2 = client.chat.completions.create( - model = "gpt-4", - messages = messages2, - stream = True - ) - - collected_messages2 = [] - - for chunk in completion2: - if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content: - chunk_message = chunk.choices[0].delta.content # extract the message - collected_messages2.append(chunk_message) # save the message - if(chunk_message is not None): - if(verbose): - print(chunk_message, end="") - collected_messages2 = [m for m in collected_messages2 if m is not None] - full_reply_content2 = ''.join(collected_messages2) - - if (verbose): - print("\n\nSources: " + ", ".join(list(set(files_consulted)))) - - return full_reply_content2 - -def conversation_loop(messages, chat_input): - # Append the new user input - messages.append({"role": "user", "content": chat_input}) - - # Create a completion request with streaming enabled - completion = client.chat.completions.create( - model = "gpt-4", - messages = messages, - functions = [ - { - "name": "get_PDFresponse", - "description": f"""Gets a response from the PDF text relying entirely on the text using a LLM+RAG tool given a request that describes what is to be retrieved by the tool. 
Use this tool if the conversation could use answers from these documents.""", - "parameters": { - "type": "object", - "properties": { - "request": {"type": "string"} - }, - "required": ["request"] - } - } - ], - function_call = "auto", - stream = True - ) - - # Iterate over the streamed responses - full_response = "" - function_call_info = None - function_call_data_complete = False - - toolname ="" - - # create variables to collect the stream of chunks - collected_messages = [] - collected_messages_name = [] - - tool = False - - for chunk in completion: - if(chunk.choices[0].delta.function_call == None): - if chunk.choices and chunk.choices[0].delta and chunk.choices[0].delta.content: - chunk_message = chunk.choices[0].delta.content # extract the message - collected_messages.append(chunk_message) # save the message - print(chunk_message, end="") - else: - tool = True - chunk_message = chunk.choices[0].delta.function_call.arguments # extract the message - collected_messages.append(chunk_message) # save the message - if(chunk.choices[0].delta.function_call.name != None): - toolname = chunk.choices[0].delta.function_call.name - - if(tool): - collected_messages = [m for m in collected_messages if m is not None] - full_reply_content = ''.join(collected_messages) - if(toolname == "get_PDFresponse"): - data = json.loads(full_reply_content) - full_reply_content = get_PDFresponse(data.get("request"), True) - else: - full_reply_content = "No response" - else: - collected_messages = [m for m in collected_messages if m is not None] - full_reply_content = ''.join(collected_messages) - - messages.append({"role": "assistant", "content": full_reply_content}) - h9.save("document-messages", messages, hidden = True) - return full_reply_content - -def document_reply(prompt): - """ - Can understand links to PDFs and use them to reply to questions - 'prompt' one of two options: (1) A link to a new PDF or (2) A question about a previous PDF link - """ - - if h9.is_url(prompt): - embeddings = h9.load("document-embeddings", processPDF(prompt)) - prompt = "Acknowledge you understand the document that was provided" - else: - embeddings = h9.load("document-embeddings", False) - - if embeddings is False or embeddings.empty: - print("No PDF file found, please provide at least one") - return "No PDF file found, please provide at least one" - - docs_summary = h9.load("document-summary", False) - if not docs_summary: - print("I was not able to summarize the documents") - return "Could not summarize the documents" - - messages = h9.load("document-messages", [{ - "role": "system", - "content": f"""You are an assistant that helps the user anser questions based on documentation provided by the chatbot creator. You can use a tool to consult the documentation using a RAG system when you need to get data from the files. A summary of the PDFs follows: {docs_summary} . 
Try not to mention the filename of the documentation you have access to.""" - }]) - - return conversation_loop(messages, prompt) diff --git a/apps/hal9/tools/game.py b/apps/hal9/tools/game.py deleted file mode 100644 index 89d04ca6..00000000 --- a/apps/hal9/tools/game.py +++ /dev/null @@ -1,81 +0,0 @@ -import openai -import os -import hal9 as h9 -import json -import time - -client = openai.AzureOpenAI( - azure_endpoint='https://openai-hal9.openai.azure.com/', - api_key=os.environ['OPENAI_AZURE'], - api_version='2023-05-15', -) - -def build_game(user_game_request): - """Use this tool when a user explicitly requests to build a video game or provides a brief description resembling a video game concept. -Parameters: - 'user_game_request' = is the requested user game to build. - """ - number_of_steps = 3 - - print('OK, will get that started. ', end="") - - prompt_text = client.chat.completions.create( - model="gpt-4", - messages=[{"role": "user", "content": f"(Do not mention preloaded assets within the array) Make a string array of only text in JSON format that includes {number_of_steps} text elements where each text element describes an important instruction for generating the following user request as a pure single page HTML game: {user_game_request}, The JSON array must be flat and only contain strings"}], - temperature=0, - ) - - response = prompt_text.choices[0].message.content - prompts = h9.extract(markdown=response, language="json") - prompts = json.loads(prompts) - prompts[0] = prompts[0] + ". The background of the html page must be radial gradient with a color appropriate to the game and a short fun title for the game." - - messages = h9.load("messages-game", [{"role": "system", "content": "Always reply with a single page HTML markdown block (which can use JavaScript, CSS, etc) that fulfills the user request and only use geometric shapes and colors for the single page HTML markdown block"}]) - - print('For each step I complete there will be a generated game to go along with it! So you can see the progress of the game I am creating!\n') - - def improve_code(messages, prompt): - messages.append({"role": "user", "content": prompt}) - - completion = client.chat.completions.create( - model="gpt-4", - messages=messages, - temperature=0, - ) - - response = completion.choices[0].message.content - messages.append({"role": "assistant", "content": response}) - - code = h9.extract(markdown=response, language="html") - return code - - for i, prompt in enumerate(prompts): - - formatted_prompt = prompt.format(user_game_request=user_game_request) - - if (i == 0): - code = improve_code(messages, formatted_prompt + background) - else: - code = improve_code(messages, f"""Fix/improve the following code by following the instruction: - - ```html - {code} - ``` - - Instruction: {formatted_prompt} (Avoid Placeholders: Ensure the code is complete and functional, avoiding the use of placeholders.) - """) - - print(f'Game iteration {i + 1}. {prompt}. 
', end="") - h9.save(f"game-{int(time.time())}.html", code, hidden=False) - - print(f"Completed!") - - h9.save("messages-game", messages, hidden=True) - - explanation = client.chat.completions.create( - model="gpt-4", - messages=[{"role": "system", "content": "Your job is to explain the code in games in simple terms to explain users what to expect."}, {"role": "user", "content": f"{code}\n\nExplain what the game does in simple terms:"}], - temperature=0, - ) - - return explanation.choices[0].message.content diff --git a/apps/hal9/tools/generic.py b/apps/hal9/tools/generic.py index 569c27ba..8eb67df3 100644 --- a/apps/hal9/tools/generic.py +++ b/apps/hal9/tools/generic.py @@ -1,30 +1,33 @@ from groq import Groq -import os -import hal9 as h9 -import json +from utils import stream_print -def generic_reply(prompt): - """Use this tool for general knowledge questions. If unsure which tool to select, default to this one. -Parameters: - 'prompt' = user input - """ - - messages = h9.load("messages", []) - messages = [msg for msg in messages if ("tool_calls" not in msg and "tool_call_id" not in msg)] - - response = Groq().chat.completions.create( +def answer_generic_question(user_input): + response = Groq().chat.completions.create( model = "llama3-70b-8192", - messages = messages, - temperature = 0, - seed = 1) - - stream = Groq().chat.completions.create(model = "llama3-70b-8192", messages = messages, temperature = 0, seed = 1, stream = True) - - response = "" - for chunk in stream: - if len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None: - print(chunk.choices[0].delta.content, end="") - response += chunk.choices[0].delta.content + messages = [{"role": "user", "content": user_input}], + temperature = 0, + seed = 1) - return response + text_response = response.choices[0].message.content + stream_print(text_response) + return text_response +answer_generic_question_description = { + "type": "function", + "function": { + "name": "answer_generic_question", + "description": "Handles general questions or queries provided by the user by taking their input and generating a meaningful response.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "user_input": { + "type": "string", + "description": "Take the user input and pass the same string to the function", + }, + }, + "required": ["user_input"], + "additionalProperties": False, + }, + } +} \ No newline at end of file diff --git a/apps/hal9/tools/hal9.py b/apps/hal9/tools/hal9.py index 2143c25d..24828590 100644 --- a/apps/hal9/tools/hal9.py +++ b/apps/hal9/tools/hal9.py @@ -1,26 +1,34 @@ +from data import DATA from groq import Groq -import os -import hal9 as h9 -import json +from utils import stream_print -def hal9_reply(prompt): - """ - Reply to questions about Hal9. - 'prompt' to respond to. 
- """ +def answer_hal9_questions(user_input): + response = Groq().chat.completions.create( + model = "llama3-70b-8192", + messages = [{"role": "system", "content": DATA["hal9"]},{"role": "user", "content": user_input}], + temperature = 0, + seed = 1) - context = open('tools/hal9.txt', 'r').read() - messages = [ - {"role": "system", "content": context}, - {"role": "user", "content": prompt} - ] + text_response = response.choices[0].message.content + stream_print(text_response) + return text_response - stream = Groq().chat.completions.create(model = "llama3-70b-8192", messages = messages, temperature = 0, seed = 1, stream = True) - - response = "" - for chunk in stream: - if len(chunk.choices) > 0 and chunk.choices[0].delta.content is not None: - print(chunk.choices[0].delta.content, end="") - response += chunk.choices[0].delta.content - - return response +answer_hal9_questions_description = { + "type": "function", + "function": { + "name": "answer_hal9_questions", + "description": "Handles questions related to Hal9 or this chatbot-web capabilities", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "user_input": { + "type": "string", + "description": "Take the user input and pass the same string to the function", + }, + }, + "required": ["user_input"], + "additionalProperties": False, + }, + } +} \ No newline at end of file diff --git a/apps/hal9/tools/image.py b/apps/hal9/tools/image.py deleted file mode 100644 index 8afc1265..00000000 --- a/apps/hal9/tools/image.py +++ /dev/null @@ -1,23 +0,0 @@ -from PIL import Image -from io import BytesIO -import shutil -import replicate - -def create_image(prompt): - """ - Creates an image or photograph for the user - 'prompt' with the description of the image or photograph - """ - filename = "hal9-flux.jpg" - try: - output = replicate.run("black-forest-labs/flux-dev", input={"prompt": prompt}) - encoded_image = output[0].read() - image = Image.open(BytesIO(encoded_image)) - - image.save(filename, format="JPEG") - - shutil.copy(filename, f".storage/{filename}") - - return f"Generated a {filename} that {prompt}" - except Exception as e: - return f"Couldn't generate that image. Please try a different prompt. 
\n\n Error: {e}" \ No newline at end of file diff --git a/apps/hal9/tools/image_agent.py b/apps/hal9/tools/image_agent.py new file mode 100644 index 00000000..1c7eb0cf --- /dev/null +++ b/apps/hal9/tools/image_agent.py @@ -0,0 +1,377 @@ +import shutil +import replicate +from utils import generate_response, load_messages, insert_message, execute_function, save_messages, insert_tool_message, load_json_file +from PIL import Image +from io import BytesIO +from clients import azure_openai_client +import os +import base64 +from mimetypes import guess_type +import json + +########################### Functions ########################## + +def add_images_descriptions(image_path): + description = generate_description(image_path) + + file_name = './.storage/.images_description.json' + + if os.path.exists(file_name): + with open(file_name, 'r') as file: + data = json.load(file) + else: + data = [] + + new_record = { + "image_path": image_path, + "image_description": description + } + + data.append(new_record) + + with open(file_name, 'w') as file: + json.dump(data, file, indent=4) + + return description + +def generate_img_url(image_path): + mime_type, _ = guess_type(image_path) + if mime_type is None: + mime_type = 'application/octet-stream' + + with open(image_path, "rb") as image_file: + base64_encoded_data = base64.b64encode(image_file.read()).decode('utf-8') + + return f"data:{mime_type};base64,{base64_encoded_data}" + +def generate_description(image_path): + try: + file_input = open(image_path, 'rb') + input = { + "image": file_input, + "prompt": """Generate a detailed image prompt that includes all specific visual details in the image. This should include precise descriptions of colors, textures, lighting, positions of all elements, proportions, background details, + foreground details, and any unique stylistic choices. Ensure the description is exhaustive enough to allow an artist or AI to recreate the image accurately without visual reference.""" + } + + description = "" + for event in replicate.stream( + "yorickvp/llava-13b:80537f9eead1a5bfa72d5ac6ea6414379be41d4d4f6679fd776e9535d1eb58bb", + input=input + ): + description+=event.data + file_input.close() + except Exception as e: + return (f"Couldn't describe that image. -> Error: {e}") + + return description + +def image_generator(prompt, filename): + try: + output = replicate.run("black-forest-labs/flux-dev", input={"prompt": prompt}) + except Exception as e: + return (f"Couldn't generate that image. Please try a different prompt. 
-> Error: {e}") + + encoded_image = output[0].read() + image = Image.open(BytesIO(encoded_image)) + + image.save(filename, format="JPEG") + + shutil.copy(filename, f".storage/{filename}") + + generated_description = add_images_descriptions(f".storage/{filename}") + + print(generated_description) + return f"The image generated is: {generated_description} \n\n Saved in path: '.storage/{filename}'" + +def image_analyzer(image_path, prompt): + image_url = generate_img_url(image_path) + response = azure_openai_client.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url" : {"url": image_url,}, + }, + ], + } + ], + ) + print(response.choices[0].message.content) + return response.choices[0].message.content + +def edition_canny_model(image_path, modified_description, filename): + try: + file_input = open(image_path, 'rb') + output = replicate.run( + "black-forest-labs/flux-canny-pro", + input={ + "steps": 50, + "prompt": modified_description, + "guidance": 30, + "control_image": file_input, + "output_format": "jpg", + "safety_tolerance": 2, + "prompt_upsampling": False + } + ) + file_input.close() + except Exception as e: + return (f"Couldn't generate that image. Please try a different prompt. -> Error: {e}") + encoded_image = output.read() + image = Image.open(BytesIO(encoded_image)) + + image.save(filename, format="JPEG") + + shutil.copy(filename, f".storage/{filename}") + + generated_description = add_images_descriptions(f".storage/{filename}") + print(generated_description) + return f"The image generated is: {generated_description} \n\n Saved in path: '.storage/{filename}'" + +def edition_depth_model(image_path, modified_description, filename): + try: + file_input = open(image_path, 'rb') + output = replicate.run("black-forest-labs/flux-depth-pro", + input={ + "steps": 50, + "prompt": modified_description, + "guidance": 7, + "control_image": file_input, + "output_format": "jpg", + "safety_tolerance": 2, + "prompt_upsampling": False + } + ) + file_input.close() + except Exception as e: + return (f"Couldn't generate that image. Please try a different prompt. -> Error: {e}") + encoded_image = output.read() + image = Image.open(BytesIO(encoded_image)) + + image.save(filename, format="JPEG") + + shutil.copy(filename, f".storage/{filename}") + + generated_description = add_images_descriptions(f".storage/{filename}") + print(generated_description) + return f"The image generated is: {generated_description} \n\n Saved in path: '.storage/{filename}'" + +def generate_image_variation(image_path, filename): + try: + file_input = open(image_path, 'rb') + output = replicate.run( + "black-forest-labs/flux-redux-dev", + input={ + "guidance": 3, + "megapixels": "1", + "num_outputs": 4, + "redux_image": file_input, + "aspect_ratio": "21:9", + "output_format": "webp", + "output_quality": 80, + "num_inference_steps": 28 + } + ) + file_input.close() + except Exception as e: + return (f"Couldn't generate that image. Please try a different prompt. 
-> Error: {e}") + encoded_image = output[0].read() + image = Image.open(BytesIO(encoded_image)) + + image.save(filename, format="JPEG") + + shutil.copy(filename, f".storage/{filename}") + + generated_description = add_images_descriptions(f".storage/{filename}") + print(generated_description) + return f"The image generated is: {generated_description} \n\n Saved in path: '.storage/{filename}'" + +########################### Descriptions ########################## + +image_generator_description = { + "type": "function", + "function": { + "name": "image_generator", + "description": "When the user requests an image, logo, or similar visual, this function generates a high-quality JPEG image based on a user-provided prompt. The user prompt can be enhanced to improve the image quality, but the core idea provided by the user will always needs to be preserved.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "prompt": { + "type": "string", + "description": "An enhanced or more detailed version of the user's original prompt. This refined prompt retains the user's core idea but includes the necessary details or styles for the image generator to produce high-quality results.", + }, + "filename": { + "type": "string", + "description": "Choose a name for the generated image file (including file extension).", + }, + }, + "required": ["prompt", "filename"], + "additionalProperties": False, + }, + } +} + +image_analyzer_description = { + "type": "function", + "function": { + "name": "image_analyzer", + "description": "Analyzes the content of an image and provides insights or answers based on a specified prompt.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "image_path": { + "type": "string", + "description": "Path of the image to be analyzed." + }, + "prompt": { + "type": "string", + "description": "A query or instruction about what to analyze in the image. Defaults to 'What's in this image?' if not provided." + } + }, + "required": ["image_path", "prompt"], + "additionalProperties": False + } + } +} + +edition_canny_model_description = { + "type": "function", + "function": { + "name": "edition_canny_model", + "description": "Generates a new version of an input image using Canny edge detection for structural guidance and a user-provided prompt.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "image_path": { + "type": "string", + "description": "The file path of the input image to be processed using Canny edge detection." + }, + "modified_description": { + "type": "string", + "description": "Based on the history description image modify it with the desired transformation or features to be applied to the image." + }, + "filename": { + "type": "string", + "description": "The name (including extension) of the output image file." + } + }, + "required": ["image_path", "modified_description", "filename"], + "additionalProperties": False + } + } +} + +edition_depth_model_description = { + "type": "function", + "function": { + "name": "edition_depth_model", + "description": "Generates a new version of an input image using depth information as structural guidance and a user-provided prompt.", + "strict": True, + "parameters": { + "type": "object", + "properties": { + "image_path": { + "type": "string", + "description": "The file path of the input image to be processed using depth-based structural guidance." 
+
+edition_depth_model_description = {
+    "type": "function",
+    "function": {
+        "name": "edition_depth_model",
+        "description": "Generates a new version of an input image using depth information as structural guidance and a user-provided prompt.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "image_path": {
+                    "type": "string",
+                    "description": "The file path of the input image to be processed using depth-based structural guidance."
+                },
+                "modified_description": {
+                    "type": "string",
+                    "description": "Starting from the stored description of the image, rewrite it to include the desired transformation or features to apply."
+                },
+                "filename": {
+                    "type": "string",
+                    "description": "The name (including extension) of the output image file."
+                }
+            },
+            "required": ["image_path", "modified_description", "filename"],
+            "additionalProperties": False
+        }
+    }
+}
+
+generate_image_variation_description = {
+    "type": "function",
+    "function": {
+        "name": "generate_image_variation",
+        "description": "Creates variations of an input image by mixing its elements with new aspects using user-defined parameters.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "image_path": {
+                    "type": "string",
+                    "description": "The file path of the input image to be used as a base for generating variations."
+                },
+                "filename": {
+                    "type": "string",
+                    "description": "The name (including extension) of the output image file."
+                }
+            },
+            "required": ["image_path", "filename"],
+            "additionalProperties": False
+        }
+    }
+}
+
+images_management_system_description = {
+    "type": "function",
+    "function": {
+        "name": "images_management_system",
+        "description": "Manages and processes image-related queries, enabling tasks such as generating new images, editing existing ones, analyzing image content, and creating variations, all driven by the user's query.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "user_query": {
+                    "type": "string",
+                    "description": "A query or instruction related to generating, editing, analyzing, or managing images."
+                }
+            },
+            "required": ["user_query"],
+            "additionalProperties": False
+        }
+    }
+}
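Note: under schemas like these, the model replies with an OpenAI-style tool call that `execute_function` (in utils.py, later in this patch) dispatches. A hedged sketch of that shape as plain dicts (the SDK actually returns typed objects, and the argument values here are hypothetical):

```python
import json

# Illustrative shape of one entry in response.choices[0].message.tool_calls;
# note that `arguments` is a JSON-encoded string, not a dict.
tool_call = {
    "id": "call_abc123",
    "type": "function",
    "function": {
        "name": "image_generator",
        "arguments": '{"prompt": "A lighthouse at dusk, oil painting style", "filename": "lighthouse.jpg"}',
    },
}

args = json.loads(tool_call["function"]["arguments"])
print(args["filename"])  # lighthouse.jpg
```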
+
+########################### Main Agent ############################
+
+def images_management_system(user_query):
+    tools_descriptions = [image_generator_description, image_analyzer_description, edition_canny_model_description, edition_depth_model_description, generate_image_variation_description]
+    tools_functions = [image_generator, image_analyzer, edition_canny_model, edition_depth_model, generate_image_variation]
+
+    # load messages
+    messages = load_messages(file_path="./.storage/.images_agent_messages.json")
+    image_descriptions = load_json_file("./.storage/.images_description.json")
+    formatted_descriptions = "\n".join(
+        [f"Path: {record['image_path']}, Description: {record['image_description']}" for record in image_descriptions]
+    )
+
+    if len(messages) < 1:
+        messages = insert_message(messages, "system", """You are a specialized image management system designed to process, analyze, and enhance images based on user queries. Your task is to interact with a range of tools, each tailored for a specific image-related task, to generate insights, create visuals, and apply transformations.
+
+Available images are stored in various common formats and accessible via a path. Your actions should be precise and context-driven, ensuring user needs are met effectively.
+
+Effective Tool Usage Recommendations:
+1. Use the **image generator** for creating new visuals based on detailed prompts, ensuring high-quality results.
+2. Leverage the **image analyzer** to interpret the content of images or to answer specific questions about their context.
+3. For image transformations:
+   - Use **edition_canny_model** for structural edits guided by edge detection.
+   - Use **edition_depth_model** for modifications driven by depth-based structural guidance.
+4. Use **generate_image_variation** to create variations of an input image when requested.
+""")
+    messages = insert_message(messages, "user", f"Fulfill this request -> {user_query}. \n\n Currently available images are: {formatted_descriptions}")
+
+    response = generate_response("openai", "gpt-4-turbo", messages, tools_descriptions, tool_choice = "required", parallel_tool_calls=False)
+    tool_result = execute_function(response, tools_functions)
+    insert_tool_message(messages, response, tool_result)
+    save_messages(messages, file_path="./.storage/.images_agent_messages.json")
+
+    return tool_result
\ No newline at end of file
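For reference, `add_images_descriptions` keeps the description store as a flat JSON array with one record per image; a sketch of `./.storage/.images_description.json` with hypothetical values:

```python
# Contents of ./.storage/.images_description.json after two images
# (paths and descriptions are hypothetical):
[
    {
        "image_path": ".storage/lighthouse.jpg",
        "image_description": "A lighthouse at dusk, warm orange backlight, rocky shore..."
    },
    {
        "image_path": ".storage/logo.jpg",
        "image_description": "A minimalist circular logo on a white background..."
    }
]
```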
diff --git a/apps/hal9/tools/image_analyzer.py b/apps/hal9/tools/image_analyzer.py
deleted file mode 100644
index 79d3042e..00000000
--- a/apps/hal9/tools/image_analyzer.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import openai
-import os
-
-def image_analyzer(image_url, prompt):
-    """Use this tool when the user provides a URL with an image extension such as JPG or PNG.
-Parameters:
-    'image_url' = URL containing the image or photograph.
-    'prompt' = description of what the user wants to analyze in the image. If the user does not specify, it should default to "What's in this image?"
-"""
-    client = openai.AzureOpenAI(
-        azure_endpoint = 'https://openai-hal9.openai.azure.com/',
-        api_key = os.environ['OPENAI_AZURE'],
-        api_version = '2024-02-15-preview',
-    )
-
-    response = client.chat.completions.create(
-        model="gpt-4o",
-        messages=[
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": prompt},
-                    {
-                        "type": "image_url",
-                        "image_url" : {"url": image_url,},
-                    },
-                ],
-            }
-        ],
-    )
-
-    print(response.choices[0].message.content)
-    return response.choices[0].message.content
\ No newline at end of file
diff --git a/apps/hal9/tools/streamlit.py b/apps/hal9/tools/streamlit.py
index 4ed2e9bc..10741c5e 100644
--- a/apps/hal9/tools/streamlit.py
+++ b/apps/hal9/tools/streamlit.py
@@ -1,35 +1,98 @@
-import hal9 as h9
-import openai
+from utils import generate_response, load_messages, insert_message, extract_code_block, save_messages
 import os
-import json
-
-def build_streamlit(prompt):
-    """Use this tool when a user requests a Streamlit app or asks to modify a previously generated one.
-Parameters:
-    'prompt' = with user change or requirements
-    """
-    client = openai.AzureOpenAI(
-        azure_endpoint = 'https://openai-hal9.openai.azure.com/',
-        api_key = os.environ['OPENAI_AZURE'],
-        api_version = '2023-05-15',
-    )
+import traceback
+from clients import openai_client
 
-    system = """
-This is a Python streamlit generator system that automates the creation of Streamlit apps
-based on user prompts. It interprets natural language queries, and the response is an interactive
-Streamlit app. Do not add code or instructions to run the app or to install packages. You can use
-Plotly for charts. You don't have access to CSV files unless the user provides an explicit URL.
-"""
-    messages = h9.load("streamlit-messages", [{"role": "system", "content": system}])
-    messages.append({"role": "user", "content": prompt})
+def fix_code(chat_input, error, complete_traceback, python_code):
+    stream = openai_client.chat.completions.create(
+        model = "gpt-4-turbo",
+        messages = [
+            {"role": "user", "content":
+f"""The following Python code needs to be fixed. It should create an interactive Streamlit app that fulfills this user request: '{chat_input}'. Return the fixed code as a fenced code block opened with ```python and closed with ```.
+
+### Error encountered:
+
+{error}
+
+### Code that needs fixing:
+
+{python_code}
+
+### Complete error traceback:
+
+{complete_traceback}
+"""
+            },
+        ]
+    )
+    return extract_code_block(stream.choices[0].message.content, "python")
 
-    completion = client.chat.completions.create(model = "gpt-4", messages = messages, stream = True)
-    response = h9.complete(completion, messages)
+def debug_code(python_code):
+    try:
+        exec(python_code)
+        return "Streamlit app generated and running properly", "", ""
+    except Exception as e:
+        tb = traceback.format_exc()
+        relevant_error_info = tb.splitlines()
+        last_line = relevant_error_info[-1]
+        complete_traceback = "\n".join(relevant_error_info)
+        return "App fails to run", last_line, complete_traceback
+
+def save_python_code(code):
+    directory = "./.storage"
+    # Ensure the target directory exists
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    # Write the code to the app.py file
+    python_file_path = os.path.join(directory, "app.py")
+    with open(python_file_path, 'w') as file:
+        file.write(code)
 
-    code = h9.extract(response, language="py")
+def streamlit_generator(prompt):
+    # load messages
+    messages = load_messages(file_path="./.storage/.streamlit_messages.json")
 
-    h9.save("streamlit-messages", messages, hidden=True)
-    h9.save("app.py", code)
+    if len(messages) < 1:
+        messages = insert_message(messages, "system", """This is a Python Streamlit generator system that automates the creation of Streamlit apps based on user prompts. It interprets natural language queries, and the response is a complete Python script, including imports, for an interactive Streamlit app. Return the code as a fenced code block opened with ```python and closed with ```.""")
+    messages = insert_message(messages, "user", f"Generate an app that fulfills this user request -> {prompt}")
+    model_response = generate_response("openai", "gpt-4-turbo", messages)
+    response_content = model_response.choices[0].message.content
+    streamlit_code = extract_code_block(response_content, "python")
+    # Debug and fix the code if needed
+    max_tries = 3
+    tries = 0
+    while tries < max_tries:
+        result, error, complete_traceback = debug_code(streamlit_code)
+        if result == "Streamlit app generated and running properly":
+            save_python_code(streamlit_code)
+            messages = insert_message(messages, "assistant", streamlit_code)
+            save_messages(messages, file_path="./.storage/.streamlit_messages.json")
+            print(result)
+            return result
+        else:
+            streamlit_code = fix_code(prompt, error, complete_traceback, streamlit_code)
+            tries += 1
 
-    return response
+    print("Unable to generate an app that fulfills your request without errors.")
+    return "Unable to generate an app that fulfills your request without errors."
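Both `streamlit_generator` and `fix_code` rely on `extract_code_block` (defined in utils.py at the end of this patch), whose regex requires the fence to open with the language tag followed by a newline. A minimal sketch of that contract (`extract_block` here mirrors the utils helper; the fence is built programmatically only to keep this example readable):

```python
import re

FENCE = "`" * 3  # the literal ``` fence marker

def extract_block(code: str, language: str) -> str:
    # Mirrors utils.extract_code_block: the block must open with
    # the language tag followed immediately by a newline.
    pattern = rf"{FENCE}{language}\n(.*?){FENCE}"
    match = re.search(pattern, code, re.DOTALL)
    return match.group(1) if match else ""

response = f"Here is the app:\n{FENCE}python\nimport streamlit as st\nst.title('Demo')\n{FENCE}"
print(extract_block(response, "python"))
# prints the inner code: import streamlit as st / st.title('Demo')
```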
+
+streamlit_generator_description = {
+    "type": "function",
+    "function": {
+        "name": "streamlit_generator",
+        "description": "Generates a complete Python Streamlit app based on a user-provided natural language prompt. It automates the creation of interactive applications (this tool does not interact with uploaded files).",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "prompt": {
+                    "type": "string",
+                    "description": "A detailed natural language description of the desired Streamlit app, including specific requirements or features to implement.",
+                }
+            },
+            "required": ["prompt"],
+            "additionalProperties": False,
+        },
+    }
+}
\ No newline at end of file
diff --git a/apps/hal9/tools/text_agent.py b/apps/hal9/tools/text_agent.py
new file mode 100644
index 00000000..44872979
--- /dev/null
+++ b/apps/hal9/tools/text_agent.py
@@ -0,0 +1,338 @@
+from utils import stream_print
+import pandas as pd
+from utils import generate_response, load_messages, insert_message, execute_function, save_messages, insert_tool_message, generate_embeddings
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import os
+
+########################### Functions ##########################
+
+def retrieve_chunks_from_page_number(page_number, file_to_filter=None):
+    # Convert `page_number` from string to integer
+    page_number = int(page_number)
+
+    # Load the Parquet file into a DataFrame
+    df = pd.read_parquet("./.storage/.text_files.parquet")
+
+    # Optionally filter by file
+    if file_to_filter:
+        df = df[df['filename'] == file_to_filter]
+
+    # Filter by page number
+    df = df[df['page'] == page_number]
+
+    # Remove the 'embedding' column
+    df = df.drop(columns=['embedding'], errors='ignore')
+
+    # Return the results as a list of dictionaries
+    return df.to_dict(orient='records')
+
+def retrieve_chunks_containing_word(word, file_to_filter=None):
+    # Load the Parquet file into a DataFrame
+    df = pd.read_parquet("./.storage/.text_files.parquet")
+
+    # Optionally filter by file
+    if file_to_filter:
+        df = df[df['filename'] == file_to_filter]
+
+    # Filter for chunks containing the word (case-insensitive search)
+    df = df[df['text'].str.contains(word, case=False, na=False)]
+
+    # Remove the 'embedding' column
+    df = df.drop(columns=['embedding'], errors='ignore')
+
+    # Return the results as a list of dictionaries
+    return df.to_dict(orient='records')
+
+def retrieve_chunks_by_index(chunk_ids, file_to_filter=None):
+    # If chunk_ids is a comma-separated string, parse it into a list of integers
+    if isinstance(chunk_ids, str):
+        chunk_ids = [int(id) for id in chunk_ids.split(',')]
+
+    # Ensure chunk_ids is a list
+    if not isinstance(chunk_ids, list):
+        chunk_ids = [chunk_ids]
+
+    # Load the Parquet file into a DataFrame
+    df = pd.read_parquet("./.storage/.text_files.parquet")
+
+    # Optionally filter by file
+    if file_to_filter:
+        df = df[df['filename'] == file_to_filter]
+
+    # Filter by chunk IDs
+    df = df[df['chunk_id'].isin(chunk_ids)]
+
+    # Remove the 'embedding' column
+    df = df.drop(columns=['embedding'], errors='ignore')
+
+    # Return the results as a list of dictionaries
+    return df.to_dict(orient='records')
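All of these retrieval helpers return the same record shape: the parquet rows with the `embedding` column dropped. A hypothetical result of `retrieve_chunks_by_index('1,2')` (the columns come from `generate_text_embeddings_parquet` in utils.py):

```python
# Hypothetical return value; text and filename are made up.
[
    {"text": "first 300-word chunk ...", "page": 1, "chunk_id": 1, "filename": ".report.pdf"},
    {"text": "second 300-word chunk ...", "page": 1, "chunk_id": 2, "filename": ".report.pdf"},
]
```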
+
+def similarity_search(input_text, top_n="5", file_to_filter=None):
+    # Generate the embedding from the input text
+    query_embedding = generate_embeddings(text=input_text, model="text-embedding-3-small", client_type="azure")
+
+    # Convert `top_n` from string to integer
+    top_n = int(top_n)
+
+    # Load the Parquet file into a DataFrame
+    df = pd.read_parquet("./.storage/.text_files.parquet")
+
+    # Optionally filter by file
+    if file_to_filter and file_to_filter != "None":
+        df = df[df['filename'] == file_to_filter]
+
+    # Ensure the embeddings column exists
+    if 'embedding' not in df.columns:
+        raise ValueError("Embeddings not found in the dataset.")
+
+    # Calculate cosine similarity between query_embedding and all embeddings
+    df['similarity'] = df['embedding'].apply(
+        lambda x: cosine_similarity([query_embedding], [np.array(x)])[0][0]
+    )
+
+    # Sort by similarity and take the top N
+    top_matches = df.nlargest(top_n, 'similarity')
+
+    # Drop unnecessary columns and return the result
+    top_matches = top_matches.drop(columns=['embedding', 'similarity'], errors='ignore')
+    return top_matches.to_dict(orient='records')
+
+def random_pick_chunks(num_chunks, file_to_filter=None):
+    # Convert num_chunks to an integer
+    num_chunks = int(num_chunks)
+    # Load the Parquet file into a DataFrame
+    df = pd.read_parquet("./.storage/.text_files.parquet")
+
+    # Optionally filter by specific file if 'file_to_filter' is provided
+    if file_to_filter and file_to_filter != "None":
+        df = df[df['filename'] == file_to_filter]
+
+    # If num_chunks exceeds the available chunks, return everything
+    if num_chunks > len(df):
+        return df.drop(columns=['embedding']).to_dict(orient='records')
+
+    # Randomly select 'num_chunks' from the filtered DataFrame
+    selected_chunks = df.sample(n=num_chunks)
+
+    # Remove the 'embedding' column, as we don't need it in the response
+    selected_chunks = selected_chunks.drop(columns=['embedding'])
+
+    # Return the selected chunks along with all relevant metadata
+    return selected_chunks.to_dict(orient='records')
+
+def final_response(final_message):
+    stream_print(final_message)
+    return final_message
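The ranking in `similarity_search` is plain cosine similarity; for clarity, an equivalent computation with NumPy alone (a sketch, not a replacement for the sklearn call):

```python
import numpy as np

def cosine(a, b):
    # cos(a, b) = (a . b) / (|a| * |b|)
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

print(round(cosine([1.0, 0.0], [1.0, 1.0]), 4))  # 0.7071
```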
+
+########################### Descriptions ##########################
+
+analyze_text_file_description = {
+    "type": "function",
+    "function": {
+        "name": "analyze_text_file",
+        "description": "Analyzes uploaded text files, answering questions, searching for specific words, summarizing content, and referencing pages for enhanced understanding.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "user_query": {
+                    "type": "string",
+                    "description": "A question or query related to the uploaded text files.",
+                },
+            },
+            "required": ["user_query"],
+            "additionalProperties": False,
+        },
+    }
+}
+
+random_pick_chunks_description = {
+    "type": "function",
+    "function": {
+        "name": "random_pick_chunks",
+        "description": "Randomly selects chunks of text from the vector database, providing an overview of the document. This function can be used to quickly gather a general understanding of the content by picking a random subset of text chunks.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "num_chunks": {
+                    "type": "string",
+                    "description": "The number of random chunks to retrieve from the vector database (consider retrieving at least 10).",
+                },
+                "file_to_filter": {
+                    "type": "string",
+                    "description": "Optionally, specify the name of a file from 'available files' to filter chunks. Provide 'None' to include all files.",
+                }
+            },
+            "required": ["num_chunks", "file_to_filter"],
+            "additionalProperties": False
+        }
+    }
+}
+
+retrieve_chunks_from_page_number_description = {
+    "type": "function",
+    "function": {
+        "name": "retrieve_chunks_from_page_number",
+        "description": "Fetches all text chunks corresponding to a specific page number from the vector database. This is useful for retrieving information associated with a particular page.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "page_number": {
+                    "type": "string",
+                    "description": "The page number to retrieve chunks for (e.g., '3').",
+                },
+                "file_to_filter": {
+                    "type": "string",
+                    "description": "Optionally, specify the name of a file from 'available files' to filter chunks. Provide 'None' to include all files.",
+                }
+            },
+            "required": ["page_number", "file_to_filter"],
+            "additionalProperties": False,
+        },
+    }
+}
+
+retrieve_chunks_containing_word_description = {
+    "type": "function",
+    "function": {
+        "name": "retrieve_chunks_containing_word",
+        "description": "Searches the vector database for text chunks containing a specific word or phrase. Useful for targeted keyword searches in the documents.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "word": {
+                    "type": "string",
+                    "description": "The word or phrase to search for.",
+                },
+                "file_to_filter": {
+                    "type": "string",
+                    "description": "Optionally, specify the name of a file from 'available files' to filter chunks. Provide 'None' to include all files.",
+                }
+            },
+            "required": ["word", "file_to_filter"],
+            "additionalProperties": False,
+        },
+    }
+}
+
+retrieve_chunks_by_index_description = {
+    "type": "function",
+    "function": {
+        "name": "retrieve_chunks_by_index",
+        "description": "Retrieves text chunks based on their unique chunk IDs. Use this function to fetch specific chunks identified by their index.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "chunk_ids": {
+                    "type": "string",
+                    "description": "A list of unique chunk IDs to retrieve. Provide this as a comma-separated list of integers in string format (e.g., '1,2,3').",
+                },
+                "file_to_filter": {
+                    "type": "string",
+                    "description": "Optionally, specify the name of a file from 'available files' to filter chunks. Provide 'None' to include all files.",
+                }
+            },
+            "required": ["chunk_ids", "file_to_filter"],
+            "additionalProperties": False,
+        },
+    }
+}
+
+similarity_search_description = {
+    "type": "function",
+    "function": {
+        "name": "similarity_search",
+        "description": "Performs a similarity search using a word or phrase input to find the most relevant text chunks. This function helps locate text similar to a specific query or concept.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "input_text": {
+                    "type": "string",
+                    "description": "A text input to be converted into an embedding vector.",
+                },
+                "top_n": {
+                    "type": "string",
+                    "description": "The number of top matching chunks to retrieve (consider retrieving at least 5).",
+                },
+                "file_to_filter": {
+                    "type": "string",
+                    "description": "Optionally, specify the name of a file from 'available files' to filter chunks. Provide 'None' to include all files.",
+                }
+            },
+            "required": ["input_text", "top_n", "file_to_filter"],
+            "additionalProperties": False,
+        },
+    }
+}
+
+final_response_description = {
+    "type": "function",
+    "function": {
+        "name": "final_response",
+        "description": "This function is called when all necessary information has been gathered through the tools, and the response is ready to be sent to the user.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "final_message": {
+                    "type": "string",
+                    "description": "This response describes the process of gathering information and explains the tools used to retrieve the data. It then provides a comprehensive and detailed answer to the user's query, referencing the specific pages or filenames where the information was sourced. Finally, it concludes with follow-up questions to guide the user toward deeper insights or further exploration of the topic.",
+                },
+            },
+            "required": ["final_message"],
+            "additionalProperties": False,
+        },
+    }
+}
+
+########################### Main Agent ############################
+
+def analyze_text_file(user_query):
+    tools_descriptions = [final_response_description, random_pick_chunks_description, similarity_search_description, retrieve_chunks_by_index_description, retrieve_chunks_containing_word_description, retrieve_chunks_from_page_number_description]
+    tools_functions = [final_response, random_pick_chunks, similarity_search, retrieve_chunks_by_index, retrieve_chunks_containing_word, retrieve_chunks_from_page_number]
+
+    # load messages
+    messages = load_messages(file_path="./.storage/.text_agent_messages.json")
+    available_text_files = [f for f in os.listdir('./.storage') if f.endswith(('.pdf', '.txt'))]
+    if len(messages) < 1:
+        messages = insert_message(messages, "system", """You are a specialized text analysis system designed to analyze and extract insights from text files, such as PDFs. Your task is to extract information step-by-step using a range of tools, each tailored for specific types of queries.
+
+    The text from all 'available files' has been split into 300-word chunks, stored in a vector database. Each chunk has a unique `chunk_id` and is ordered sequentially.
+
+    Effective Tool Usage Recommendations:
+    1. Start with random sampling for a high-level overview of the content or to identify starting points for deeper analysis.
+    2. Use the most specific tool (e.g., word search or page retrieval) to gather targeted results.
+    3. Use similarity search when exact matches are unavailable or context needs to be inferred.
+    4. Use chunk retrieval by ID or page to access additional context or subsequent lines when relevant information is found but incomplete.
+
+    Once sufficient information has been gathered, generate a response that first explains the process and tools used to retrieve the information, then provides a long, comprehensive, and detailed answer to the user's input. It is necessary to reference the specific page numbers where the information was retrieved (mention filenames if multiple files were used). Conclude with relevant follow-up questions to guide the user toward deeper insights or further exploration of the topic.""")
+    messages = insert_message(messages, "user", f"Answer this request -> {user_query}, considering that these are your 'available files': {available_text_files}")
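Each iteration of the loop below appends a matched pair of messages via `insert_tool_message` (utils.py): an assistant message carrying the tool call and a `tool` message carrying the result. A sketch of the pair persisted to `.text_agent_messages.json`, with hypothetical ids and values:

```python
[
    {"role": "assistant", "tool_calls": [{
        "id": "call_xyz", "type": "function",
        "function": {"name": "similarity_search",
                     "arguments": '{"input_text": "revenue", "top_n": "5", "file_to_filter": "None"}'}
    }]},
    # The tool message echoes the arguments plus a "response" key,
    # matching insert_tool_message's json.dumps({**function_args, "response": ...}).
    {"role": "tool", "tool_call_id": "call_xyz",
     "content": '{"input_text": "revenue", "top_n": "5", "file_to_filter": "None", "response": "[...chunks...]"}'}
]
```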
+
+    steps = 0
+    max_steps = 10
+    while steps < max_steps:
+        response = generate_response("openai", "gpt-4-turbo", messages, tools_descriptions, tool_choice = "required", parallel_tool_calls=False)
+        tool_result = execute_function(response, tools_functions)
+        insert_tool_message(messages, response, tool_result)
+        save_messages(messages, file_path="./.storage/.text_agent_messages.json")
+        response_message = response.choices[0].message
+        tool_calls = getattr(response_message, 'tool_calls', None)
+        if tool_calls and tool_calls[0].function.name == "final_response":
+            return tool_result
+        steps += 1
+
+    return "I was unable to find a solution in time"
\ No newline at end of file
diff --git a/apps/hal9/tools/website.py b/apps/hal9/tools/website.py
index b897de7f..043bab3f 100644
--- a/apps/hal9/tools/website.py
+++ b/apps/hal9/tools/website.py
@@ -1,9 +1,8 @@
 import hal9 as h9
 import openai
 import os
-import json
 
-def build_website(prompt):
+def website_generator(prompt):
     """ Builds or modifies a website based on user description or a change request
     'prompt' with user change or requirements
     """
@@ -40,3 +39,23 @@ def build_website(prompt):
     summary = h9.complete(completion, messages, show = False)
     print(summary)
     return summary
+
+website_generator_description = {
+    "type": "function",
+    "function": {
+        "name": "website_generator",
+        "description": "This function creates or modifies a website based on a user's description or change requests. It dynamically generates HTML, CSS, JavaScript, and other web assets as specified in the input prompt. The function maintains a stateful interaction, allowing for iterative website building and modification.",
+        "strict": True,
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "prompt": {
+                    "type": "string",
+                    "description": "A user-provided description of the website requirements or specific modification requests.",
+                },
+            },
+            "required": ["prompt"],
+            "additionalProperties": False,
+        },
+    },
+}
\ No newline at end of file
diff --git a/apps/hal9/utils.py b/apps/hal9/utils.py
new file mode 100644
index 00000000..c51a1f04
--- /dev/null
+++ b/apps/hal9/utils.py
@@ -0,0 +1,329 @@
+import json
+import os
+import urllib.parse
+import urllib.request
+import requests
+import time
+from typing import Literal, List, Dict, Any, Union, Optional
+from clients import openai_client, azure_openai_client
+from groq import Groq
+from openai import AzureOpenAI, OpenAI
+import fitz
+from io import BytesIO
+import pandas as pd
+from concurrent.futures import ThreadPoolExecutor
+import re
+
+# Define the allowed client types.
+ClientType = Literal["openai", "azure", "groq"]
+
+def get_client(client_type: ClientType) -> Union[OpenAI, AzureOpenAI, Groq]:
+    """
+    Returns the appropriate client instance based on the given type.
+
+    Parameters:
+        client_type (ClientType): The type of client ("openai", "azure", "groq").
+
+    Returns:
+        Union[OpenAI, AzureOpenAI, Groq]: An instance of the selected client.
+
+    Raises:
+        ValueError: If the provided client type is not supported.
+ """ + if client_type == "openai": + return openai_client + elif client_type == "azure": + return azure_openai_client + elif client_type == "groq": + return Groq() + else: + raise ValueError(f"Unsupported client type: {client_type}") + +def generate_response( + client_type: ClientType, + model: str, + messages: List[Dict[str, Any]], + tools: Optional[List] = None, + tool_choice: Optional[str] = None, + parallel_tool_calls: Optional[bool] = True, + temperature: Optional[float] = None, + seed: Optional[int] = None, + top_p: Optional[float] = None, + frequency_penalty: Optional[float] = None, + max_completion_tokens: Optional[int] = None, + n: int = 1 +) -> Dict[str, Any]: + """ + Generates a response using the appropriate client based on the specified type. + + Parameters: + client_type (ClientType): The type of client ("openai", "azure", "groq"). + model (str): The model to use for generating the response. + messages (List[Dict[str, Any]]): List of messages to provide as context. + tools (Optional[List]): Available tools for the model. Default is None. + tool_choice (Optional[str]): The selected tool to use. Default is None. + temperature (Optional[float]): Controls randomness in the output (0 to 1). Default is None. + seed (Optional[int]): Seed for reproducible randomness. Default is None. + top_p (Optional[float]): Probability mass for nucleus sampling. Default is None. + frequency_penalty (Optional[float]): Penalizes repetition. Default is None. + max_completion_tokens (Optional[int]): Max tokens for the response. Default is None. + n (int): Number of responses to generate. Default is 1. + + Returns: + Dict[str, Any]: The response generated by the selected client. + """ + # Get the appropriate client instance. + client = get_client(client_type) + + # Prepare the payload dynamically. + payload = { + "model": model, + "messages": messages, + "tools": tools, + "tool_choice": tool_choice, + "temperature": temperature, + "seed": seed, + "top_p": top_p, + "frequency_penalty": frequency_penalty, + "max_tokens": max_completion_tokens, + "n": n + } + + if tools is not None: + payload["parallel_tool_calls"] = parallel_tool_calls + + # Generate the response using the client's completion API. + response = client.chat.completions.create(**payload) + + return response + +def load_messages(file_path="./.storage/.messages.json") -> List[Dict[str, Any]]: + """ + Loads messages from a JSON file located in the './.storage' directory. + + Returns: + List[Dict[str, Any]]: A list of messages if the file exists and is valid. + """ + # Create the .storage directory if it doesn't exist + if not os.path.exists("./.storage"): + os.makedirs("./.storage") + if not os.path.exists(file_path): + return [] + else : + with open(file_path, "r", encoding="utf-8") as file: + messages = json.load(file) + + return messages + +def save_messages(messages: List[Dict[str, Any]], file_path="./.storage/.messages.json") -> None: + """ + Saves messages to a JSON file located in the './.storage' directory. + + Args: + messages (List[Dict[str, Any]]): A list of messages to be saved. 
+ """ + + # Create the .storage directory if it doesn't exist + if not os.path.exists("./.storage"): + os.makedirs("./.storage") + with open(file_path, "w", encoding="utf-8") as file: + json.dump(messages, file, ensure_ascii=False, indent=4) + +def insert_message(messages , role, content, tool_call_id=None): + if tool_call_id: + return None + else: + messages.append({"role": role, "content": content}) + return messages + +def execute_function(model_response, functions): + # Extract the message from the response. + try: + response_message = model_response.choices[0].message + except (IndexError, AttributeError) as e: + print(f"Error extracting message from model response: {e}") + return + + # Access the tool calls (if any) from the message. + tool_calls = getattr(response_message, 'tool_calls', None) + + if not tool_calls: + print("No tool calls found.") + return + + # Iterate over the tool calls and extract relevant information. + for tool_call in tool_calls: + function_name = tool_call.function.name + try: + arguments = ast.literal_eval(tool_call.function.arguments) + except AttributeError as e: + print(f"Error accessing arguments: {e}") + continue + # Convert arguments into a string format for logging or execution. + args_str = ', '.join(f"{k}={repr(v)}" for k, v in arguments.items()) + + # Add all the functions into the exec context + context = {} + for func in functions: + context[func.__name__] = func + + # Prepare the code string to execute + code_to_exec = f"result = {function_name}({args_str})" + + # Execute the code with exec(), but ensure proper error handling. + try: + exec(code_to_exec, context) + return context['result'] + except Exception as e: + print(f"Error executing function '{function_name}': {e}") + raise + +def stream_print(text: str): + for char in text: + print(char, end="", flush=True) + time.sleep(0.02) + +def insert_tool_message(messages, model_response, tool_result): + tool_calls = model_response.choices[0].message.tool_calls + + if tool_calls: + for tool_call in tool_calls: + messages.append({ + "role": "assistant", + "tool_calls": [{ + "id": tool_call.id, + "type": "function", + "function": { + "arguments": tool_call.function.arguments, + "name": tool_call.function.name, + }, + }] + }) + function_args = json.loads(tool_call.function.arguments, strict=False) + + tool_content = json.dumps({**function_args, "response": str(tool_result)}) + + messages.append({ + "role": "tool", + "content": tool_content, + "tool_call_id": tool_call.id + }) + +def is_url(prompt): + result = urllib.parse.urlparse(prompt) + return all([result.scheme, result.netloc]) + +def download_file(url): + # Create the .storage directory if it doesn't exist + if not os.path.exists("./.storage"): + os.makedirs("./.storage") + filename = url.split("/")[-1] + modified_filename = f"./.storage/.{filename}" + + response = requests.get(url) + + if response.status_code == 200: + with open(modified_filename, 'wb') as file: + file.write(response.content) + else: + print(f"Failed to download the file. Status code: {response.status_code}") + +def generate_embeddings(text, model, client_type): + client = get_client(client_type) + response = client.embeddings.create( + input=text, + model=model) + + return response.data[0].embedding + +def split_text(text, n_words=300, overlap=0): + """ + Splits a text into chunks of `n_words` words with an overlap of `overlap` words. + + Args: + text (str): The input text to be split. + n_words (int): Number of words per chunk. 
+ overlap (int): Number of overlapping words between consecutive chunks. + + Returns: + list: A list of text chunks. + """ + # Validate inputs + if overlap >= n_words: + raise ValueError("Overlap must be smaller than the number of words per chunk.") + + # Split the text into words + words = text.split() + chunks = [] + + # Generate the chunks + start = 0 + while start < len(words): + end = start + n_words + chunk = words[start:end] + chunks.append(" ".join(chunk)) + + # Move the start point forward, with overlap + start += n_words - overlap + + return chunks + +def process_chunk(chunk_info): + chunk, page_num, model, client_type = chunk_info + embedding = generate_embeddings(chunk, model=model, client_type=client_type) + return { + "text": chunk, + "embedding": embedding, + "page": page_num + 1 # Page numbers start from 1 + } + +def generate_text_embeddings_parquet(url, model="text-embedding-3-small", client_type="azure", n_words=300, overlap=0, max_threads=8): + # Create the .storage directory if it doesn't exist + if not os.path.exists("./.storage"): + os.makedirs("./.storage") + # Download and read the PDF + response = requests.get(url) + pdf_document = fitz.open(stream=BytesIO(response.content)) + + # Prepare chunk info for parallel processing + chunk_info_list = [] + for page_num in range(len(pdf_document)): + page = pdf_document[page_num] + page_text = page.get_text() + + # Split the page text into chunks + text_chunks = split_text(page_text, n_words=n_words, overlap=overlap) + + # Add chunk info to the list + for chunk in text_chunks: + chunk_info_list.append((chunk, page_num, model, client_type)) + + pdf_document.close() + + # Process chunks in parallel + rows = [] + with ThreadPoolExecutor(max_threads) as executor: + for result in executor.map(process_chunk, chunk_info_list): + rows.append(result) + + # Create the DataFrame + df = pd.DataFrame(rows) + + # Add a global chunk ID column + df['chunk_id'] = range(len(df)) + df['filename'] = '.' + url.split("/")[-1] + + # Save as Parquet + df.to_parquet("./.storage/.text_files.parquet", engine="pyarrow", index=False) + +def load_json_file(json_path): + if os.path.exists(json_path): + with open(json_path, 'r') as file: + return json.load(file) + return [] + +def extract_code_block(code: str, language: str) -> str: + pattern = rf"```{language}\n(.*?)```" + match = re.search(pattern, code, re.DOTALL) + return match.group(1) if match else "" \ No newline at end of file
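A quick usage sketch of `split_text` (assuming utils.py is importable), showing how the window advances by `n_words - overlap` each step:

```python
from utils import split_text

words = " ".join(str(i) for i in range(10))  # "0 1 2 ... 9"
print(split_text(words, n_words=4, overlap=1))
# ['0 1 2 3', '3 4 5 6', '6 7 8 9', '9']
```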