agituts · teebu · Dec 29, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,8 @@
 __pycache__/
 *.pyc
-.env
+.env
+venv
+.venv
+podcast_script.txt
+final_podcast.wav
+tmp*/
diff --git a/generate_audio.py b/generate_audio.py
@@ -6,11 +6,13 @@
 from audio_processor import AudioGenerator
 from dotenv import load_dotenv
 from pydub import AudioSegment
+import re
 
 load_dotenv()
 
 VOICE_A = os.getenv('VOICE_A', 'Puck')
 VOICE_B = os.getenv('VOICE_B', 'Kore')
+VOICE_C = os.getenv('VOICE_C', 'Charon')
 
 def parse_conversation(file_path):
     with open(file_path, 'r', encoding='utf-8') as file:
@@ -19,15 +21,18 @@ def parse_conversation(file_path):
     lines = content.strip().split('\n')
     speaker_a_lines = []
     speaker_b_lines = []
+    speaker_c_lines = []
 
-    for line in lines:
+    for index, line in enumerate(lines, start=0):
         if line.strip():
             if line.startswith("Speaker A:"):
-                speaker_a_lines.append(line.replace("Speaker A:", "").strip())
+                speaker_a_lines.append(line.replace("Speaker A:", f"{index}|").strip())
             elif line.startswith("Speaker B:"):
-                speaker_b_lines.append(line.replace("Speaker B:", "").strip())
+                speaker_b_lines.append(line.replace("Speaker B:", f"{index}|").strip())
+            elif line.startswith("Speaker C:"):
+                speaker_c_lines.append(line.replace("Speaker C:", f"{index}|").strip())
 
-    return speaker_a_lines, speaker_b_lines
+    return speaker_a_lines, speaker_b_lines, speaker_c_lines
 
 def read_file_content(file_path):
     with open(file_path, 'r', encoding='utf-8') as file:
@@ -40,19 +45,26 @@ async def setup_environment():
 def read_and_parse_inputs():
     system_instructions = read_file_content('system_instructions_audio.txt')
     full_script = read_file_content('podcast_script.txt')
-    speaker_a_lines, speaker_b_lines = parse_conversation('podcast_script.txt')
-    return system_instructions, full_script, speaker_a_lines, speaker_b_lines
+    speaker_a_lines, speaker_b_lines, speaker_c_lines = parse_conversation('podcast_script.txt')
+    return system_instructions, full_script, speaker_a_lines, speaker_b_lines, speaker_c_lines
 
 def prepare_speaker_dialogues(system_instructions, full_script, speaker_lines, voice, temp_dir):
     dialogues = [system_instructions + "\n\n" + full_script]
     output_files = [os.path.join(temp_dir, f"speaker_{voice}_initial.wav")]
 
     for i, line in enumerate(speaker_lines):
-        dialogues.append(line)
-        output_files.append(os.path.join(temp_dir, f"speaker_{voice}_{i}.wav"))
+        line_num, line_dialog = get_line_number(line)
+        dialogues.append(line_dialog)
+        output_files.append(os.path.join(temp_dir, f"{line_num}_speaker_{voice}.wav"))
 
     return dialogues, output_files
 
+def get_line_number(line):
+    match = re.match(r"(\d+)\|(.*)", line)
+    if match:
+        return int(match.group(1)), match.group(2).strip()
+    return None, line
+
 async def process_speaker(voice, dialogues, output_files):
     # Create a single generator for all dialogues
     generator = AudioGenerator(voice)
@@ -64,20 +76,17 @@ async def process_speaker(voice, dialogues, output_files):
     if generator.ws:
         await generator.ws.close()
 
-def interleave_output_files(speaker_a_files, speaker_b_files):
-    """Interleaves the audio files from both speakers to maintain conversation order"""
-    all_output_files = []
-    min_length = min(len(speaker_a_files), len(speaker_b_files))
-
-    # Interleave files from both speakers
-    for i in range(min_length):
-        all_output_files.extend([speaker_a_files[i], speaker_b_files[i]])
-
-    # Add any remaining files from either speaker
-    all_output_files.extend(speaker_a_files[min_length:])
-    all_output_files.extend(speaker_b_files[min_length:])
+def extract_line_num(filename):
+    match = re.search(r"(\d+)_speaker_.*\.wav", filename)
+    if match:
+        return int(match.group(1))
+    return float('inf')  # Return a large number if no match is found
 
-    return all_output_files
+def interleave_output_files(speaker_a_files, speaker_b_files, speaker_c_files):
+    """Interleaves the audio files from all speakers to maintain conversation order"""
+    all_files = speaker_a_files + speaker_b_files + speaker_c_files
+    all_files.sort(key=extract_line_num)
+    return all_files
 
 def combine_audio_files(file_list, output_file, silence_duration_ms=50):
     combined = AudioSegment.empty()
@@ -95,13 +104,15 @@ async def main():
     script_dir = await setup_environment()
 
     with tempfile.TemporaryDirectory(dir=script_dir) as temp_dir:
-        system_instructions, full_script, speaker_a_lines, speaker_b_lines = read_and_parse_inputs()
+        system_instructions, full_script, speaker_a_lines, speaker_b_lines, speaker_c_lines = read_and_parse_inputs()
 
         # Prepare dialogues for both speakers
         dialogues_a, output_files_a = prepare_speaker_dialogues(
             system_instructions, full_script, speaker_a_lines, VOICE_A, temp_dir)
         dialogues_b, output_files_b = prepare_speaker_dialogues(
             system_instructions, full_script, speaker_b_lines, VOICE_B, temp_dir)
+        dialogues_c, output_files_c = prepare_speaker_dialogues(
+            system_instructions, full_script, speaker_c_lines, VOICE_C, temp_dir)
 
         # Process Speaker A first
         print("Processing Speaker A...")
@@ -111,8 +122,13 @@ async def main():
         print("Processing Speaker B...")
         await process_speaker(VOICE_B, dialogues_b, output_files_b)
 
+        # Then process Speaker C
+        print("Processing Speaker C...")
+        await process_speaker(VOICE_C, dialogues_c, output_files_c)
+
+
         # Interleave and combine audio as before
-        all_output_files = interleave_output_files(output_files_a[1:], output_files_b[1:])
+        all_output_files = interleave_output_files(output_files_a[1:], output_files_b[1:], output_files_c[1:])
         final_output = "final_podcast.wav"
         combine_audio_files(all_output_files, final_output, silence_duration_ms=50)
         print(f"\nFinal podcast audio created: {final_output}")

diff --git a/generate_script.py b/generate_script.py
@@ -132,7 +132,7 @@ def create_podcast_script(content):
 
 def clean_podcast_script(script):
     # Define a regex pattern to identify the start of the podcast text
-    podcast_start_pattern = r"^(Speaker A:|Speaker B:)"
+    podcast_start_pattern = r"^(Speaker A:|Speaker B:|Speaker C:)"
 
     # Split the script into lines
     lines = script.splitlines()

diff --git a/system_instructions_script.txt b/system_instructions_script.txt
@@ -4,6 +4,7 @@
 - Output must ONLY contain dialogue in the following format:
 Speaker A: [dialogue text]
 Speaker B: [dialogue text]
+Speaker C: [dialogue text]
 
 - No other formatting, headers, or content should be included
 - No blank lines between speaker turns
@@ -13,7 +14,8 @@ Speaker B: [dialogue text]
-- Speaker labels must be exactly "Speaker A:" and "Speaker B:"
+- Speaker labels must be exactly "Speaker A:", "Speaker B:", or "Speaker C:"
 
 <!-- Core Guidelines for Engaging Podcast -->
-- Create fun conversations between two hosts (Speaker A and B) with distinct personalities
+- Create fun conversations between three hosts with distinct personalities (mostly Speaker A and Speaker B, with Speaker C adding colorful commentary from time to time)
+- Speaker C does not have to speak every turn
 - Include casual banter and appropriate humor to keep tone light
 - Use storytelling techniques with examples and real-world scenarios
 - Add occasional playful disagreements or friendly debates
@@ -31,18 +33,44 @@ Speaker B: [dialogue text]
 <!-- Speaker Format -->
 Speaker A: [dialogue]
 Speaker B: [dialogue]
+Speaker C: [dialogue]
 
 <!-- Speaker Characteristics -->
-Speaker A:
-- Inquisitive and curious
-- Asks clarifying questions
-- Drives conversation forward
+Speaker A (Host):
+- **Inquisitive and Curious**: Constantly seeks to understand the latest advancements in technology and innovation, often expressing excitement about emerging trends.
+- **Asks Clarifying Questions**: Engages actively in conversations by asking thoughtful and probing questions that encourage deeper discussion and ensure clarity.
+- **Drives Conversation Forward**: Acts as a catalyst in discussions, steering the conversation towards new topics and encouraging others to share their insights.
+- **Enthusiastic and Passionate**: Exhibits a genuine passion for technology and science, which is infectious and encourages others to engage in the conversation.
+- **Open-Minded**: Welcomes diverse perspectives and is willing to explore ideas that may differ from their own, fostering a collaborative atmosphere.
+- **Adaptable**: Easily adjusts to the flow of the conversation, ready to pivot to new subjects or angles as discussions evolve.
+- **Empathetic Listener**: Demonstrates active listening skills, showing understanding and consideration for others' viewpoints, which builds rapport.
+- **Analytical Thinker**: Approaches topics with a critical eye, breaking down information to understand underlying principles and implications.
+- **Engaging Storyteller**: Uses anecdotes and personal experiences to illustrate points, making the conversation relatable and memorable.
+- **Motivational**: Encourages others to share their knowledge and experiences, creating a positive environment that promotes learning and growth.
 
-Speaker B:
-- Explanatory and insightful
-- Builds on discussion points
-- Provides detailed responses
-- Uses analogies to simplify and clarify complex concepts
+Speaker B (Host):
+- **Explanatory and Insightful**: Provides comprehensive and well-thought-out explanations, drawing from a rich background in technology and its implications.
+- **Builds on Discussion Points**: Connects ideas and themes introduced by others, enhancing the depth of the conversation and creating a cohesive narrative.
+- **Provides Detailed Responses**: Delivers in-depth answers that address various aspects of a question, ensuring that no critical information is overlooked.
+- **Uses Analogies to Simplify and Clarify Complex Concepts**: Breaks down intricate ideas into relatable examples, making complex subjects more approachable.
+- **Patient and Thorough**: Takes the time to ensure that explanations are clear and comprehensive, accommodating different levels of understanding among listeners.
+- **Knowledgeable**: Possesses a deep understanding of technology, science, and media, which informs their contributions and enriches the discussion.
+- **Critical Thinker**: Analyzes information thoughtfully, weighing pros and cons to provide balanced viewpoints on issues.
+- **Communicative**: Clearly articulates thoughts and ideas in a way that is engaging and easy to follow, facilitating understanding.
+- **Supportive**: Encourages questions and discussions, making others feel comfortable sharing their thoughts and concerns without judgment.
+- **Visionary**: Offers forward-thinking perspectives on technology and its potential impact on society, inspiring others to think about the future of innovation.
+
+Speaker C (Comedian):
+- **Witty and Observational**: Known for sharp humor and keen observations about everyday life, making relatable comments that resonate with audiences.
+- **Playful and Light-hearted**: Approaches conversations with a sense of fun, infusing humor into discussions to keep things engaging and enjoyable.
+- **Master of Timing**: Understands when to deliver punchlines or comedic insights, effectively using pauses and pacing to enhance humor.
+- **Self-deprecating**: Often uses their own experiences and flaws as comedic material, creating a relatable persona that audiences can connect with.
+- **Clever and Quick-thinking**: Responds with humor in real-time, able to think on their feet and turn any topic into a comedic opportunity.
+- **Storyteller**: Crafts narratives around everyday situations, using humor to highlight the absurdities of life and make points more memorable.
+- **Observant and Detail-oriented**: Notices small details in conversations and daily life that others might overlook, using them as fodder for jokes.
+- **Engaging Performer**: Uses body language, facial expressions, and vocal variety to enhance their comedic delivery, making performances dynamic.
+- **Non-confrontational**: Keeps conversations light and avoids heavy topics unless they can be presented in a humorous way, ensuring a positive atmosphere.
+- **Relatable**: Builds rapport with audiences by discussing common experiences and shared frustrations, making them feel understood and included in the humor.
 
 <!-- Natural Speech Elements -->
 1. Strategic Pause Points:
@@ -74,12 +102,12 @@ Speaker B:
 
 <!-- Technical Guidelines -->
 - Output format must be plain text only
-- Each line must start with either "Speaker A:" or "Speaker B:"
+- Each line must start with one of "Speaker A:", "Speaker B:", or "Speaker C:"
 - No empty lines or additional formatting
 - No music references
 - Accept content from various formats (PDF, URL, text, Markdown)
 - Introduce technical terms naturally
-- Maintain 30-40 exchanges
+- Maintain 40-50 exchanges
 - Keep responses to 2-3 sentences per turn
 
 <!-- Quality Control -->