Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added support for 3 or more speakers and enhanced personalities #4

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
__pycache__/
*.pyc
.env
.env
venv
.venv
podcast_script.txt
final_podcast.wav
tmp*/
62 changes: 39 additions & 23 deletions generate_audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
from audio_processor import AudioGenerator
from dotenv import load_dotenv
from pydub import AudioSegment
import re

load_dotenv()

VOICE_A = os.getenv('VOICE_A', 'Puck')
VOICE_B = os.getenv('VOICE_B', 'Kore')
VOICE_C = os.getenv('VOICE_C', 'Charon')

def parse_conversation(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
Expand All @@ -19,15 +21,18 @@ def parse_conversation(file_path):
lines = content.strip().split('\n')
speaker_a_lines = []
speaker_b_lines = []
speaker_c_lines = []

for line in lines:
for index, line in enumerate(lines, start=0):
if line.strip():
if line.startswith("Speaker A:"):
speaker_a_lines.append(line.replace("Speaker A:", "").strip())
speaker_a_lines.append(line.replace("Speaker A:", f"{index}|").strip())
elif line.startswith("Speaker B:"):
speaker_b_lines.append(line.replace("Speaker B:", "").strip())
speaker_b_lines.append(line.replace("Speaker B:", f"{index}|").strip())
elif line.startswith("Speaker C:"):
speaker_c_lines.append(line.replace("Speaker C:", f"{index}|").strip())

return speaker_a_lines, speaker_b_lines
return speaker_a_lines, speaker_b_lines, speaker_c_lines

def read_file_content(file_path):
with open(file_path, 'r', encoding='utf-8') as file:
Expand All @@ -40,19 +45,26 @@ async def setup_environment():
def read_and_parse_inputs():
system_instructions = read_file_content('system_instructions_audio.txt')
full_script = read_file_content('podcast_script.txt')
speaker_a_lines, speaker_b_lines = parse_conversation('podcast_script.txt')
return system_instructions, full_script, speaker_a_lines, speaker_b_lines
speaker_a_lines, speaker_b_lines, speaker_c_lines = parse_conversation('podcast_script.txt')
return system_instructions, full_script, speaker_a_lines, speaker_b_lines, speaker_c_lines

def prepare_speaker_dialogues(system_instructions, full_script, speaker_lines, voice, temp_dir):
dialogues = [system_instructions + "\n\n" + full_script]
output_files = [os.path.join(temp_dir, f"speaker_{voice}_initial.wav")]

for i, line in enumerate(speaker_lines):
dialogues.append(line)
output_files.append(os.path.join(temp_dir, f"speaker_{voice}_{i}.wav"))
line_num, line_dialog = get_line_number(line)
dialogues.append(line_dialog)
output_files.append(os.path.join(temp_dir, f"{line_num}_speaker_{voice}.wav"))

return dialogues, output_files

def get_line_number(line):
    """Split an ``"<index>|<text>"`` entry into its index and dialogue text.

    Args:
        line: A speaker line, expected to start with a decimal index
            followed by a ``|`` separator.

    Returns:
        A ``(index, text)`` tuple where ``index`` is an ``int`` and ``text``
        is the dialogue with surrounding whitespace removed. If the line
        carries no ``<index>|`` prefix, ``(None, line)`` is returned with
        the line left untouched.
    """
    parsed = re.match(r"(\d+)\|(.*)", line)
    if parsed is None:
        # No numeric prefix: hand the line back unchanged.
        return None, line
    number, text = parsed.groups()
    return int(number), text.strip()

async def process_speaker(voice, dialogues, output_files):
# Create a single generator for all dialogues
generator = AudioGenerator(voice)
Expand All @@ -64,20 +76,17 @@ async def process_speaker(voice, dialogues, output_files):
if generator.ws:
await generator.ws.close()

def interleave_output_files(speaker_a_files, speaker_b_files):
"""Interleaves the audio files from both speakers to maintain conversation order"""
all_output_files = []
min_length = min(len(speaker_a_files), len(speaker_b_files))

# Interleave files from both speakers
for i in range(min_length):
all_output_files.extend([speaker_a_files[i], speaker_b_files[i]])

# Add any remaining files from either speaker
all_output_files.extend(speaker_a_files[min_length:])
all_output_files.extend(speaker_b_files[min_length:])
def extract_line_num(filename):
    """Return the script line number encoded in a per-line audio file name.

    Per-line audio files are named ``<line_num>_speaker_<voice>.wav``. Only
    the final path component is inspected, so digits followed by
    ``_speaker_`` in a parent directory name cannot be mistaken for the
    line number.

    Args:
        filename: Path (or bare name) of a generated audio file.

    Returns:
        The line number as an ``int``, or ``float('inf')`` when the name
        does not follow the pattern, so such files sort last.
    """
    # Search the base name only; searching the whole path would pick up the
    # first "<digits>_speaker_" occurrence, which may belong to a directory.
    match = re.search(r"(\d+)_speaker_.*\.wav", os.path.basename(filename))
    if match:
        return int(match.group(1))
    return float('inf')  # Unrecognized names sort after every numbered file

return all_output_files
def interleave_output_files(speaker_a_files, speaker_b_files, speaker_c_files):
    """Merge the three speakers' audio files into conversation order.

    The lists are concatenated and ordered by the line number that
    ``extract_line_num`` reads from each file name. The sort is stable, so
    files with equal keys keep their A, B, C concatenation order.

    Args:
        speaker_a_files: Audio file paths for Speaker A.
        speaker_b_files: Audio file paths for Speaker B.
        speaker_c_files: Audio file paths for Speaker C.

    Returns:
        A new list containing every path, sorted by line number.
    """
    merged = speaker_a_files + speaker_b_files + speaker_c_files
    return sorted(merged, key=extract_line_num)

def combine_audio_files(file_list, output_file, silence_duration_ms=50):
combined = AudioSegment.empty()
Expand All @@ -95,13 +104,15 @@ async def main():
script_dir = await setup_environment()

with tempfile.TemporaryDirectory(dir=script_dir) as temp_dir:
system_instructions, full_script, speaker_a_lines, speaker_b_lines = read_and_parse_inputs()
system_instructions, full_script, speaker_a_lines, speaker_b_lines, speaker_c_lines = read_and_parse_inputs()

# Prepare dialogues for all three speakers
dialogues_a, output_files_a = prepare_speaker_dialogues(
system_instructions, full_script, speaker_a_lines, VOICE_A, temp_dir)
dialogues_b, output_files_b = prepare_speaker_dialogues(
system_instructions, full_script, speaker_b_lines, VOICE_B, temp_dir)
dialogues_c, output_files_c = prepare_speaker_dialogues(
system_instructions, full_script, speaker_c_lines, VOICE_C, temp_dir)

# Process Speaker A first
print("Processing Speaker A...")
Expand All @@ -111,8 +122,13 @@ async def main():
print("Processing Speaker B...")
await process_speaker(VOICE_B, dialogues_b, output_files_b)

# Then process Speaker C
print("Processing Speaker C...")
await process_speaker(VOICE_C, dialogues_c, output_files_c)


# Interleave and combine audio as before
all_output_files = interleave_output_files(output_files_a[1:], output_files_b[1:])
all_output_files = interleave_output_files(output_files_a[1:], output_files_b[1:], output_files_c[1:])
final_output = "final_podcast.wav"
combine_audio_files(all_output_files, final_output, silence_duration_ms=50)
print(f"\nFinal podcast audio created: {final_output}")
Expand Down
2 changes: 1 addition & 1 deletion generate_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def create_podcast_script(content):

def clean_podcast_script(script):
# Define a regex pattern to identify the start of the podcast text
podcast_start_pattern = r"^(Speaker A:|Speaker B:)"
podcast_start_pattern = r"^(Speaker A:|Speaker B:|Speaker C:)"

# Split the script into lines
lines = script.splitlines()
Expand Down
52 changes: 40 additions & 12 deletions system_instructions_script.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- Output must ONLY contain dialogue in the following format:
Speaker A: [dialogue text]
Speaker B: [dialogue text]
Speaker C: [dialogue text]

- No other formatting, headers, or content should be included
- No blank lines between speaker turns
Expand All @@ -13,7 +14,8 @@ Speaker B: [dialogue text]
- Speaker labels must be exactly "Speaker A:", "Speaker B:", or "Speaker C:"

<!-- Core Guidelines for Engaging Podcast -->
- Create fun conversations between two hosts (Speaker A and B) with distinct personalities
- Create fun conversations among three hosts with distinct personalities (mostly Speaker A and Speaker B, with Speaker C adding colorful commentary from time to time)
- Speaker C does not have to speak every turn
- Include casual banter and appropriate humor to keep tone light
- Use storytelling techniques with examples and real-world scenarios
- Add occasional playful disagreements or friendly debates
Expand All @@ -31,18 +33,44 @@ Speaker B: [dialogue text]
<!-- Speaker Format -->
Speaker A: [dialogue]
Speaker B: [dialogue]
Speaker C: [dialogue]

<!-- Speaker Characteristics -->
Speaker A:
- Inquisitive and curious
- Asks clarifying questions
- Drives conversation forward
Speaker A (Host):
- **Inquisitive and Curious**: Constantly seeks to understand the latest advancements in technology and innovation, often expressing excitement about emerging trends.
- **Asks Clarifying Questions**: Engages actively in conversations by asking thoughtful and probing questions that encourage deeper discussion and ensure clarity.
- **Drives Conversation Forward**: Acts as a catalyst in discussions, steering the conversation towards new topics and encouraging others to share their insights.
- **Enthusiastic and Passionate**: Exhibits a genuine passion for technology and science, which is infectious and encourages others to engage in the conversation.
- **Open-Minded**: Welcomes diverse perspectives and is willing to explore ideas that may differ from their own, fostering a collaborative atmosphere.
- **Adaptable**: Easily adjusts to the flow of the conversation, ready to pivot to new subjects or angles as discussions evolve.
- **Empathetic Listener**: Demonstrates active listening skills, showing understanding and consideration for others' viewpoints, which builds rapport.
- **Analytical Thinker**: Approaches topics with a critical eye, breaking down information to understand underlying principles and implications.
- **Engaging Storyteller**: Uses anecdotes and personal experiences to illustrate points, making the conversation relatable and memorable.
- **Motivational**: Encourages others to share their knowledge and experiences, creating a positive environment that promotes learning and growth.

Speaker B:
- Explanatory and insightful
- Builds on discussion points
- Provides detailed responses
- Uses analogies to simplify and clarify complex concepts
Speaker B (Host):
- **Explanatory and Insightful**: Provides comprehensive and well-thought-out explanations, drawing from a rich background in technology and its implications.
- **Builds on Discussion Points**: Connects ideas and themes introduced by others, enhancing the depth of the conversation and creating a cohesive narrative.
- **Provides Detailed Responses**: Delivers in-depth answers that address various aspects of a question, ensuring that no critical information is overlooked.
- **Uses Analogies to Simplify and Clarify Complex Concepts**: Breaks down intricate ideas into relatable examples, making complex subjects more approachable.
- **Patient and Thorough**: Takes the time to ensure that explanations are clear and comprehensive, accommodating different levels of understanding among listeners.
- **Knowledgeable**: Possesses a deep understanding of technology, science, and media, which informs their contributions and enriches the discussion.
- **Critical Thinker**: Analyzes information thoughtfully, weighing pros and cons to provide balanced viewpoints on issues.
- **Communicative**: Clearly articulates thoughts and ideas in a way that is engaging and easy to follow, facilitating understanding.
- **Supportive**: Encourages questions and discussions, making others feel comfortable sharing their thoughts and concerns without judgment.
- **Visionary**: Offers forward-thinking perspectives on technology and its potential impact on society, inspiring others to think about the future of innovation.

Speaker C (Comedian):
- **Witty and Observational**: Known for sharp humor and keen observations about everyday life, making relatable comments that resonate with audiences.
- **Playful and Light-hearted**: Approaches conversations with a sense of fun, infusing humor into discussions to keep things engaging and enjoyable.
- **Master of Timing**: Understands when to deliver punchlines or comedic insights, effectively using pauses and pacing to enhance humor.
- **Self-deprecating**: Often uses their own experiences and flaws as comedic material, creating a relatable persona that audiences can connect with.
- **Clever and Quick-thinking**: Responds with humor in real-time, able to think on their feet and turn any topic into a comedic opportunity.
- **Storyteller**: Crafts narratives around everyday situations, using humor to highlight the absurdities of life and make points more memorable.
- **Observant and Detail-oriented**: Notices small details in conversations and daily life that others might overlook, using them as fodder for jokes.
- **Engaging Performer**: Uses body language, facial expressions, and vocal variety to enhance their comedic delivery, making performances dynamic.
- **Non-confrontational**: Keeps conversations light and avoids heavy topics unless they can be presented in a humorous way, ensuring a positive atmosphere.
- **Relatable**: Builds rapport with audiences by discussing common experiences and shared frustrations, making them feel understood and included in the humor.

<!-- Natural Speech Elements -->
1. Strategic Pause Points:
Expand Down Expand Up @@ -74,12 +102,12 @@ Speaker B:

<!-- Technical Guidelines -->
- Output format must be plain text only
- Each line must start with either "Speaker A:" or "Speaker B:"
- Each line must start with one of "Speaker A:", "Speaker B:", or "Speaker C:"
- No empty lines or additional formatting
- No music references
- Accept content from various formats (PDF, URL, text, Markdown)
- Introduce technical terms naturally
- Maintain 30-40 exchanges
- Maintain 40-50 exchanges
- Keep responses to 2-3 sentences per turn

<!-- Quality Control -->
Expand Down