-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathyoutube_id_to_timestamps.py
106 lines (93 loc) · 5.84 KB
/
youtube_id_to_timestamps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from youtube_transcript_api import YouTubeTranscriptApi
import json
from datetime import timedelta
import os
import time
import google.generativeai as genai
import tempfile
import json
import logging
# Gemini model name used when the GEMINI_MODEL environment variable is not set.
DEFAULT_GEMINI_MODEL = "gemini-2.0-flash-exp"
class YoutubeIdToTimestamps:
    """Turn a YouTube video id into a short list of chapter timestamps.

    The video transcript is fetched with ``YouTubeTranscriptApi``, uploaded to
    Gemini as a text file, and the model is asked (in three chat turns) to
    condense it into ``<timestamp> - <topic>`` lines.

    Environment variables read by this class:
        GEMINI_API_KEY: API key passed to ``genai.configure``.
        GEMINI_MODEL: model name; falls back to ``DEFAULT_GEMINI_MODEL``.
        YT_TRANSCRIPT_PROXY: optional proxy URL for the transcript request.
    """

    # Transcript languages requested from YouTube, passed to
    # ``YouTubeTranscriptApi.get_transcript`` in this order.
    TRANSCRIPT_LANGUAGES = (
        "en", "es", "pt", "pt-PT", "de", "it", "zh-Hant", "zh-Hans", "ja", "ro", "vi",
    )

    # Sampling/output settings handed to ``genai.GenerativeModel``.
    GENERATION_CONFIG = {
        "temperature": 1,
        "top_p": 0.95,
        "top_k": 40,
        "max_output_tokens": 8192,
        "response_mime_type": "text/plain",
    }

    # First user turn; sent together with the uploaded transcript file.
    INITIAL_PROMPT = "Attached you have the transcript of a YouTube video. It's a JSON file that has the following properties: text - what is said, meaning the actual transcript, start: the second of that fragment in the video.\n\nI want you to make a summary of the video based on that transcript data and also include the timestamps (make sure they are in HH:MM:SS format), meaning between what hour, minute and second in the video a generated topic/idea is mentioned. Make the topics short. Have lots of topics. Have one topic per line and each line starts with the timestamps.\n\nHere is an example of how your response look like. Pay attention to the format. This is the summary for another video:\n\n0:00 - Introduction\n13:03- 12 startups in 12 months\n36:37 - Photo AI\n1:12:53 - How to learn AI\n2:03:24 - Monetize your website\n3:01:34 - Productivity\n3:41:21 - Advice for young people\n\nDon't make it too granular. Extract the main ideas/chapters and present them. Only have a chapter at every few minutes, like in the example. Mention as timestamp the beginning of each chapter. See the provided example from above for a better understanding.\n"

    # Third user turn; asks for the final, coarsest version.
    FINAL_PROMPT = "Make it even shorter. Just merge chapters into bigger categories. Provide the final response. Only few chapters with just the big picture."

    def __init__(self, max_response_length: int):
        """Configure the Gemini client and the optional transcript proxy.

        Args:
            max_response_length: Character budget for the final answer. It is
                quoted to the model in the follow-up prompt and also used to
                hard-truncate the returned text.
        """
        genai.configure(api_key=os.environ.get("GEMINI_API_KEY", ""))

        proxy_url = os.environ.get("YT_TRANSCRIPT_PROXY", "")
        self.proxies = None
        if proxy_url:
            # The same proxy is registered for both schemes, with the URL
            # scheme rewritten to match the key it is stored under.
            self.proxies = {
                "https": proxy_url.replace("http://", "https://"),
                "http": proxy_url.replace("https://", "http://"),
            }
        self.max_response_length = max_response_length

    def _seconds_to_hhmmss(self, seconds) -> str:
        """Format a number of seconds as ``H:MM:SS``.

        The value is rounded to whole seconds first; the text is whatever
        ``str(timedelta)`` produces (e.g. ``783`` -> ``"0:13:03"``).
        """
        return str(timedelta(seconds=round(seconds)))

    def _get_transcript(self, youtube_id: str) -> str:
        """Fetch the transcript of *youtube_id* and return it as a JSON string.

        Returns:
            A JSON array (indented by 4) of objects with the keys ``text`` and
            ``start``, where ``start`` has been converted to ``H:MM:SS``.
        """
        logging.info(f"{youtube_id} - making the request to get the transcript")
        data = YouTubeTranscriptApi.get_transcript(
            youtube_id,
            languages=list(self.TRANSCRIPT_LANGUAGES),
            proxies=self.proxies,
        )
        logging.info(f"{youtube_id} - got the transcript. First 5 objs: {data[:5]}")

        entries = [
            {"text": item["text"], "start": self._seconds_to_hhmmss(item["start"])}
            for item in data
        ]
        return json.dumps(entries, indent=4)

    def _upload_to_gemini(self, file_content: str, mime_type=None):
        """Upload *file_content* to Gemini through a temporary file.

        Args:
            file_content: Text to upload.
            mime_type: MIME type forwarded to ``genai.upload_file``.

        Returns:
            The file object returned by ``genai.upload_file``.
        """
        # delete=False + explicit removal: the file must be fully written and
        # closed before another reader opens it by path. Uploading while the
        # handle is still open can send a truncated/empty file (unflushed
        # buffer) and fails outright on platforms that forbid reopening.
        tmpfile = tempfile.NamedTemporaryFile(
            suffix=".tmp", mode="w", encoding="utf-8", delete=False
        )
        tmp_path = tmpfile.name
        try:
            with tmpfile:
                tmpfile.write(file_content)
            return genai.upload_file(tmp_path, mime_type=mime_type)
        finally:
            os.remove(tmp_path)

    def _wait_for_files_active(self, files):
        """Block until every uploaded file has left the ``PROCESSING`` state.

        Each file is polled every 10 seconds via ``genai.get_file``.

        Raises:
            RuntimeError: If a file ends in any state other than ``ACTIVE``.
        """
        for name in [uploaded.name for uploaded in files]:
            file = genai.get_file(name)
            while file.state.name == "PROCESSING":
                time.sleep(10)
                file = genai.get_file(name)
            if file.state.name != "ACTIVE":
                raise RuntimeError(f"File {file.name} failed to process")

    def get_timestamps(self, youtube_id: str) -> str:
        """Return chapter timestamps for the video *youtube_id*.

        The transcript is uploaded to Gemini and refined over three chat
        turns: an initial summary request, a request to shrink it below
        ``max_response_length`` characters, and a final request to merge
        chapters further.

        Returns:
            The text of the last model response, truncated to
            ``max_response_length`` characters.
        """
        max_length = self.max_response_length
        file_content = self._get_transcript(youtube_id)

        model = genai.GenerativeModel(
            model_name=os.environ.get("GEMINI_MODEL", DEFAULT_GEMINI_MODEL),
            generation_config=dict(self.GENERATION_CONFIG),
        )

        files = [self._upload_to_gemini(file_content, mime_type="text/plain")]
        logging.info(f"{youtube_id} - Will wait for files to be active")
        self._wait_for_files_active(files)
        logging.info(f"{youtube_id} - Files were attached")

        # The chat session records every sent message and every reply in its
        # own history, so nothing is appended to ``history`` by hand here;
        # doing so would feed each model reply back twice on the next turn.
        chat_session = model.start_chat(history=[])

        response = chat_session.send_message([files[0], self.INITIAL_PROMPT])
        logging.info(f"{youtube_id} - First response received - {len(response.text)=}")

        follow_up_message = f"That's good, but it's too granular. The full response must have less than {max_length} characters, including new lines. Extract the main ideas/chapters and present them. Only have a chapter at every few minutes, like in the example. Mention as timestamp the beginning of each chapter. See the provided example from above for a better understanding. Answer only with the timestamps and chapters, nothing else and remember to make the response short enought to not exceed {max_length} characters."
        response = chat_session.send_message(follow_up_message)
        logging.info(f"{youtube_id} - Seconds response received - {len(response.text)=}")

        response = chat_session.send_message(self.FINAL_PROMPT)
        logging.info(f"{youtube_id} - {response.text} - {len(response.text)=}")

        # The model is only asked to respect the limit; enforce it here.
        return response.text[:max_length]