diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 90fc9e9..eb68a4e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,4 +15,4 @@ jobs: key: ${{ secrets.KEY }} port: ${{ secrets.PORT }} script: | - ./scripts/deploy-youtb.sh + ./scripts/deploy-odevtube.sh diff --git a/README.md b/README.md index ce99bd3..640b132 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ - YouTube Data API v3를 활성화 - 프로젝트에서 신규 API 키를 생성 - 환경변수 YOUTUBE_API_KEY 설정 -- [package.json](https://github.com/kenu/youtb/blob/main/package.json)에 youtube-api를 추가 `"googleapis": "^134.0.0"` +- [package.json](https://github.com/kenu/odevtube/blob/main/package.json)에 youtube-api를 추가 `"googleapis": "^134.0.0"` ``` { diff --git a/package.json b/package.json index baf3bbd..10a21c0 100644 --- a/package.json +++ b/package.json @@ -12,7 +12,8 @@ "node-html-parser": "^6.1.13", "nodemailer": "^6.9.13", "openai": "^4.38.2", - "sequelize": "^6.37.3" + "sequelize": "^6.37.3", + "youtube-transcript": "^1.2.1" }, "devDependencies": { "@babel/core": "^7.24.4", diff --git a/scripts/deploy-scripts.sh b/scripts/deploy-scripts.sh index 930de11..84f13ba 100644 --- a/scripts/deploy-scripts.sh +++ b/scripts/deploy-scripts.sh @@ -1,12 +1,12 @@ #!/bin/zsh . ~/.zshrc -cd ~/git/youtb +cd ~/git/odevtube git pull origin main pnpm i -cd ~/git/youtb/web +cd ~/git/odevtube/web pnpm i -pm2 restart youtb --update-env +pm2 restart odevtube --update-env sleep 2 pm2 list diff --git a/tests/transcript.test.js b/tests/transcript.test.js index cb7778c..196e1de 100644 --- a/tests/transcript.test.js +++ b/tests/transcript.test.js @@ -1,7 +1,21 @@ -import fetchTranscript from '../web/utils/transcript.js' +import { YoutubeTranscript } from "youtube-transcript"; -test('', async() => { - const videoId = 'ulqHnefBFMM' - const transcript = await fetchTranscript(videoId) - expect(transcript).not.toBeNull() -}) +const videoId = "vSIb2sPdu9U"; +test("getTextOnly", async () => { + const transcript = await YoutubeTranscript.fetchTranscript(videoId); + expect(transcript).not.toBeNull(); + const fullText = getTextOnly(transcript); + expect(fullText).toContain("인데"); +}); + +function getTextOnly(transcript) { + return transcript.map((item) => item.text).join(" "); +} + +test('remove [Object, Object]', async () => { + const pattern = /(니다|하죠|하겠죠|고요|까요|네요|데요|세요|아요|어요)\s/g + const transcript = await YoutubeTranscript.fetchTranscript(videoId) + let fullText = transcript.map((item) => item.text).join(' ') + fullText = fullText.replaceAll(pattern, '$1. ') + expect(fullText).not.toContain('Object') +}); diff --git a/web/bin/youtb.js b/web/bin/odevtube.js similarity index 100% rename from web/bin/youtb.js rename to web/bin/odevtube.js diff --git a/web/package.json b/web/package.json index 641d070..97b1885 100644 --- a/web/package.json +++ b/web/package.json @@ -4,7 +4,7 @@ "private": true, "type": "module", "scripts": { - "dev": "nodemon ./bin/youtb.js" + "dev": "nodemon ./bin/odevtube.js" }, "dependencies": { "body-parser": "^1.20.2", diff --git a/web/routes/index.js b/web/routes/index.js index 1137dab..05cbe50 100644 --- a/web/routes/index.js +++ b/web/routes/index.js @@ -1,6 +1,7 @@ import express from 'express' import dayjs from 'dayjs' import passport from 'passport' +import { YoutubeTranscript } from 'youtube-transcript' import dao from '../../youtubeDao.js' const router = express.Router() @@ -119,7 +120,6 @@ function building(list) { }) } -import fetchTranscript from '../utils/transcript.js' import summarize from '../utils/summary.js' router.get('/transcript/:videoId', async function (req, res, next) { const videoId = req.params.videoId @@ -129,33 +129,35 @@ router.get('/transcript/:videoId', async function (req, res, next) { res.json({ videoId, summary: item.summary, text: item.content }) return } - // if empty get from youtube web - // save with videoId + await upsertTranscript(res, videoId) + +}) + +async function upsertTranscript(res, videoId) { try { - const pattern = /(니다|하죠|하겠죠|고요|까요|네요|데요|세요|아요|어요)\s/g - let transcript = await fetchTranscript(videoId) - transcript = transcript.replaceAll(pattern, '$1. ') - const cmd = - "3줄 단문에, 명사형 어미로 요약(예)'있습니다.' 대신 '있음', '설명드립니다' 대신 '설명함' :\n" + const pattern = /(니다|이죠|하죠|하겠죠|고요|까요|네요|데요|세요|에요|아요|어요)\s/g + const transcript = await YoutubeTranscript.fetchTranscript(videoId) + let fullText = transcript.map((item) => item.text).join(' ') + fullText = fullText.replaceAll(pattern, '$1. ') + const cmd = "3줄 단문에, 명사형 어미로 요약(예)'있습니다.' 대신 '있음', '설명드립니다' 대신 '설명함' :\n" const messages = [ { role: 'system', - content: cmd + transcript, + content: cmd + fullText, }, ] - const result = await summarize(messages) - let summary = result[0].message.content + const summary = await summarize(messages) await dao.createTranscript({ videoId, - content: transcript, + content: fullText, summary: summary, }) - res.json({ videoId, summary, text: transcript }) + res.json({ videoId, summary, text: fullText }) } catch (error) { console.log(error) res.json({ videoId, summary: '', text: 'Not Available' }) } -}) +} router.get('/login', function (req, res) { res.render('login') diff --git a/web/utils/summary.js b/web/utils/summary.js index e5d0622..74d554d 100644 --- a/web/utils/summary.js +++ b/web/utils/summary.js @@ -4,10 +4,10 @@ async function summarize(messages) { const openai = new OpenAI() const completion = await openai.chat.completions.create({ messages: messages, - model: 'gpt-3.5-turbo', + model: 'gpt-4o-mini', }) - return completion.choices + return completion.choices[0].message.content } export default summarize diff --git a/web/utils/transcript.js b/web/utils/transcript.js deleted file mode 100644 index 7292519..0000000 --- a/web/utils/transcript.js +++ /dev/null @@ -1,51 +0,0 @@ -import { parse } from 'node-html-parser' - -async function fetchTranscript(videoId) { - const videoPageHtml = await fetchHtml( - `https://www.youtube.com/watch?v=${videoId}` - ) - const playerScript = findPlayerScript(videoPageHtml) - const captionsUrl = extractCaptionsUrl(playerScript) - const captionsXml = await fetchXml( - captionsUrl.replace('lang=de-DE', 'lang=ko-KR') - ) - const transcript = extractTranscriptFromXml(captionsXml) - return transcript -} - -async function fetchHtml(url) { - const response = await fetch(url) - const html = await response.text() - return parse(html) -} - -function findPlayerScript(html) { - const scripts = html.getElementsByTagName('script') - return scripts.find((script) => - script.textContent.includes('var ytInitialPlayerResponse = {') - ) -} - -function extractCaptionsUrl(playerScript) { - const dataString = playerScript.textContent - ?.split('var ytInitialPlayerResponse = ')?.[1] - ?.slice(0, -1) - const data = JSON.parse(dataString.trim()) - return data.captions.playerCaptionsTracklistRenderer.captionTracks[0].baseUrl -} - -async function fetchXml(url) { - const response = await fetch(url) - const xml = await response.text() - return parse(xml) -} - -function extractTranscriptFromXml(xml) { - const chunks = xml.getElementsByTagName('text') - return Array.from(chunks).reduce( - (transcript, chunk) => transcript + chunk.textContent + ' ', - '' - ) -} - -export default fetchTranscript diff --git a/web/views/admin/channel.ejs b/web/views/admin/channel.ejs index 8e6f3ea..9c0ac67 100644 --- a/web/views/admin/channel.ejs +++ b/web/views/admin/channel.ejs @@ -23,7 +23,7 @@ | food | dev ko | dev en - + diff --git a/web/views/admin/video.ejs b/web/views/admin/video.ejs index 7a23815..cc505be 100644 --- a/web/views/admin/video.ejs +++ b/web/views/admin/video.ejs @@ -23,7 +23,7 @@ | food | dev ko | dev en - + diff --git a/web/views/index.ejs b/web/views/index.ejs index 958da10..d1dcdf9 100644 --- a/web/views/index.ejs +++ b/web/views/index.ejs @@ -12,7 +12,7 @@
- +