diff --git a/TikTok/TikTok_Get_liked_videos_transcripts.ipynb b/TikTok/TikTok_Get_liked_videos_transcripts.ipynb new file mode 100644 index 0000000000..b69deaa9f5 --- /dev/null +++ b/TikTok/TikTok_Get_liked_videos_transcripts.ipynb @@ -0,0 +1,578 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e04c2a66-0232-4311-8445-d69e43f287e7", + "metadata": { + "execution": { + "iopub.execute_input": "2022-01-07T21:15:57.009861Z", + "iopub.status.busy": "2022-01-07T21:15:57.009532Z", + "iopub.status.idle": "2022-01-07T21:15:57.013136Z", + "shell.execute_reply": "2022-01-07T21:15:57.012559Z", + "shell.execute_reply.started": "2022-01-07T21:15:57.009826Z" + }, + "papermill": {}, + "tags": [] + }, + "source": [ + "\"TikTok.png\"" + ] + }, + { + "cell_type": "markdown", + "id": "47c20033-2e43-4c30-9d89-6fd47bfa98e9", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "# TikTok - Get liked videos by profile\n", + "Give Feedback | Bug report" + ] + }, + { + "cell_type": "markdown", + "id": "6ec59f5e-ac9e-4ab0-a0e9-51b01b38dcaa", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Tags:** #tiktok #videos #snippet #content" + ] + }, + { + "cell_type": "markdown", + "id": "c40e4f30-0b07-4a3b-b39b-7cb460053f74", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Author:** [Alex Nodeland](https://www.linkedin.com/in/alexnodeland/)" + ] + }, + { + "cell_type": "markdown", + "id": "6861607a-c840-4f0e-b03c-346e3351972a", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Last update:** 2023-06-21 (Created: 2022-06-21)" + ] + }, + { + "cell_type": "markdown", + "id": "naas-description", + "metadata": { + "papermill": {}, + "tags": [ + "description" + ] + }, + "source": [ + "**Description:** This notebook provides a script to retrieve a list of the liked videos of a given user on the popular social media platform, TikTok." + ] + }, + { + "cell_type": "markdown", + "id": "618b546f-943a-498c-9eb6-637a1e992f21", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Input" + ] + }, + { + "cell_type": "markdown", + "id": "91fc2ccf-9a27-40f4-bb62-df542a468bc6", + "metadata": { + "execution": { + "iopub.execute_input": "2022-01-08T05:48:50.458541Z", + "iopub.status.busy": "2022-01-08T05:48:50.458187Z", + "iopub.status.idle": "2022-01-08T05:48:50.465249Z", + "shell.execute_reply": "2022-01-08T05:48:50.464699Z", + "shell.execute_reply.started": "2022-01-08T05:48:50.458451Z" + }, + "papermill": {}, + "tags": [] + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4f6774a6-ff96-4c39-87a7-69a1fd3777cc", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "import time\n", + "import os\n", + "import asyncio\n", + "import pandas as pd\n", + "import nest_asyncio\n", + "from openai import OpenAI\n", + "try:\n", + " from selenium import webdriver\n", + " from selenium.webdriver.common.by import By\n", + " from selenium.webdriver.support.ui import WebDriverWait\n", + " from selenium.webdriver.support import expected_conditions as EC\n", + "except:\n", + " !pip install --user PyTikTokAPI\n", + " from selenium import webdriver\n", + " from selenium.webdriver.common.by import By\n", + " from selenium.webdriver.support.ui import WebDriverWait\n", + " from selenium.webdriver.support import expected_conditions as EC\n", + "try:\n", + " from bs4 import BeautifulSoup\n", + "except:\n", + " !pip install --user beautifulsoup4\n", + " from bs4 import BeautifulSoup\n", + "try:\n", + " from tiktokdl.download_post import get_post\n", + "except:\n", + " !pip install --user tiktok-dlpy\n", + " !python -m playwright install --with-deps\n", + " from tiktokdl.download_post import get_post" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5ed8dc04", + "metadata": {}, + "outputs": [], + "source": [ + "nest_asyncio.apply()" + ] + }, + { + "cell_type": "markdown", + "id": "958d3406-bf4c-48b9-b56b-ad4dad7caaae", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Setup variables\n", + "\n", + "**Mandatory**\n", + "- `USERNAME`: The username of the TikTok user whose liked videos you want to retrieve.\n", + "- `OPENAI_API_KEY`: Your OpenAI API key.\n", + "\n", + "**Optional**\n", + "- `LIMIT`: The maximum number of liked videos to retrieve. Default is None (no limit)." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1a91eab8", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "#Mandatory\n", + "USERNAME = ''\n", + "OPENAI_API_KEY = ''\n", + "\n", + "#Optional\n", + "LIMIT = 3" + ] + }, + { + "cell_type": "markdown", + "id": "43fff853-fdf1-4949-a781-43b31e65cce9", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Model" + ] + }, + { + "cell_type": "markdown", + "id": "da6d5ea2", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Web automations" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f40d0a0b", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "def click_liked_tab(driver):\n", + " try:\n", + " liked_tab = WebDriverWait(driver, 10).until(\n", + " EC.element_to_be_clickable((By.CSS_SELECTOR, \"p[data-e2e='liked-tab']\"))\n", + " )\n", + " liked_tab.click()\n", + " print(\"Clicked on Liked tab successfully\")\n", + " except Exception as e:\n", + " print(f\"Failed to click on Liked tab: {e}\")\n", + "\n", + "def scroll_to_load_all_videos(driver, max_scrolls=10):\n", + " last_height = driver.execute_script(\"return document.body.scrollHeight\")\n", + " scrolls = 0\n", + " \n", + " while scrolls < max_scrolls:\n", + " driver.execute_script(\"window.scrollTo(0, document.body.scrollHeight);\")\n", + " time.sleep(2) # Wait for page to load\n", + " \n", + " new_height = driver.execute_script(\"return document.body.scrollHeight\")\n", + " if new_height == last_height:\n", + " break\n", + " last_height = new_height\n", + " scrolls += 1\n", + " print(f\"Scroll {scrolls}/{max_scrolls} completed\")" + ] + }, + { + "cell_type": "markdown", + "id": "43d26535", + "metadata": {}, + "source": [ + "### Scraping" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "60ff8ee3", + "metadata": {}, + "outputs": [], + "source": [ + "def find_video_elements(soup):\n", + " # Strategy 1: Look for the main container with data-e2e attribute\n", + " main_container = soup.find('div', attrs={'data-e2e': 'user-liked-item-list'})\n", + " if main_container:\n", + " # Find all direct child divs that contain video items\n", + " videos = main_container.find_all('div', attrs={'data-e2e': 'user-liked-item'}, recursive=False)\n", + " if videos:\n", + " return videos\n", + "\n", + " # Strategy 2: If the main container isn't found, search for video items directly\n", + " videos = soup.find_all('div', attrs={'data-e2e': 'user-liked-item'})\n", + " if videos:\n", + " return videos\n", + "\n", + " # Strategy 3: Look for divs with specific class and role\n", + " videos = soup.find_all('div', class_=lambda x: x and 'DivContainer' in x, attrs={'role': 'button', 'aria-label': 'Watch in full screen'})\n", + " if videos:\n", + " return videos\n", + "\n", + " # Strategy 4: Fallback to a more general approach\n", + " videos = soup.find_all('div', class_=lambda x: x and ('ItemContainer' in x or 'VideoFeed' in x))\n", + " return videos\n", + "\n", + "def extract_video_info(video_element):\n", + " try:\n", + " video_link = video_element.find('a')['href']\n", + " return {'url': video_link}\n", + " except Exception as e:\n", + " print(f\"Error extracting video info: {e}\")\n", + " return None\n", + " \n", + "async def download_tiktok_video(url, output_filename):\n", + " try:\n", + " video = await get_post(url)\n", + " print(f\"Downloading {url} to {output_filename}\")\n", + " if video.file_path:\n", + " if os.path.abspath(video.file_path) != os.path.abspath(output_filename):\n", + " os.rename(video.file_path, output_filename)\n", + " print(f\"Downloaded {url} to {output_filename}\")\n", + " return True\n", + " else:\n", + " print(f\"Error: No file path found for {url}\")\n", + " return False\n", + " except Exception as e:\n", + " print(f\"Error downloading {url}: {e}\")\n", + " return False" + ] + }, + { + "cell_type": "markdown", + "id": "b43aad2e", + "metadata": {}, + "source": [ + "### Transcription" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "12b372c8", + "metadata": {}, + "outputs": [], + "source": [ + "def transcribe_audio(file_path, client):\n", + " try:\n", + " with open(file_path, \"rb\") as audio_file:\n", + " response = client.audio.transcriptions.create(\n", + " model=\"whisper-1\",\n", + " file=audio_file\n", + " )\n", + " return response.text\n", + " except Exception as e:\n", + " print(f\"Error transcribing {file_path}: {e}\")\n", + " return \"Error\"" + ] + }, + { + "cell_type": "markdown", + "id": "be6a2481", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Script" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e9a0144c", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "async def main(username, limit=None):\n", + " driver = webdriver.Chrome()\n", + " \n", + " # Navigate directly to the user's profile\n", + " driver.get(f\"https://www.tiktok.com/@{username}\")\n", + " print(f\"Navigated to https://www.tiktok.com/@{username}\")\n", + "\n", + " # Click on the Liked tab\n", + " click_liked_tab(driver)\n", + "\n", + " # Scroll to load more videos\n", + " scroll_to_load_all_videos(driver)\n", + "\n", + " page_source = driver.page_source\n", + " soup = BeautifulSoup(page_source, 'html.parser')\n", + "\n", + " videos = find_video_elements(soup)\n", + " print(f\"Found {len(videos)} videos\")\n", + "\n", + " # Create a DataFrame to store video information\n", + " video_data = []\n", + "\n", + " # Set up OpenAI client\n", + " client = OpenAI(api_key=OPENAI_API_KEY)\n", + "\n", + " for video in videos:\n", + " if limit and len(video_data) >= limit:\n", + " break\n", + " video_info = extract_video_info(video)\n", + " if video_info:\n", + " url = video_info['url']\n", + " video_id = url.split('/')[-1]\n", + " video_filename = f'{video_id}.mp4'\n", + " \n", + " try:\n", + " # Download the TikTok video\n", + " if await download_tiktok_video(url, video_filename):\n", + " # Transcribe the video\n", + " transcription = transcribe_audio(video_filename, client)\n", + " video_info['transcript'] = transcription\n", + " else:\n", + " video_info['transcript'] = 'Error'\n", + " finally:\n", + " # Always try to remove the video file, even if an error occurred\n", + " if os.path.exists(video_filename):\n", + " os.remove(video_filename)\n", + " \n", + " video_data.append(video_info)\n", + "\n", + " driver.quit()\n", + "\n", + " df = pd.DataFrame(video_data)\n", + " df.to_csv(f'{username}_liked_videos_transcripts.csv', index=False, encoding='utf-8')\n", + " print(f\"Data saved to {username}_liked_videos_transcripts.csv\")\n", + "\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "id": "5f6714c5", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Output" + ] + }, + { + "cell_type": "markdown", + "id": "4295fc59", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "### Get liked videos" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "08eaaac0", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Navigated to https://www.tiktok.com/@jeremyravenel\n", + "Clicked on Liked tab successfully\n", + "Scroll 1/10 completed\n", + "Scroll 2/10 completed\n", + "Scroll 3/10 completed\n", + "Scroll 4/10 completed\n", + "Scroll 5/10 completed\n", + "Scroll 6/10 completed\n", + "Scroll 7/10 completed\n", + "Scroll 8/10 completed\n", + "Scroll 9/10 completed\n", + "Scroll 10/10 completed\n", + "Found 267 videos\n", + "Downloading https://www.tiktok.com/@danzabs/video/7383037117007465774 to 7383037117007465774.mp4\n", + "Downloaded https://www.tiktok.com/@danzabs/video/7383037117007465774 to 7383037117007465774.mp4\n", + "Downloading https://www.tiktok.com/@20vc_tok/video/7357026640314043654 to 7357026640314043654.mp4\n", + "Downloaded https://www.tiktok.com/@20vc_tok/video/7357026640314043654 to 7357026640314043654.mp4\n", + "Downloading https://www.tiktok.com/@haileyknox/video/7382280210785357087 to 7382280210785357087.mp4\n", + "Downloaded https://www.tiktok.com/@haileyknox/video/7382280210785357087 to 7382280210785357087.mp4\n", + "Data saved to jeremyravenel_liked_videos_transcripts.csv\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
urltranscript
0https://www.tiktok.com/@danzabs/video/73830371...When I wrote Because I Got High, I was ready t...
1https://www.tiktok.com/@20vc_tok/video/7357026...But Rippling has done, they have a campaign ri...
2https://www.tiktok.com/@haileyknox/video/73822...One, two, one, two. One, two, one, two. One, t...
\n", + "
" + ], + "text/plain": [ + " url \\\n", + "0 https://www.tiktok.com/@danzabs/video/73830371... \n", + "1 https://www.tiktok.com/@20vc_tok/video/7357026... \n", + "2 https://www.tiktok.com/@haileyknox/video/73822... \n", + "\n", + " transcript \n", + "0 When I wrote Because I Got High, I was ready t... \n", + "1 But Rippling has done, they have a campaign ri... \n", + "2 One, two, one, two. One, two, one, two. One, t... " + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "liked_videos_transcripts = asyncio.run(main(USERNAME, LIMIT))\n", + "liked_videos_transcripts" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + }, + "naas": { + "notebook_id": "35f614b16e2f264d1d38670c7ca5c0fe010e4c1cd828f2812e8a58faaeaee557", + "notebook_path": "TikTok/TikTok_Get_videos_stats.ipynb" + }, + "papermill": { + "default_parameters": {}, + "environment_variables": {}, + "parameters": {}, + "version": "2.3.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}