Python: how to get information about a YouTube video from the YouTube video id WITHOUT WEB SCRAPING

I wanted to get information about the YouTube playlists I have created, so I used Google Takeout and got a bunch of .csv files like this:

Playlist Id,Channel Id,Time Created,Time Updated,Title,Description,Visibility
LL--2uyGZ5xJWulPXe2p7YDw,UC--2uyGZ5xJWulPXe2p7YDw,2022-03-20 14:54:43 UTC,2022-04-24 05:42:54 UTC,Liked videos,,Private

Video Id,Time Added
JMFep9n0izQ,2022-04-24 05:42:54 UTC
XjeLE4A5Fu0,2022-04-24 05:27:53 UTC
7LPXA0yM8iE,2022-04-24 04:58:10 UTC
bT9jZfgRVvA,2022-04-24 04:56:04 UTC
nikDvZhCy10,2022-04-24 02:43:53 UTC
SA1ozeqslaM,2022-04-24 02:35:09 UTC
sEsEl56y-M0,2022-04-24 02:13:27 UTC
x8QpeE6ZHLA,2022-04-24 02:12:20 UTC
AcWZhBJd4-4,2022-04-24 01:12:46 UTC
G6w4muQ1aL0,2022-04-24 00:57:15 UTC
sM8bL3i1PpI,2022-04-24 00:47:26 UTC
6OdubOdFS-Y,2022-04-24 00:41:54 UTC
2Yu9dGlWi00,2022-04-24 00:38:53 UTC
99aLrgk-uqs,2022-04-24 00:35:43 UTC
Assa__Snvmw,2022-04-24 00:33:32 UTC
79a8AINRDjI,2022-04-23 18:08:21 UTC
z7oD9j814nQ,2022-04-23 18:00:25 UTC
tDxKhiJfgYk,2022-04-23 17:57:35 UTC
TduPxMSX62U,2022-04-23 17:46:39 UTC
p1qJ-ksU6aw,2022-04-23 17:03:03 UTC
E8QEen6GjDA,2022-04-23 14:09:38 UTC
BFld4EBO2RE,2022-04-23 01:19:39 UTC
-_1Kqds7NAI,2022-04-22 16:28:50 UTC
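
Each file therefore boils down to (video id, time added) pairs once the metadata block at the top is skipped. A minimal sketch of pulling them out, assuming the layout above (two playlist-metadata lines, a blank line, the "Video Id,Time Added" header, then one row per video; the path is just a placeholder):

import csv
from pathlib import Path

def read_playlist_export(path):
    # Skip the playlist-metadata block and the "Video Id,Time Added" header
    lines = Path(path).read_text(encoding='utf8').splitlines()[4:]
    return [(row[0].strip(), row[1]) for row in csv.reader(lines) if len(row) >= 2]

video_rows = read_playlist_export('Takeout/YouTube and YouTube Music/playlists/Liked videos.csv')
print(video_rows[:3])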

That information alone is not what I want, so I have to resolve the video identifiers myself. I chose web scraping because that is the only method I know of that still works as of 2022…

I have already written a fully working script that does exactly this job. I got the XPaths myself using F12 and Ctrl+Shift+C, but the script never manages to run to completion, the process is time-consuming, and the fault is not mine at all.

In short, I am physically in China, separated from the free world by the notorious GFW. I use ExpressVPN; it doesn't slow down my download or upload speeds, but it increases latency terribly (with it the average ping is 256 ms+), and the VPN connection occasionally drops. That's nothing I can't fix, but it does stop the execution of the code.

The code can't run for 15 minutes without throwing a TimeoutException and/or NoSuchElementException, and there is nothing I can do about it.

Code:

import json
import os
import re
import time
from pathlib import Path
from reprint import output
from selenium import webdriver
from selenium.webdriver import FirefoxProfile
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument("--log-level=3")
options.add_argument("--mute-audio")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
capabilities = DesiredCapabilities().FIREFOX
capabilities['pageLoadStrategy'] = 'eager'
profile = FirefoxProfile()
profile.set_preference("http.response.timeout", 1)
profile.set_preference("dom.max_script_run_time", 0)
profile.set_preference('permissions.default.stylesheet', 2)
profile.set_preference('permissions.default.image', 2)
profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false')
profile.set_preference("permissions.default.script", 2)
profile.set_preference("javascript.enabled", False)


Firefox = webdriver.Firefox(capabilities=capabilities, options=options)

files = list(Path('C:/Users/Estranger/Downloads/Takeout/YouTube and YouTube Music/playlists').glob('*.csv'))
wait = WebDriverWait(Firefox, 10)
playlists = dict()

MONTHS = [
    'Jan', 'Feb', 'Mar',
    'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep',
    'Oct', 'Nov', 'Dec'
]
UNITS = {'K': 1e3, 'M': 1e6, 'B': 1e9}
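# Open the YouTube home page once before visiting individual video pages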
Firefox.get('https://www.youtube.com')
time.sleep(3)
for file in files:
    name = file.name[:-4]
    playlists[name] = []
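    # Lines [4:-1] are the video rows; the first four lines are playlist metadata and the column header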
    lines = file.read_text(encoding='utf8').splitlines()[4:-1]
    for line in lines:
        video_id, timestamp = line.split(',')
        video_link = 'https://youtu.be/' + video_id
        timestamp = timestamp.replace(' ','T',1)[:-4]+'Z'
        Firefox.get(video_link)
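        # If the page shows the "unavailable video" promo renderer, skip this video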
        unavailable = False
        try:
            time.sleep(5)
            Firefox.find_element('xpath', '//ytd-background-promo-renderer/div[1]/div')
            unavailable = True
        except NoSuchElementException:
            pass
        
        if unavailable:
            continue
        
        wait.until(EC.visibility_of_element_located((By.XPATH, '//h1[contains(@class, "title")]/yt-formatted-string')))
        video_title = Firefox.find_element('xpath', '//h1[contains(@class, "title")]/yt-formatted-string').text
        channel = Firefox.find_element('xpath', '//div[contains(@class,"ytd-channel-name")]//a')
        channel_name = channel.text
        channel_link = channel.get_attribute('href')
        view_count = Firefox.find_element('xpath', '//span[contains(@class, "view-count")]')
        view_count = int(view_count.text[:-6].replace(',', ''))
        upload_date = Firefox.find_element('xpath', '//div[@id="info-strings"]/yt-formatted-string').text
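        # The displayed upload date looks like "Apr 23, 2022"; convert it to YYYY-MM-DD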
        month, day, year = re.search(r'(\w{3}) (\d{1,2}), (\d{4})', upload_date).groups()
        day = int(day)
        month = MONTHS.index(month)+1
        upload_date = f'{year}-{month:02d}-{day:02d}'
        like_count = Firefox.find_element('xpath', '//ytd-toggle-button-renderer[1]/a/*[@id="text"]').get_attribute('aria-label')
        like_count = int(like_count[:-6].replace(',', ''))
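        # Subscriber counts are abbreviated (e.g. "1.2M subscribers"); expand K/M/B suffixes to integers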
        channel_subscribers = Firefox.find_element('xpath', '//*[@id="owner-sub-count"]').text
        if channel_subscribers:
            channel_subscribers = channel_subscribers.split(' ')[0]
            if channel_subscribers[-1] in UNITS:
                unit = UNITS[channel_subscribers[-1]]
                number = float(channel_subscribers[:-1])
                channel_subscribers = int(number * unit)
            else:
                channel_subscribers = int(channel_subscribers)
        else:
            channel_subscribers = 0
        entry = {
            'video_title': video_title,
            'channel_name': channel_name,
            'upload_date': upload_date,
            'video_link': video_link,
            'view_count': view_count,
            'like_count': like_count,
            'channel_link': channel_link,
            'channel_subscribers': channel_subscribers
        }
        playlists[name].append(entry)

Path('D:/youtube_playlists.json').write_text(json.dumps(playlists, indent=4, ensure_ascii=False), encoding='utf8')

Can anyone suggest an alternative way to get the information I want? I need to resolve thousands of YouTube video identifiers.
