"""Streamlit app: scrape a YouTube channel's video list with Selenium and
visualize view counts with Plotly.

Flow: ask the user for a channel /videos URL, scroll the page until every
lazily-loaded video is in the DOM, collect title/views/thumbnail/link per
video, dump them to output/people.csv, then render a dataframe and bar chart.
"""

import csv
import os
import re
import time

import pandas as pd
import plotly.express as px
import pprint
import streamlit as st
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Wildcard import kept for the (currently commented-out) comments scraper below.
from youtube_comment_scraper_python import *

st.title('Youtube Channel Analysis')
st.write('Youtube WebScrap')

# ------------------------------- CHANNEL DATA -------------------------------
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

url = st.text_input('Paste the Youtube Channel Link', "")
if not url:
    st.warning('Please input a Link.')
    st.stop()  # halts the Streamlit script until the user provides input
st.success('Thank you for inputting a link.')

# Pull the channel name out of the URL as the first capitalized word, e.g.
# 'YasoobKhalid' from 'https://www.youtube.com/@YasoobKhalid/videos'.
name = re.compile(r"[A-Z]\w+")
inp = name.findall(url)
if not inp:
    # Guard: without this, inp[0] raises IndexError on e.g. all-lowercase URLs.
    st.warning('Could not read a channel name from that link.')
    st.stop()
out = inp[0]
st.write('Getting Data from', out, 'channel')

driver.get(url)

channel_title = driver.find_element(
    By.XPATH, '//yt-formatted-string[contains(@class, "ytd-channel-name")]').text
handle = driver.find_element(
    By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
subscriber_count = driver.find_element(
    By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text

# Scroll to the bottom repeatedly; stop once the document height no longer
# grows, i.e. every lazily-loaded video row has been fetched.
WAIT_IN_SECONDS = 5
last_height = driver.execute_script("return document.documentElement.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
    time.sleep(WAIT_IN_SECONDS)  # give YouTube time to append new videos
    new_height = driver.execute_script("return document.documentElement.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
titles = driver.find_elements(By.ID, "video-title")
links = driver.find_elements(By.ID, "video-title-link")

# zip() truncates to the shortest list, so a missing thumbnail/link simply
# drops the trailing rows rather than raising.
videos = [
    {
        'title': title.text,
        'views': view.text,
        'thumbnail': thumb.get_attribute('src'),
        'link': link.get_attribute('href'),
    }
    for title, view, thumb, link in zip(titles, views, thumbnails, links)
]
driver.quit()  # release the browser process now that scraping is finished
print(videos)

if not videos:
    # Guard: videos[0].keys() below would raise IndexError on an empty scrape.
    st.warning('No videos were found on that page.')
    st.stop()

# Persist the scrape; create the target directory if it does not exist yet.
os.makedirs('output', exist_ok=True)
keys = videos[0].keys()
with open('output/people.csv', 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(videos)

df = pd.read_csv('output/people.csv')
st.dataframe(df)

count = st.slider('Select Lower Video Count', 0, 607, 100)
st.write("You selected", count, 'Videos')

# NOTE(review): 'views' holds the raw on-page text (e.g. '1.2K views'), so the
# y-axis is effectively categorical — parse it to an int for a true ranking.
fig = px.bar(df, x="title", y="views", height=600)
fig.update_traces(textfont_size=12, textangle=0,
                  textposition="outside", cliponaxis=False)
# fig.update_yaxes(tickvals=['10k', '22k', '29k', '56k'])

tab1, tab2 = st.tabs(["Streamlit theme (default)", "Plotly native theme"])
with tab1:
    # Streamlit theme — the default, so theme= could be omitted.
    st.plotly_chart(fig, theme="streamlit", use_container_width=True)
with tab2:
    # Native Plotly theme.
    st.plotly_chart(fig, theme=None, use_container_width=True)

# --------------------------------- COMMENTS ---------------------------------
# url = input('Enter Youtube Video Url- ')
# youtube.open(url)
# youtube.keypress("pagedown")
# data = []
# currentpagesource=youtube.get_page_source()
# lastpagesource=''
# while(True):
#     if(lastpagesource==currentpagesource):
#         break
#     lastpagesource=currentpagesource
#     response=youtube.video_comments()
#     for c in response['body']:
#         data.append(c)
#     youtube.scroll()
#     currentpagesource=youtube.get_page_source()
# df = pd.DataFrame(data)
# df = df.replace('\n',' ', regex=True)
# df = df[['Comment', 'Likes']].drop_duplicates(keep="first")
# # df = df[['Likes']].drop_duplicates(keep="first")
# df.to_csv('output/data.csv',index=False)
# df.head()