Spaces:

ashok2216
/

youtube-data_scraper

Build error

File size: 4,508 Bytes

4292ffa

import time
import pprint
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
import csv
from youtube_comment_scraper_python import *
import pandas as pd
import plotly.express as px
import re
import streamlit as st

st.title('Youtube Channel Analysis')
st.write('Youtube WebScrap')


# # ------------------------------------------------------------------------------CHANNEL DATA------------------------------------------------------------------------

# Seconds to pause after each scroll so newly requested videos can render
# before the page height is measured again.
WAIT_IN_SECONDS = 5


def _channel_label(channel_url):
    """Return a short display name for the channel behind *channel_url*.

    The ``@handle`` path segment is preferred. If the URL has none, the first
    capitalised word is used (the heuristic this script relied on before),
    and if that is missing too the URL itself is returned, so a URL without
    any capital letter no longer raises ``IndexError``.
    """
    handle_match = re.search(r"@([\w.-]+)", channel_url)
    if handle_match:
        return handle_match.group(1)
    capitalised_words = re.findall(r"[A-Z]\w+", channel_url)
    return capitalised_words[0] if capitalised_words else channel_url


def _first_text(browser, xpath):
    """Return the text of the first element matching *xpath*, or ``''``.

    ``find_elements`` returns an empty list when nothing matches, so a page
    that lacks the element does not abort the whole scrape.
    """
    matches = browser.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else ''


def _scroll_until_stable(browser, pause_seconds):
    """Scroll to the bottom of the page until its height stops changing."""
    last_height = browser.execute_script("return document.documentElement.scrollHeight")
    while True:
        # Scroll to the bottom of the page as it is currently known.
        browser.execute_script("window.scrollTo(0, arguments[0]);", last_height)
        # Wait for new videos to show up.
        time.sleep(pause_seconds)

        # An unchanged height means no further content was appended.
        new_height = browser.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            return
        last_height = new_height


# Example input: https://www.youtube.com/@YasoobKhalid/videos
url = st.text_input('Paste the Youtube Channel Link', "").strip()
if not url:
    st.warning('Please input a Link.')
    st.stop()
st.success('Thank you for inputting a link.')

out = _channel_label(url)
st.write('Getting Data from', out, 'channel')

# The browser is started only after a link has been supplied, and is always
# shut down in the ``finally`` below; otherwise every rerun of the script
# would leave another Chrome process behind.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver.get(url)

    handle = _first_text(driver, '//yt-formatted-string[@id="channel-handle"]')
    subscriber_count = _first_text(driver, '//yt-formatted-string[@id="subscriber-count"]')

    _scroll_until_stable(driver, WAIT_IN_SECONDS)

    thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
    views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
    titles = driver.find_elements(By.ID, "video-title")
    links = driver.find_elements(By.ID, "video-title-link")

    # Element text/attributes must be read while the session is still open.
    # zip() stops at the shortest list, so rows are only built where all four
    # pieces were found.
    videos = [
        {
            'title': title.text,
            'views': view.text,
            'thumbnail': thumb.get_attribute('src'),
            'link': link.get_attribute('href'),
        }
        for title, view, thumb, link in zip(titles, views, thumbnails, links)
    ]
finally:
    driver.quit()

print(videos)

# Write the scraped rows to output/people.csv, then load the file back into
# the DataFrame ``df`` that is displayed here and plotted further down.
from pathlib import Path

if not videos:
    # With no rows there is no first record to take the CSV header from;
    # indexing videos[0] would raise IndexError.
    st.warning('No videos were found for this channel.')
    st.stop()

csv_path = Path('output/people.csv')
# Opening a file for writing does not create missing parent directories.
csv_path.parent.mkdir(parents=True, exist_ok=True)

to_csv = videos
keys = list(to_csv[0])  # column order follows the first record's keys

with open(csv_path, 'w', newline='', encoding='utf-8') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(to_csv)

df = pd.read_csv(csv_path)
st.dataframe(df)

# Let the user pick how many of the scraped videos to plot. The upper bound
# follows the number of rows in ``df`` instead of a fixed 607, and the
# default of 100 is capped so it never exceeds that bound.
# NOTE(review): the selected count now limits the chart to the first
# ``count`` rows; previously the slider value was displayed but never used.
video_total = len(df)
count = st.slider(
    'Select Lower Video Count',
    0,
    max(video_total, 1),  # keep max above min even for an empty frame
    min(100, video_total),
)
st.write("You selected", count, 'Videos')

# One bar per video: title on the x axis, the scraped "views" text on y.
fig = px.bar(
    df.head(count),
    x="title",
    y="views",
    height=600,
)
fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)

# Show the same figure twice so the two themes can be compared side by side.
tab1, tab2 = st.tabs(["Streamlit theme (default)", "Plotly native theme"])
with tab1:
    # Streamlit theme; this is the default, so the argument could be omitted.
    st.plotly_chart(fig, theme="streamlit", use_container_width=True)
with tab2:
    # theme=None keeps Plotly's own native styling.
    st.plotly_chart(fig, theme=None, use_container_width=True)

# ----------------------------------------------------------------------------COMMENTS------------------------------------------------------------------------------


# url = input('Enter Youtube Video Url- ')
# youtube.open(url)
# youtube.keypress("pagedown")

# data = []
# currentpagesource=youtube.get_page_source()
# lastpagesource=''

# while(True):
#     if(lastpagesource==currentpagesource):
#         break
        
#     lastpagesource=currentpagesource
#     response=youtube.video_comments()

#     for c in response['body']:
#         data.append(c)
        
#     youtube.scroll()
#     currentpagesource=youtube.get_page_source()


# df = pd.DataFrame(data)

# df = df.replace('\n',' ', regex=True)

# df = df[['Comment', 'Likes']].drop_duplicates(keep="first") 
# # df = df[['Likes']].drop_duplicates(keep="first") 

# df.to_csv('output/data.csv',index=False) 

# df.head()