ashok2216 committed on
Commit 4292ffa · verified · 1 Parent(s): 55c1506

Upload 13 files
.github/workflows/deploy.yml ADDED
@@ -0,0 +1,33 @@
+ name: Deploy
+ on:
+   push:
+     branches: main
+   pull_request:
+     branches: main
+
+ jobs:
+   deploy:
+     name: Deploy
+     runs-on: ubuntu-latest
+     permissions:
+       id-token: write # Needed for auth with Deno Deploy
+       contents: read  # Needed to clone the repository
+
+     steps:
+       - name: Clone repository
+         uses: actions/checkout@v3
+
+       - name: Install Node.js
+         uses: actions/setup-node@v3
+         with:
+           node-version: lts/*
+
+       - name: Build step
+         run: npm install && npm run build # 📝 Update the build command(s)
+
+       - name: Upload to Deno Deploy
+         uses: denoland/deployctl@v1
+         with:
+           project: "expensive-dolphin-10"
+           entrypoint: "index.js" # 📝 Update the entrypoint
+           root: "." # 📝 Update the root
Dockerfile.sql ADDED
@@ -0,0 +1,38 @@
+ FROM python:3.9-slim
+
+ ENV PYTHONUNBUFFERED=1 \
+     PYTHONDONTWRITEBYTECODE=1 \
+     PIP_NO_CACHE_DIR=1 \
+     PIP_DISABLE_PIP_VERSION_CHECK=1 \
+     PIP_DEFAULT_TIMEOUT=120 \
+     LC_ALL=C.UTF-8 \
+     LANG=C.UTF-8
+
+ # Build tools are needed to compile Python packages with native extensions
+ RUN apt-get update \
+     && apt-get install --yes --no-install-recommends \
+         gcc \
+         g++ \
+         build-essential \
+         python3-dev
+
+ WORKDIR /app
+
+ # Install the system packages listed in packages.txt
+ COPY packages.txt packages.txt
+ RUN xargs -a packages.txt apt-get install --yes
+
+ COPY requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+ EXPOSE 8501
+
+ COPY . .
+
+ CMD ["streamlit", "run", "streamlit_app.py"]
+
+ # docker build --progress=plain --tag selenium:latest .
+ # docker run -ti -p 8501:8501 --rm selenium:latest /bin/bash
+ # docker run -ti -p 8501:8501 --rm selenium:latest
+ # docker run -ti -p 8501:8501 -v ${pwd}:/app --rm selenium:latest
+ # docker run -ti -p 8501:8501 -v ${pwd}:/app --rm selenium:latest /bin/bash
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Ashok_kumar
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,11 +1,139 @@
- ---
- title: Youtube-data Scraper
- emoji: 📈
- colorFrom: gray
- colorTo: purple
- sdk: docker
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Scraping YouTube Data
+ ## Scraping YouTube Video Likes with Selenium and Python & Building a Web Application (Streamlit)
+
+ **Introduction to Web Scraping:**
+
+ Web scraping is the automated process of extracting information or data from websites. It involves writing a script or using software to access and gather data from web pages, transforming unstructured data on the web into a structured format that can be analyzed, stored, or used in various applications.
+
+ **Web Scraping Process:**
+
+ - Access Websites: A script or program accesses web pages, mimicking human browsing behavior.
+ - Retrieve Data: It extracts specific information from these web pages.
+ - Organize Data: The extracted data is structured and saved in a usable format (like CSV, JSON, or a database).
+ - Fetching Data: The process starts with a request to a website, retrieving the HTML content.
+ - Parsing: The HTML content is parsed to identify and extract relevant information using techniques like Regular Expressions, XPath, or CSS selectors.
+ - Data Extraction: The desired data, such as text, images, links, or tables, is extracted from the parsed HTML.
+ - Storage/Analysis: Extracted data is stored locally or analyzed for insights, automation, or integration into other systems. A minimal sketch of this pipeline follows below.
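+ As a quick illustration of that fetch → parse → extract → store loop, here is a minimal, self-contained sketch (the URL and selectors are placeholders, not part of this project):
+
+ import csv
+ import requests
+ from bs4 import BeautifulSoup
+
+ # Fetch: request the page and retrieve the raw HTML (placeholder URL)
+ html = requests.get("https://example.com", timeout=30).text
+
+ # Parse: build a DOM tree that can be queried with CSS selectors
+ soup = BeautifulSoup(html, "lxml")
+
+ # Extract: pull every link's text and target into structured records
+ rows = [{"text": a.get_text(strip=True), "href": a.get("href")}
+         for a in soup.select("a[href]")]
+
+ # Store: write the structured records to a CSV file
+ with open("links.csv", "w", newline="", encoding="utf-8") as f:
+     writer = csv.DictWriter(f, fieldnames=["text", "href"])
+     writer.writeheader()
+     writer.writerows(rows)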
+
+ **What is Selenium?**
+
+ Selenium scraping refers to using the Selenium framework, primarily employed for automating web browsers, to extract data from websites. It's a powerful tool used in web scraping to simulate human interaction with a web page by controlling a browser programmatically.
+
+ **Tools Required**
+
+ To get started, ensure you have the following tools installed:
+ - Python: the programming language used for scripting.
+ - Selenium WebDriver: a tool for controlling web browsers programmatically.
+ - Streamlit: used to build and deploy the web app.
+
+ Here's how it works:
+ 1. Automating Web Browsers: Selenium allows you to control a web browser (like Chrome, Firefox, or others) programmatically. It mimics human interaction by opening web pages, clicking buttons, filling forms, and navigating across different pages.
+ 2. Data Extraction: Once the browser is directed to a particular webpage, Selenium enables the extraction of desired data. This can include scraping text, images, tables, or any other content from the webpage.
+ 3. Scraping Dynamic Content: Selenium is particularly useful for scraping websites with dynamic content that can't be easily accessed using traditional scraping libraries.
+ 4. Complex Scraping Scenarios: Selenium is versatile and can handle complex scraping tasks that involve interactions such as login processes, submitting forms, scrolling through infinite scroll pages, or dealing with content behind logins or captchas. A short sketch of such an interaction follows below.
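+ For example, a scripted login flow might look like the following (a hypothetical sketch; the URL and element names are placeholders):
+
+ from selenium import webdriver
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.chrome.options import Options
+
+ options = Options()
+ options.add_argument("--headless=new")  # run Chrome without a visible window
+ driver = webdriver.Chrome(options=options)
+
+ driver.get("https://example.com/login")                              # open the page (placeholder URL)
+ driver.find_element(By.NAME, "username").send_keys("demo")           # fill a form field (hypothetical names)
+ driver.find_element(By.NAME, "password").send_keys("secret")
+ driver.find_element(By.CSS_SELECTOR, "button[type=submit]").click()  # click a button
+ print(driver.title)                                                  # inspect the result
+ driver.quit()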
+ **Import Libraries:**
+
+ import time
+ import pprint
+ import csv
+ import selenium
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.support.wait import WebDriverWait
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.chrome.options import Options
+ from youtube_comment_scraper_python import *
+ import pandas as pd
+ import plotly.express as px
+ import re
+ import streamlit as st
+
+ **Kickstart with Selenium WebDriver:**
+
+ The Selenium WebDriver is a key component of the Selenium framework, designed to facilitate the interaction between your code and web browsers. It allows you to automate the testing of web applications and perform web scraping tasks by controlling browsers programmatically.
+
+ driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
+
+ url = st.text_input('Paste the Youtube Channel Link', "")
+ if not url:
+     st.warning('Please input a Link.')
+     st.stop()
+ st.success('Thank you for inputting a link.')
+ name = re.compile(r"[A-Z]\w+")
+ inp = name.findall(url)
+ out = inp[0]
+ st.write('Getting Data from', out, 'channel')
+
+ driver.get(url)
+ # url = input('Enter Youtube Video Url- ')  # console variant; input() would block a Streamlit app
+ # driver.get(url)
+ # e.g. "https://www.youtube.com/@YasoobKhalid/videos"
+ # channel_title = driver.find_element(By.XPATH, '//yt-formatted-string[contains(@class, "ytd-channel-name")]').text
+ handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
+ subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text
+ WAIT_IN_SECONDS = 5
+ last_height = driver.execute_script("return document.documentElement.scrollHeight")
+
+ while True:
+     # Scroll to the bottom of the page
+     driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
+     # Wait for new videos to show up
+     time.sleep(WAIT_IN_SECONDS)
+
+     # Calculate the new document height and compare it with the last height
+     new_height = driver.execute_script("return document.documentElement.scrollHeight")
+     if new_height == last_height:
+         break
+     last_height = new_height
+
+ thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
+ views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
+ titles = driver.find_elements(By.ID, "video-title")
+ links = driver.find_elements(By.ID, "video-title-link")
+ # likes = driver.find_elements(By.ID, "video-title-link-likes")
+
+ **Extracting Channel Information:**
+
+ YouTube channels hold a wealth of information, from engaging content to vital statistics that provide insights into their popularity. In this guide, we'll explore how to programmatically extract key details like the channel's title, views, thumbnail, and link using Python's web scraping tools.
+
+ Extracting the title, views, thumbnail, and link of the YouTube channel:
+ - Channel Title: Locate the HTML element containing the channel's title.
+ - Channel Views: Find and extract the total number of views the channel has amassed.
+ - Thumbnail URL: Extract the URL of the channel's thumbnail image.
+ - Channel Link: Obtain the link to the YouTube channel.
+
+ videos = []
+ for title, view, thumb, link in zip(titles, views, thumbnails, links):
+     video_dict = {
+         'title': title.text,
+         'views': view.text,
+         # 'likes': likes.text,
+         'thumbnail': thumb.get_attribute('src'),
+         'link': link.get_attribute('href')
+     }
+     videos.append(video_dict)
+
+ print(videos)
+
+ **Storing Scraped Data in CSV Format:**
+
+ videos is a list of dictionaries containing the data to be written, assigned to the variable to_csv.
+ csv.DictWriter is a class within Python's csv module that facilitates writing data from dictionaries into CSV files. It's particularly useful when you have data organized in a dictionary format and want to export it into a CSV file with well-defined headers.
+ The code uses the csv module to write the data to a CSV file named data.csv. Then it uses the pandas library (pd) to read that file back into a DataFrame (df) with pd.read_csv() and display it in the app.
+
+ to_csv = videos
+ keys = to_csv[0].keys()
+
+ with open(r'C:/Users/ashok/OneDrive/Desktop/WebScrap/Youtube/output/data.csv', 'w', newline='', encoding='utf-8') as output_file:
+     dict_writer = csv.DictWriter(output_file, keys)
+     dict_writer.writeheader()
+     dict_writer.writerows(to_csv)
+ df = pd.read_csv(r'C:/Users/ashok/OneDrive/Desktop/WebScrap/Youtube/output/data.csv')
+ st.dataframe(df)
+
+ **Streamlit App Development and Deployment:**
+
+ Streamlit is a Python library for creating web applications with minimal effort ([Streamlit • A faster way to build and share data apps](https://streamlit.io)):
+ 1. Rapid Development: Enables building interactive web apps using simple Python scripts.
+ 2. Data Visualization: Seamlessly integrates with popular data science libraries like Pandas, Matplotlib, and Plotly for quick data visualization.
+ 3. Automatic Updates: Auto-refreshes the app when code changes are detected, providing a smooth development experience.
+ 4. Custom Components: Supports custom HTML, CSS, and JavaScript for advanced customization.
+ 5. Deployment: Supports deployment to various platforms, including Streamlit sharing, Heroku, or other cloud providers. A minimal app is sketched below.
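+ For a sense of how little code an app needs, here is a minimal runnable sketch (hypothetical toy data; launch it with `streamlit run app.py`):
+
+ import pandas as pd
+ import streamlit as st
+
+ st.title("Hello, Streamlit")                 # page title
+ df = pd.DataFrame({"video": ["a", "b", "c"],
+                    "views": [120, 350, 90]})  # toy data, purely illustrative
+ st.dataframe(df)                             # interactive table in one call
+ st.bar_chart(df.set_index("video"))          # quick chart in one more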
+
+ Scraping YouTube Data using Selenium and Python - YouTube
+
+ **Conclusion**
+
+ Automating the extraction of YouTube channel details using Python and web scraping techniques can save time and provide valuable insights. By harnessing the power of libraries like Selenium, you can effortlessly retrieve crucial statistics like the channel's title, views, thumbnail, and link for further analysis or integration into your projects.
+ Start exploring and extracting valuable data from YouTube channels effortlessly with Python!
app.py ADDED
@@ -0,0 +1,95 @@
+ import time
+ import pprint
+ import csv
+ import selenium
+ import chromedriver_autoinstaller  # needed for the install() call below
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.support.wait import WebDriverWait
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.chrome.options import Options
+ from youtube_comment_scraper_python import *
+ import pandas as pd
+ import plotly.express as px
+ import re
+ import streamlit as st
+
+ st.title('Youtube WebScrap⛏️')
+
+ # ------------------------------ CHANNEL DATA ------------------------------
+
+ chromedriver_autoinstaller.install()  # puts a matching chromedriver on PATH
+ # driver = webdriver.Chrome('/usr/bin/google-chrome')
+ chrome_path = '/usr/bin/google-chrome'
+ # Set up Chrome options if needed
+ chrome_options = webdriver.ChromeOptions()
+ chrome_options.binary_location = chrome_path  # points at the Chrome binary, not the driver
+ # Create the WebDriver instance (chromedriver is found on PATH)
+ driver = webdriver.Chrome(options=chrome_options)
+ # driver = webdriver.Chrome()
+ url = st.text_input('Paste the Youtube Channel Link', "")
+ if not url:
+     st.warning('Please input a Link.')
+     st.stop()
+ st.success('Thank you for inputting a link.')
+ # url = 'https://www.youtube.com/@YasoobKhalid/videos'
+ name = re.compile(r"[A-Z]\w+")
+ inp = name.findall(url)
+ out = inp[0]
+ st.write('Getting Data from', out, 'channel')
+
+ driver.get(url)
+
+ # url = input('Enter Youtube Video Url- ')  # console variant; input() would block a Streamlit app
+ # driver.get(url)
+ # e.g. "https://www.youtube.com/@YasoobKhalid/videos"
+ # channel_title = driver.find_element(By.XPATH, '//yt-formatted-string[contains(@class, "ytd-channel-name")]').text
+ handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
+ subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text
+
+ WAIT_IN_SECONDS = 5
+ last_height = driver.execute_script("return document.documentElement.scrollHeight")
+
+ while True:
+     # Scroll to the bottom of the page
+     driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
+     # Wait for new videos to show up
+     time.sleep(WAIT_IN_SECONDS)
+
+     # Calculate the new document height and compare it with the last height
+     new_height = driver.execute_script("return document.documentElement.scrollHeight")
+     if new_height == last_height:
+         break
+     last_height = new_height
+
+ thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
+ views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
+ titles = driver.find_elements(By.ID, "video-title")
+ links = driver.find_elements(By.ID, "video-title-link")
+ # likes = driver.find_elements(By.ID, "video-title-link-likes")
+
+ videos = []
+ for title, view, thumb, link in zip(titles, views, thumbnails, links):
+     video_dict = {
+         'title': title.text,
+         'views': view.text,
+         # 'likes': likes.text,
+         'thumbnail': thumb.get_attribute('src'),
+         'link': link.get_attribute('href')
+     }
+     videos.append(video_dict)
+
+ print(videos)
+
+ to_csv = videos
+ keys = to_csv[0].keys()
+
+ with open(r'C:/Users/ashok/OneDrive/Desktop/WebScrap/Youtube/output/people.csv', 'w', newline='', encoding='utf-8') as output_file:
+     dict_writer = csv.DictWriter(output_file, keys)
+     dict_writer.writeheader()
+     dict_writer.writerows(to_csv)
+ df = pd.read_csv(r'C:/Users/ashok/OneDrive/Desktop/WebScrap/Youtube/output/people.csv')
+ st.dataframe(df)
apps/Youtube_Scraper.py ADDED
@@ -0,0 +1,89 @@
+ import time
+ import pprint
+ import csv
+ import selenium
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.support.wait import WebDriverWait
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.chrome.options import Options
+ from youtube_comment_scraper_python import *
+ import pandas as pd
+ import plotly.express as px
+ import re
+ import streamlit as st
+
+ st.title('Youtube WebScrap⛏️')
+
+ # ------------------------------ CHANNEL DATA ------------------------------
+
+ driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
+
+ url = st.text_input('Paste the Youtube Channel Link', "")
+ if not url:
+     st.warning('Please input a Link.')
+     st.stop()
+ st.success('Thank you for inputting a link.')
+ # url = 'https://www.youtube.com/@YasoobKhalid/videos'
+ name = re.compile(r"[A-Z]\w+")
+ inp = name.findall(url)
+ out = inp[0]
+ st.write('Getting Data from', out, 'channel')
+
+ driver.get(url)
+
+ # url = input('Enter Youtube Video Url- ')  # console variant; input() would block a Streamlit app
+ # driver.get(url)
+ # e.g. "https://www.youtube.com/@YasoobKhalid/videos"
+ # channel_title = driver.find_element(By.XPATH, '//yt-formatted-string[contains(@class, "ytd-channel-name")]').text
+ handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
+ subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text
+
+ WAIT_IN_SECONDS = 5
+ last_height = driver.execute_script("return document.documentElement.scrollHeight")
+
+ while True:
+     # Scroll to the bottom of the page
+     driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
+     # Wait for new videos to show up
+     time.sleep(WAIT_IN_SECONDS)
+
+     # Calculate the new document height and compare it with the last height
+     new_height = driver.execute_script("return document.documentElement.scrollHeight")
+     if new_height == last_height:
+         break
+     last_height = new_height
+
+ thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
+ views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
+ titles = driver.find_elements(By.ID, "video-title")
+ links = driver.find_elements(By.ID, "video-title-link")
+ # likes = driver.find_elements(By.ID, "video-title-link-likes")
+
+ videos = []
+ for title, view, thumb, link in zip(titles, views, thumbnails, links):
+     video_dict = {
+         'title': title.text,
+         'views': view.text,
+         # 'likes': likes.text,
+         'thumbnail': thumb.get_attribute('src'),
+         'link': link.get_attribute('href')
+     }
+     videos.append(video_dict)
+
+ print(videos)
+
+ to_csv = videos
+ keys = to_csv[0].keys()
+
+ with open(r'C:/Users/ashok/OneDrive/Desktop/WebScrap/Youtube/output/people.csv', 'w', newline='', encoding='utf-8') as output_file:
+     dict_writer = csv.DictWriter(output_file, keys)
+     dict_writer.writeheader()
+     dict_writer.writerows(to_csv)
+ df = pd.read_csv(r'C:/Users/ashok/OneDrive/Desktop/WebScrap/Youtube/output/people.csv')
+ st.dataframe(df)
apps/pages/Youtube_Comments_analysis.py ADDED
@@ -0,0 +1,49 @@
+ import time
+ import pprint
+ import csv
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.common.by import By
+ from youtube_comment_scraper_python import *
+ import pandas as pd
+ import plotly.express as px
+ import re
+ import streamlit as st
+
+ st.markdown("# Page 3 🎉")
+ st.sidebar.markdown("# Page 3 🎉")
+
+ # url = input('Enter Youtube Video Url- ')
+ # youtube.open(url)
+ # youtube.keypress("pagedown")
+
+ # data = []
+ # currentpagesource = youtube.get_page_source()
+ # lastpagesource = ''
+
+ # while True:
+ #     if lastpagesource == currentpagesource:
+ #         break
+
+ #     lastpagesource = currentpagesource
+ #     response = youtube.video_comments()
+
+ #     for c in response['body']:
+ #         data.append(c)
+
+ #     youtube.scroll()
+ #     currentpagesource = youtube.get_page_source()
+
+ # df = pd.DataFrame(data)
+ # df = df.replace('\n', ' ', regex=True)
+ # df = df[['Comment', 'Likes']].drop_duplicates(keep="first")
+ # # df = df[['Likes']].drop_duplicates(keep="first")
+ # df.to_csv('output/data.csv', index=False)
+ # df.head()
apps/pages/Youtube_analysis.py ADDED
@@ -0,0 +1,38 @@
+ import time
+ import pprint
+ import csv
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.common.by import By
+ from youtube_comment_scraper_python import *
+ import pandas as pd
+ import plotly.express as px
+ import re
+ import streamlit as st
+ import numpy as np
+
+ st.title('Youtube Channel Analysis📈')
+
+ df = pd.read_csv(r'C:/Users/ashok/OneDrive/Desktop/WebScrap/Youtube/output/people.csv')
+ st.dataframe(df)
+
+ count = st.slider('Select Lower Video Count', 0, len(df), 100)
+ st.write("You selected", count, 'Videos')
+
+ fig = px.bar(df[:count],
+              x="title",
+              y="views", height=1000)
+ fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
+ # fig.update_yaxes(tickvals=['10k', '22k', '29k', '56k'])
+ tab1, tab2 = st.tabs(["Streamlit theme (default)", "Plotly native theme"])
+ with tab1:
+     # Use the Streamlit theme (the default, so the theme argument could be omitted)
+     st.plotly_chart(fig, theme="streamlit", use_container_width=True)
+ with tab2:
+     # Use the native Plotly theme
+     st.plotly_chart(fig, theme=None, use_container_width=True)
code/test.py ADDED
@@ -0,0 +1,10 @@
+ import pandas as pd
+
+ df = pd.read_csv(r'C:/Users/ashok/OneDrive/Desktop/WebScrap/Youtube/output/people.csv')
+ # st.dataframe(df)
+
+ # print(df[0:1, :])
+
+ # for i in range(len(df)):
+ #     print(i)
code/youtube1.py ADDED
@@ -0,0 +1,140 @@
+ import time
+ import pprint
+ import csv
+ from selenium import webdriver
+ from selenium.webdriver.chrome.service import Service
+ from webdriver_manager.chrome import ChromeDriverManager
+ from selenium.webdriver.common.by import By
+ from youtube_comment_scraper_python import *
+ import pandas as pd
+ import plotly.express as px
+ import re
+ import streamlit as st
+
+ st.title('Youtube Channel Analysis')
+ st.write('Youtube WebScrap')
+
+ # ------------------------------ CHANNEL DATA ------------------------------
+
+ driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
+
+ url = st.text_input('Paste the Youtube Channel Link', "")
+ if not url:
+     st.warning('Please input a Link.')
+     st.stop()
+ st.success('Thank you for inputting a link.')
+ # url = 'https://www.youtube.com/@YasoobKhalid/videos'
+ name = re.compile(r"[A-Z]\w+")
+ inp = name.findall(url)
+ out = inp[0]
+ st.write('Getting Data from', out, 'channel')
+ driver.get(url)
+
+ # url = input('Enter Youtube Video Url- ')
+ # driver.get(url)
+ # e.g. "https://www.youtube.com/@YasoobKhalid/videos"
+ # channel_title = driver.find_element(By.XPATH, '//yt-formatted-string[contains(@class, "ytd-channel-name")]').text
+ handle = driver.find_element(By.XPATH, '//yt-formatted-string[@id="channel-handle"]').text
+ subscriber_count = driver.find_element(By.XPATH, '//yt-formatted-string[@id="subscriber-count"]').text
+
+ WAIT_IN_SECONDS = 5
+ last_height = driver.execute_script("return document.documentElement.scrollHeight")
+
+ while True:
+     # Scroll to the bottom of the page
+     driver.execute_script("window.scrollTo(0, arguments[0]);", last_height)
+     # Wait for new videos to show up
+     time.sleep(WAIT_IN_SECONDS)
+
+     # Calculate the new document height and compare it with the last height
+     new_height = driver.execute_script("return document.documentElement.scrollHeight")
+     if new_height == last_height:
+         break
+     last_height = new_height
+
+ thumbnails = driver.find_elements(By.XPATH, '//a[@id="thumbnail"]/yt-image/img')
+ views = driver.find_elements(By.XPATH, '//div[@id="metadata-line"]/span[1]')
+ titles = driver.find_elements(By.ID, "video-title")
+ links = driver.find_elements(By.ID, "video-title-link")
+ # likes = driver.find_elements(By.ID, "video-title-link-likes")
+
+ videos = []
+ for title, view, thumb, link in zip(titles, views, thumbnails, links):
+     video_dict = {
+         'title': title.text,
+         'views': view.text,
+         # 'likes': likes.text,
+         'thumbnail': thumb.get_attribute('src'),
+         'link': link.get_attribute('href')
+     }
+     videos.append(video_dict)
+
+ print(videos)
+
+ to_csv = videos
+ keys = to_csv[0].keys()
+
+ with open('output/people.csv', 'w', newline='', encoding='utf-8') as output_file:
+     dict_writer = csv.DictWriter(output_file, keys)
+     dict_writer.writeheader()
+     dict_writer.writerows(to_csv)
+ df = pd.read_csv('output/people.csv')
+ st.dataframe(df)
+
+ count = st.slider('Select Lower Video Count', 0, 607, 100)
+ st.write("You selected", count, 'Videos')
+
+ fig = px.bar(df,
+              x="title",
+              y="views", height=600)
+ fig.update_traces(textfont_size=12, textangle=0, textposition="outside", cliponaxis=False)
+ # fig.update_yaxes(tickvals=['10k', '22k', '29k', '56k'])
+ tab1, tab2 = st.tabs(["Streamlit theme (default)", "Plotly native theme"])
+ with tab1:
+     # Use the Streamlit theme (the default, so the theme argument could be omitted)
+     st.plotly_chart(fig, theme="streamlit", use_container_width=True)
+ with tab2:
+     # Use the native Plotly theme
+     st.plotly_chart(fig, theme=None, use_container_width=True)
+
+ # ------------------------------ COMMENTS ------------------------------
+
+ # url = input('Enter Youtube Video Url- ')
+ # youtube.open(url)
+ # youtube.keypress("pagedown")
+
+ # data = []
+ # currentpagesource = youtube.get_page_source()
+ # lastpagesource = ''
+
+ # while True:
+ #     if lastpagesource == currentpagesource:
+ #         break
+
+ #     lastpagesource = currentpagesource
+ #     response = youtube.video_comments()
+
+ #     for c in response['body']:
+ #         data.append(c)
+
+ #     youtube.scroll()
+ #     currentpagesource = youtube.get_page_source()
+
+ # df = pd.DataFrame(data)
+ # df = df.replace('\n', ' ', regex=True)
+ # df = df[['Comment', 'Likes']].drop_duplicates(keep="first")
+ # # df = df[['Likes']].drop_duplicates(keep="first")
+ # df.to_csv('output/data.csv', index=False)
+ # df.head()
code/youtube2.py ADDED
@@ -0,0 +1,26 @@
+ from bs4 import BeautifulSoup  # for parsing the HTML
+ import requests                # for fetching the page
+ import pandas as pd            # (optional) Pandas for dataframes
+ import json                    # (optional) if you want to export JSON
+ import os
+
+ # NOTE: the class names below target YouTube's legacy desktop layout and
+ # may not match the markup YouTube serves today.
+ url = input('Enter Youtube Video Url- ')  # user input for the link
+ Vid = {}
+ Link = url
+ source = requests.get(url).text
+ soup = BeautifulSoup(source, 'lxml')
+ div_s = soup.find_all('div')
+ Title = div_s[1].find('span', class_='watch-title').text.strip()
+ Vid['Title'] = Title
+ Vid['Link'] = Link
+ Channel_name = div_s[1].find('a', class_="yt-uix-sessionlink spf-link").text.strip()
+ Channel_link = 'www.youtube.com' + div_s[1].find('a', class_="yt-uix-sessionlink spf-link").get('href')
+ Subscribers = div_s[1].find('span', class_="yt-subscription-button-subscriber-count-branded-horizontal yt-subscriber-count").text.strip()
+ if len(Channel_name) == 0:
+     Channel_name = 'None'
+     Channel_link = 'None'
+     Subscribers = 'None'
+ Vid['Channel'] = Channel_name
+ Vid['Channel_link'] = Channel_link
+ Vid['Channel_subscribers'] = Subscribers
render.yaml ADDED
@@ -0,0 +1,38 @@
+ services:
+   - name: web
+     env:
+       - key: CHROME_BIN
+         value: /usr/bin/google-chrome
+ # NOTE: everything below is Dockerfile syntax, not Render YAML; it belongs
+ # in a Dockerfile referenced by the service definition above.
+ # Use an official Python runtime as a parent image
+ FROM python:3.8-slim
+
+ # Set the working directory in the container
+ WORKDIR /app
+
+ # Copy the current directory contents into the container at /app
+ COPY . /app
+
+ # Install any needed packages specified in requirements.txt
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Install Chrome and ChromeDriver
+ RUN apt-get update && apt-get install -y \
+     wget \
+     unzip \
+     && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
+     && echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
+     && apt-get update && apt-get install -y \
+     google-chrome-stable \
+     && wget https://chromedriver.storage.googleapis.com/94.0.4606.61/chromedriver_linux64.zip \
+     && unzip chromedriver_linux64.zip \
+     && mv chromedriver /usr/local/bin \
+     && rm chromedriver_linux64.zip
+
+ # Make port 80 available to the world outside this container
+ EXPOSE 80
+
+ # Define environment variable
+ ENV NAME World
+
+ # Run app.py when the container launches
+ CMD ["python", "app.py"]
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ selenium==4.9.1
+ webdriver-manager==3.8.6
+ youtube-comment-scraper-python==1.0.0
+ plotly==5.14.1
+ seleniumbase==4.14.12
+ undetected-chromedriver==3.4.7
+ streamlit==1.30.0
+ altair==5.0.1
+ chromedriver-autoinstaller==0.0.8