Spaces:
Runtime error
Runtime error
Commit
·
ea13678
1
Parent(s):
3704589
Create main_query_tube.py
Browse files- main_query_tube.py +74 -0
main_query_tube.py
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# NOTE: `!pip install ...` is IPython/Jupyter shell-magic syntax and raises a
# SyntaxError when this file is run or imported as a plain Python module.
# Install the dependencies outside the module instead (for example by listing
# them in requirements.txt):
#   pip install -U youtube-transcript-api
#   pip install -U pytube
#   pip install -U "thirdai[neural_db]"
|
4 |
+
|
5 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
6 |
+
import pandas as pd
|
7 |
+
from pytube import Playlist
|
8 |
+
import re
|
9 |
+
import time
|
10 |
+
|
11 |
+
def get_video_links_from_playlist(playlist_url):
    """Return the watch URLs of the videos in a playlist.

    A pytube ``Playlist`` is built from ``playlist_url`` and the ``watch_url``
    attribute of each item in ``playlist.videos`` is collected, in iteration
    order, into a list.
    """
    watch_urls = []
    for entry in Playlist(playlist_url).videos:
        watch_urls.append(entry.watch_url)
    return watch_urls
|
16 |
+
|
17 |
+
def extract_video_id(video_url):
    """Extract the 11-character YouTube video ID from a URL.

    Args:
        video_url: URL string to search (e.g. ``watch?v=``, ``youtu.be/``,
            ``embed/`` or ``/v/`` style links).

    Returns:
        The 11-character ID (word characters or ``-``) that follows one of
        the recognised URL markers, or ``None`` when no marker is found.
    """
    # The dots in the host names are escaped so they only match a literal
    # "."; unescaped, "youtu.be" would also accept e.g. "youtuXbe".
    pattern = (
        r"(?:v=|v/|embed/|youtu\.be/|/v/|/e/|watch\?v=|"
        r"youtube\.com/user/[^#]*#(?:[^/]*?/)*)"
        r"([\w-]{11})"
    )
    match = re.search(pattern, video_url)
    return match.group(1) if match else None
|
26 |
+
|
27 |
+
def create_csv(transcript_lst):
    """Flatten per-video transcript chunks into a single DataFrame.

    Despite the name, no file is written; a ``pandas.DataFrame`` is returned.

    Args:
        transcript_lst: list of lists of dicts, each dict carrying the keys
            ``text``, ``start``, ``duration`` and ``video_serial_number``.

    Returns:
        DataFrame with columns ``text``, ``start``, ``duration``,
        ``end_time`` (``start + duration``) and ``video_serial_number``.
        When there are no rows at all, an empty DataFrame with those
        columns is returned.
    """
    columns = ['text', 'start', 'duration', 'end_time', 'video_serial_number']
    flat_data = [item for sublist in transcript_lst for item in sublist]
    if not flat_data:
        # An empty frame has no 'start'/'duration' columns, so the arithmetic
        # below would raise KeyError; return the expected schema instead.
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame(flat_data)
    df['end_time'] = df['start'] + df['duration']
    return df[columns]
|
34 |
+
|
35 |
+
def _group_transcript_entries(video_transcript, video_serial_number, group_size=4):
    """Merge consecutive transcript entries into chunks of ``group_size``."""
    chunks = []
    for first in range(0, len(video_transcript), group_size):
        group = video_transcript[first:first + group_size]
        chunks.append({
            # Every caption is followed by a single space, including the
            # last one in the chunk.
            'text': ''.join(entry['text'] + ' ' for entry in group),
            'start': group[0]['start'],
            'duration': sum(entry['duration'] for entry in group),
            'video_serial_number': video_serial_number,
        })
    return chunks


def create_data(playlist_link):
    """Build a DataFrame of chunked transcripts for a playlist.

    Each video's transcript is fetched with
    ``YouTubeTranscriptApi.get_transcript`` and merged into chunks of four
    consecutive entries. Videos whose ID cannot be extracted or whose
    transcript cannot be fetched are skipped (best effort).

    Args:
        playlist_link: playlist URL passed to
            ``get_video_links_from_playlist``.

    Returns:
        The DataFrame produced by ``create_csv`` with newlines in the
        ``text`` column replaced by spaces, or ``0`` when no transcript
        chunks could be collected.
    """
    import logging  # local import: not part of the file's import block

    logger = logging.getLogger(__name__)

    video_links = get_video_links_from_playlist(playlist_link)
    video_ids = [extract_video_id(link) for link in video_links]

    transcript_lst = []
    # video_serial_number is the 1-based position in the playlist, so it is
    # kept stable even when earlier videos are skipped.
    for video_serial_number, video_id in enumerate(video_ids, start=1):
        if video_id is None:
            logger.warning(
                "Skipping video %d: no video ID found in URL",
                video_serial_number,
            )
            continue
        try:
            video_transcript = YouTubeTranscriptApi.get_transcript(video_id)
            chunks = _group_transcript_entries(
                video_transcript, video_serial_number
            )
        except Exception as exc:
            # Best effort per video: one failing transcript must not abort
            # the whole playlist. KeyboardInterrupt/SystemExit still propagate.
            logger.warning(
                "Skipping video %d (%s): %s",
                video_serial_number, video_id, exc,
            )
            continue
        if chunks:
            transcript_lst.append(chunks)

    if not transcript_lst:
        return 0

    # make dataframe from transcript list
    csv_file = create_csv(transcript_lst)
    # Literal newline with regex=False: behaves the same on every pandas
    # version, whereas r'\n' only matched newlines while regex=True was the
    # default.
    csv_file['text'] = csv_file['text'].str.replace('\n', ' ', regex=False)
    return csv_file
|
74 |
+
|