shauryat97 committed on
Commit
ea13678
·
1 Parent(s): 3704589

Create main_query_tube.py

Browse files
Files changed (1) hide show
  1. main_query_tube.py +74 -0
main_query_tube.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Third-party dependencies required by this module. Install them from a shell
# (or a requirements file) before importing it:
#
#   pip install -U youtube-transcript-api pytube "thirdai[neural_db]"
#
# The IPython-style ``!pip install ...`` shell escapes only work inside a
# notebook cell; in a plain .py module they are a SyntaxError.
4
+
5
+ from youtube_transcript_api import YouTubeTranscriptApi
6
+ import pandas as pd
7
+ from pytube import Playlist
8
+ import re
9
+ import time
10
+
11
def get_video_links_from_playlist(playlist_url):
    """Return the watch URLs of the videos in a playlist.

    A pytube ``Playlist`` is built from ``playlist_url`` and the
    ``watch_url`` attribute of each item in ``Playlist.videos`` is collected
    in iteration order.

    Args:
        playlist_url: URL of the playlist to read.

    Returns:
        A list with one ``watch_url`` value per video.
    """
    watch_urls = []
    for entry in Playlist(playlist_url).videos:
        watch_urls.append(entry.watch_url)
    return watch_urls
16
+
17
def extract_video_id(video_url):
    """Extract the 11-character YouTube video ID from a URL.

    The ID is the run of 11 word characters or hyphens that follows one of
    the recognised markers: ``v=``, ``v/``, ``embed/``, ``youtu.be/``,
    ``/v/``, ``/e/``, ``watch?v=`` or a ``youtube.com/user/...#.../`` path.

    Args:
        video_url: URL string to search.

    Returns:
        The video ID as a string, or ``None`` when no marker followed by an
        11-character ID is found.
    """
    # The dots in the host names are escaped so they match a literal "."
    # only; unescaped, "youtu.be" would also accept e.g. "youtuXbe".
    # Non-capturing groups leave the ID as the single captured group.
    pattern = (
        r"(?:v=|v/|embed/|youtu\.be/|/v/|/e/|watch\?v="
        r"|youtube\.com/user/[^#]*#(?:[^/]*?/)*)"
        r"([\w-]{11})"
    )
    match = re.search(pattern, video_url)
    return match.group(1) if match else None
26
+
27
def create_csv(transcript_lst):
    """Flatten per-video transcript chunks into a single DataFrame.

    Args:
        transcript_lst: A list of lists of dicts. Each dict is expected to
            carry the keys ``text``, ``start``, ``duration`` and
            ``video_serial_number``.

    Returns:
        A DataFrame with the columns ``text``, ``start``, ``duration``,
        ``end_time`` (``start + duration``) and ``video_serial_number``, in
        that order. When there are no rows at all, an empty DataFrame with
        those columns is returned.
    """
    columns = ['text', 'start', 'duration', 'end_time', 'video_serial_number']
    flat_data = [item for sublist in transcript_lst for item in sublist]
    if not flat_data:
        # Without rows the frame has no 'start'/'duration' columns, so the
        # arithmetic below would raise KeyError; return an empty frame.
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(flat_data)
    df['end_time'] = df['start'] + df['duration']
    return df[columns]
34
+
35
def create_data(playlist_link):
    """Build a DataFrame of merged transcript chunks for a playlist.

    For each video of the playlist the transcript is fetched, consecutive
    transcript entries are merged four at a time, and every merged chunk is
    tagged with the 1-based position of its video in the playlist. Videos
    whose transcript cannot be fetched or processed are skipped.

    Args:
        playlist_link: URL of the playlist to process.

    Returns:
        The DataFrame produced by ``create_csv`` with newline characters in
        the ``text`` column replaced by spaces, or the integer ``0`` when no
        transcript chunk could be collected for any video.
    """
    import logging

    logger = logging.getLogger(__name__)

    video_id_lst = [
        extract_video_id(video_link)
        for video_link in get_video_links_from_playlist(playlist_link)
    ]

    transcript_lst = []
    for video_serial_number, video_id in enumerate(video_id_lst, start=1):
        try:
            # NOTE(review): recent youtube-transcript-api releases appear to
            # replace the static ``get_transcript`` with an instance
            # ``fetch`` method -- confirm against the installed version.
            video_transcript = YouTubeTranscriptApi.get_transcript(video_id)
            new_transcript = _merge_transcript_entries(
                video_transcript, video_serial_number
            )
        except Exception:
            # Best effort: a video without a usable transcript is skipped,
            # but the reason is logged instead of being silently discarded.
            logger.warning(
                "Skipping video %s (id=%r): transcript could not be processed",
                video_serial_number,
                video_id,
                exc_info=True,
            )
            continue
        if new_transcript:
            # Empty transcripts contribute no rows, so they are left out.
            transcript_lst.append(new_transcript)

    if not transcript_lst:
        return 0

    csv_file = create_csv(transcript_lst)
    # Replace real newline characters. With regex=False the pattern must be
    # the newline itself, not the two-character raw string r'\n'.
    csv_file['text'] = csv_file['text'].str.replace('\n', ' ', regex=False)
    return csv_file


def _merge_transcript_entries(video_transcript, video_serial_number, chunk_size=4):
    """Merge consecutive transcript entries into chunks of ``chunk_size``."""
    merged = []
    for chunk_start in range(0, len(video_transcript), chunk_size):
        chunk = video_transcript[chunk_start:chunk_start + chunk_size]
        merged.append({
            # Every piece of text is followed by one space, so the merged
            # text keeps a trailing space.
            'text': ''.join(entry['text'] + ' ' for entry in chunk),
            'start': chunk[0]['start'],
            'duration': sum(entry['duration'] for entry in chunk),
            'video_serial_number': video_serial_number,
        })
    return merged
74
+