rag-youtube-assistant / app /transcript_extractor.py
ganesh3's picture
first modification
dbd33b2
raw
history blame
2.83 kB
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
import re
import os
# YouTube Data API key, read from the YOUTUBE_API_KEY environment variable.
# When the variable is unset, the placeholder string below is used instead.
API_KEY = os.environ.get('YOUTUBE_API_KEY', 'YOUR_API_KEY_HERE')
# Module-level YouTube Data API v3 client, created once at import time and
# used by get_video_metadata() and get_channel_videos().
youtube = build('youtube', 'v3', developerKey=API_KEY)
# A YouTube video ID is exactly 11 characters drawn from this set.
_VIDEO_ID = r"[0-9A-Za-z_-]{11}"

# Tried in order: the ``v=`` query parameter first, then the path layouts that
# carry a video ID. The trailing negative lookahead rejects longer tokens whose
# first 11 characters merely look like an ID.
_VIDEO_ID_PATTERNS = (
    re.compile(r"(?:^|[?&#])v=(" + _VIDEO_ID + r")(?![0-9A-Za-z_-])"),
    re.compile(
        r"(?:youtu\.be/|/(?:embed|shorts|live|watch|v|e|vi)/)"
        r"(" + _VIDEO_ID + r")(?![0-9A-Za-z_-])"
    ),
)


def extract_video_id(url):
    """Return the 11-character YouTube video ID found in *url*, or None.

    Recognised inputs:
      * watch URLs carrying a ``v=`` query parameter,
      * ``youtu.be/<id>`` short links,
      * ``/embed/``, ``/shorts/``, ``/live/``, ``/watch/``, ``/v/``, ``/e/``
        and ``/vi/`` path forms,
      * a bare 11-character video ID (returned unchanged).

    Anything else (channel/user/playlist URLs, empty strings, non-string
    values) yields None instead of a truncated path segment.

    Args:
        url: URL or bare video ID to inspect.

    Returns:
        The video ID as a string, or None when no ID can be identified.
    """
    if not isinstance(url, str):
        return None

    text = url.strip()
    if not text:
        return None

    # Accept an already-extracted ID so callers may pass either form.
    if re.fullmatch(_VIDEO_ID, text):
        return text

    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(text)
        if match:
            return match.group(1)
    return None
def get_video_metadata(video_id):
    """Fetch basic metadata for one video from the YouTube Data API.

    Requests the ``snippet``, ``contentDetails`` and ``statistics`` parts for
    *video_id* through the module-level ``youtube`` client.

    Args:
        video_id: ID of the video to look up.

    Returns:
        A dict with the keys ``title``, ``author``, ``upload_date``,
        ``view_count``, ``like_count``, ``comment_count`` and ``duration``.
        Statistics missing from the response are reported as ``'N/A'``.
        Returns None when the API reports an HTTP error or returns no item
        for the ID.
    """
    # Only the network call can raise HttpError, so keep the try body minimal.
    try:
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=video_id,
        )
        response = request.execute()
    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred: {e.content}")
        return None

    items = response.get('items')
    if not items:
        return None

    video = items[0]
    snippet = video['snippet']
    # Any statistic may be absent from the response; treat them all the same
    # way instead of raising KeyError for viewCount only.
    statistics = video.get('statistics', {})
    return {
        'title': snippet['title'],
        'author': snippet['channelTitle'],
        'upload_date': snippet['publishedAt'],
        'view_count': statistics.get('viewCount', 'N/A'),
        'like_count': statistics.get('likeCount', 'N/A'),
        'comment_count': statistics.get('commentCount', 'N/A'),
        'duration': video['contentDetails']['duration'],
    }
def get_transcript(video_id):
    """Fetch the transcript of a video together with its metadata.

    The transcript is fetched first. The metadata lookup is a separate,
    independently guarded step so that a metadata failure does not discard a
    transcript that was retrieved successfully.

    Args:
        video_id: ID of the video to process.

    Returns:
        A dict ``{'transcript': ..., 'metadata': ...}`` where ``metadata`` is
        whatever get_video_metadata() returned, or None if that lookup
        failed. Returns None when the transcript itself cannot be fetched.
    """
    # Best-effort boundary: any failure to obtain the transcript is reported
    # and turned into a None result rather than propagated.
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        print(f"Error extracting transcript for video {video_id}: {str(e)}")
        return None

    # Metadata is optional; keep the transcript even if this lookup fails.
    try:
        metadata = get_video_metadata(video_id)
    except Exception as e:
        print(f"Error fetching metadata for video {video_id}: {str(e)}")
        metadata = None

    return {
        'transcript': transcript,
        'metadata': metadata,
    }
def get_channel_videos(channel_id, max_results=50):
    """List videos of a channel using the YouTube Data API search endpoint.

    Issues a single ``search.list`` request restricted to ``type="video"``;
    only the first page of results is returned.

    Args:
        channel_id: ID of the channel whose videos are listed.
        max_results: Maximum number of videos to request. Clamped to the
            0-50 range accepted by the API for one page. Defaults to 50.

    Returns:
        A list of dicts with the keys ``video_id``, ``title``,
        ``description`` and ``published_at``. Returns an empty list when the
        API reports an HTTP error or the response contains no items.
    """
    page_size = max(0, min(int(max_results), 50))

    try:
        request = youtube.search().list(
            part="id,snippet",
            channelId=channel_id,
            type="video",
            maxResults=page_size,
        )
        response = request.execute()
    except HttpError as e:
        print(f"An HTTP error {e.resp.status} occurred: {e.content}")
        return []

    # Skip any result that does not carry a video ID instead of raising.
    return [
        {
            'video_id': item['id']['videoId'],
            'title': item['snippet']['title'],
            'description': item['snippet']['description'],
            'published_at': item['snippet']['publishedAt'],
        }
        for item in response.get('items', [])
        if 'videoId' in item.get('id', {})
    ]
def process_videos(video_ids):
    """Collect transcript data for every video ID that yields a result.

    Calls get_transcript() once per entry of *video_ids*, in order, and keeps
    only the truthy results.

    Args:
        video_ids: Iterable of video IDs.

    Returns:
        A dict mapping each video ID to the data returned by
        get_transcript(); IDs whose lookup returned a falsy value are left
        out.
    """
    fetched = ((vid, get_transcript(vid)) for vid in video_ids)
    return {vid: data for vid, data in fetched if data}