Spaces:

ganesh3
/

rag-youtube-assistant

Running

App Files Files Community

rag-youtube-assistant / app /main.py

ganesh3

forth commit

a61b32e 9 months ago

raw

history blame

18.5 kB

	import streamlit as st
	import pandas as pd
	from transcript_extractor import get_transcript, get_youtube_client, extract_video_id, get_channel_videos, test_api_key, initialize_youtube_api
	from data_processor import DataProcessor
	from database import DatabaseHandler
	from rag import RAGSystem
	from query_rewriter import QueryRewriter
	from evaluation import EvaluationSystem
	from generate_ground_truth import generate_ground_truth, generate_ground_truth_for_all_videos
	from sentence_transformers import SentenceTransformer
	import os
	import sys
	import logging

	# Configure logging once for the whole app. logging.basicConfig() is a no-op
	# when the root logger already has handlers, so a second call with a different
	# level is silently ignored; only the effective DEBUG configuration is kept.
	logging.basicConfig(level=logging.DEBUG)
	logger = logging.getLogger(__name__)

	@st.cache_resource
	def init_components():
	    """Construct the service objects shared by every part of the app.

	    Returns:
	        A 5-tuple ``(db_handler, data_processor, rag_system, query_rewriter,
	        evaluation_system)``. When any constructor raises, the error is logged
	        and shown in the UI and a tuple of five ``None`` values is returned
	        instead, so callers must check the elements, not the tuple itself.
	    """
	    try:
	        database = DatabaseHandler()
	        processor = DataProcessor()
	        rag = RAGSystem(processor)
	        rewriter = QueryRewriter()
	        evaluator = EvaluationSystem(processor, database)
	    except Exception as e:
	        failure_message = f"Error initializing components: {e}"
	        logger.error(failure_message)
	        st.error(failure_message)
	        st.error("Please check your configuration and ensure all services are running.")
	        return None, None, None, None, None
	    else:
	        logger.info("Components initialized successfully")
	        return database, processor, rag, rewriter, evaluator


	def check_api_key():
	    """Report whether the YouTube API key works and let the user supply one.

	    When the configured key fails ``test_api_key()``, a text input is shown.
	    A key entered there is exported as ``YOUTUBE_API_KEY``, the cached YouTube
	    client is cleared, and the key is re-tested. Only a key that passes is
	    appended to ``.env`` and followed by a script rerun.

	    An entered key that fails is neither persisted nor followed by a rerun:
	    the text input keeps its value across reruns, so rerunning on failure
	    would append the same bad key to ``.env`` and rerun again, endlessly.
	    """
	    if test_api_key():
	        st.success("YouTube API key is valid and working.")
	        return

	    st.error("YouTube API key is invalid or not set. Please check your .env file.")
	    new_api_key = (st.text_input("Enter your YouTube API key:") or "").strip()
	    if not new_api_key:
	        return

	    os.environ['YOUTUBE_API_KEY'] = new_api_key
	    get_youtube_client.cache_clear()  # Clear the cache to force reinitialization
	    if not test_api_key():
	        st.error("Failed to reinitialize YouTube client. Please check your API key.")
	        return

	    with open('.env', 'a', encoding='utf-8') as f:
	        f.write(f"\nYOUTUBE_API_KEY={new_api_key}")
	    st.success("API key saved. Reinitializing YouTube client...")
	    st.success("YouTube client reinitialized successfully.")

	    # st.experimental_rerun was removed from recent Streamlit releases; prefer
	    # st.rerun and fall back to the old name on older versions.
	    rerun = getattr(st, "rerun", None) or st.experimental_rerun
	    rerun()

	# LLM-as-a-judge prompt template.
	# Carries two placeholders, {question} and {answer_llm}; the doubled braces
	# ({{ and }}) around the JSON examples are presumably there so they survive
	# str.format-style substitution as literal braces (the substitution itself
	# happens outside this file - confirm in the evaluation module).
	# main() passes this template to evaluation_system.evaluate_rag().
	prompt_template = """
	You are an expert evaluator for a Youtube transcript assistant.
	Your task is to analyze the relevance of the generated answer to the given question.
	Based on the relevance of the generated answer, you will classify it
	as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

	Here is the data for evaluation:

	Question: {question}
	Generated Answer: {answer_llm}

	Please analyze the content and context of the generated answer in relation to the question
	and provide your evaluation in the following JSON format:

	{{
	"Relevance": "NON_RELEVANT",
	"Explanation": "Your explanation here"
	}}

	OR

	{{
	"Relevance": "PARTLY_RELEVANT",
	"Explanation": "Your explanation here"
	}}

	OR

	{{
	"Relevance": "RELEVANT",
	"Explanation": "Your explanation here"
	}}

	Ensure your response is a valid JSON object with these exact keys and one of the three exact values for "Relevance".
	Do not include any text outside of this JSON object.
	"""

	def _metadata_count(metadata, key):
	    """Return ``metadata[key]`` as an int; missing, ``None`` or non-numeric values count as 0."""
	    try:
	        return int(metadata.get(key) or 0)
	    except (TypeError, ValueError):
	        return 0


	def process_single_video(db_handler, data_processor, video_id, embedding_model):
	    """Fetch, store and index the transcript of a single video.

	    The steps are: reuse an index already registered for the video, otherwise
	    fetch the transcript, process it, store the video row, build the search
	    index and register that index in the database.

	    Args:
	        db_handler: Database facade used to look up and register videos,
	            embedding models and indices.
	        data_processor: Object providing ``process_transcript`` and
	            ``build_index``.
	        video_id: YouTube video ID.
	        embedding_model: Embedding model name; it becomes part of the index
	            name and is registered alongside the index.

	    Returns:
	        The index name on success (or the already registered one), ``None``
	        when any step fails. Failures are logged and, except for a transcript
	        processing failure, also shown in the UI.
	    """
	    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
	    if existing_index:
	        # NOTE(review): the lookup is keyed by video ID only, so the index
	        # found here may have been built with a different embedding model
	        # than the one named in this message - confirm against the DB layer.
	        logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
	        return existing_index

	    transcript_data = get_transcript(video_id)
	    if transcript_data is None:
	        logger.error(f"Failed to retrieve transcript for video {video_id}")
	        st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
	        return None

	    # Process the transcript
	    processed_data = data_processor.process_transcript(video_id, transcript_data)
	    if processed_data is None:
	        logger.error(f"Failed to process transcript for video {video_id}")
	        return None

	    # Prepare video data for database insertion. The counters go through
	    # _metadata_count because a key that is present with a None value would
	    # make a plain int(...) raise TypeError.
	    metadata = transcript_data['metadata']
	    video_data = {
	        'video_id': video_id,
	        'title': metadata.get('title', 'Unknown Title'),
	        'author': metadata.get('author', 'Unknown Author'),
	        'upload_date': metadata.get('upload_date', 'Unknown Date'),
	        'view_count': _metadata_count(metadata, 'view_count'),
	        'like_count': _metadata_count(metadata, 'like_count'),
	        'comment_count': _metadata_count(metadata, 'comment_count'),
	        'video_duration': metadata.get('duration', 'Unknown Duration'),
	        'transcript_content': processed_data['content'],
	    }

	    try:
	        db_handler.add_video(video_data)
	    except Exception as e:
	        logger.error(f"Error adding video to database: {str(e)}")
	        st.error(f"Error adding video {video_id} to database: {str(e)}")
	        return None

	    index_name = f"video_{video_id}_{embedding_model}".lower()
	    try:
	        index_name = data_processor.build_index(index_name)
	        logger.info(f"Successfully built index: {index_name}")
	    except Exception as e:
	        logger.error(f"Error building index: {str(e)}")
	        st.error(f"Error building index for video {video_id}: {str(e)}")
	        return None

	    # Register the index. Guarded like the steps above so a database error is
	    # reported in the UI instead of aborting the whole script run.
	    try:
	        embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")
	        video_db_record = db_handler.get_video_by_youtube_id(video_id)
	        if video_db_record is not None:
	            db_handler.add_elasticsearch_index(video_db_record[0], index_name, embedding_model_id)
	    except Exception as e:
	        logger.error(f"Error registering index for video {video_id}: {str(e)}")
	        st.error(f"Error registering index for video {video_id}: {str(e)}")
	        return None

	    if video_db_record is None:
	        logger.error(f"Failed to retrieve video record from database for video {video_id}")
	        st.error(f"Failed to retrieve video record from database for video {video_id}")
	        return None

	    logger.info(f"Processed and indexed transcript for video {video_id}")
	    st.success(f"Successfully processed and indexed transcript for video {video_id}")
	    return index_name

	def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
	    """Run ``process_single_video`` over ``video_ids`` in order.

	    Returns:
	        The list of truthy index names returned for the individual videos;
	        videos whose processing returned a falsy value are left out. The
	        number of collected indices is logged and shown in the UI.
	    """
	    outcomes = (
	        process_single_video(db_handler, data_processor, current_id, embedding_model)
	        for current_id in video_ids
	    )
	    built_indices = [name for name in outcomes if name]

	    summary = f"Processed and indexed transcripts for {len(built_indices)} videos"
	    logger.info(summary)
	    st.success(summary)
	    return built_indices

	def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
	    """Make sure an index is registered for ``video_id``, processing it if needed.

	    Returns:
	        ``True`` when an index already exists or processing produced one,
	        ``False`` (after showing an error in the UI) when processing failed.
	    """
	    if db_handler.get_elasticsearch_index_by_youtube_id(video_id):
	        return True

	    st.warning(f"Video {video_id} has not been processed yet. Processing now...")
	    if process_single_video(db_handler, data_processor, video_id, embedding_model):
	        return True

	    st.error(f"Failed to process video {video_id}. Please check the logs for more information.")
	    return False

	def _rerun_app():
	    """Restart the script run, preferring ``st.rerun`` over the removed ``st.experimental_rerun``."""
	    rerun = getattr(st, "rerun", None) or st.experimental_rerun
	    rerun()


	def _video_table(videos):
	    """Wrap the rows returned by ``db_handler.get_all_videos()`` in a DataFrame."""
	    return pd.DataFrame(videos, columns=['youtube_id', 'title', 'channel_name', 'upload_date'])


	def _title_formatter(video_df):
	    """Return a selectbox ``format_func`` mapping a YouTube ID to its (first) title."""
	    titles = video_df.drop_duplicates('youtube_id').set_index('youtube_id')['title'].to_dict()
	    return lambda youtube_id: titles.get(youtube_id, youtube_id)


	def _select_video(db_handler):
	    """Render the video picker; return ``(selected_video_id, index_name)``.

	    Both values are ``None`` when no videos are stored yet, so later code can
	    test them safely instead of hitting an unbound local.
	    """
	    st.subheader("Select a Video")
	    videos = db_handler.get_all_videos()
	    if not videos:
	        st.warning("No videos available. Please process some videos first.")
	        return None, None

	    video_df = _video_table(videos)

	    # dropna(): sorted() cannot order None against strings.
	    channels = sorted(video_df['channel_name'].dropna().unique())
	    selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)
	    if selected_channel != "All":
	        video_df = video_df[video_df['channel_name'] == selected_channel]

	    st.dataframe(video_df)
	    selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=_title_formatter(video_df))

	    index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id)
	    if index_name:
	        st.success(f"Using index: {index_name}")
	    else:
	        st.warning("No index found for the selected video and embedding model. The index will be built when you search.")
	    return selected_video_id, index_name


	def _process_new_video(db_handler, data_processor, embedding_model):
	    """Render the "Process New Video" form and process the input on click."""
	    st.subheader("Process New Video")
	    input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
	    input_value = (st.text_input("Enter the URL or ID:") or "").strip()

	    if not st.button("Process"):
	        return
	    if not input_value:
	        st.error("Please enter a URL or ID before processing.")
	        return

	    with st.spinner("Processing..."):
	        data_processor.set_embedding_model(embedding_model)

	        if input_type == "Channel URL":
	            channel_videos = get_channel_videos(input_value)
	            if not channel_videos:
	                st.error("Failed to retrieve videos from the channel")
	                return
	            index_names = process_multiple_videos(db_handler, data_processor, [video['video_id'] for video in channel_videos], embedding_model)
	            if not index_names:
	                st.error("Failed to process any videos from the channel")
	            else:
	                st.success(f"Successfully processed {len(index_names)} videos from the channel")
	            return

	        if input_type == "Video URL":
	            video_id = extract_video_id(input_value)
	            if not video_id:
	                st.error("Failed to extract video ID from the URL")
	                return
	        else:
	            # "YouTube ID": the input already is the video ID.
	            video_id = input_value

	        if process_single_video(db_handler, data_processor, video_id, embedding_model) is None:
	            st.error(f"Failed to process video {video_id}")
	        else:
	            st.success(f"Successfully processed video {video_id}")


	def _run_search(db_handler, data_processor, rag_system, query_rewriter, embedding_model, selected_video_id, index_name):
	    """Render the query form and, on click, rewrite the query and run the RAG search."""
	    st.subheader("Query the RAG System")
	    query = st.text_input("Enter your query:")
	    rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
	    search_method = st.radio("Search method:", ["Hybrid", "Text-only", "Embedding-only"])

	    if not st.button("Search"):
	        return
	    if not selected_video_id:
	        st.error("Please select a video before searching.")
	        return
	    if not query or not query.strip():
	        st.error("Please enter a query before searching.")
	        return

	    with st.spinner("Searching..."):
	        rewritten_query = query
	        rewrite_prompt = ""
	        if rewrite_method == "Chain of Thought":
	            rewritten_query, rewrite_prompt = query_rewriter.rewrite_cot(query)
	        elif rewrite_method == "ReAct":
	            rewritten_query, rewrite_prompt = query_rewriter.rewrite_react(query)

	        st.subheader("Query Processing")
	        st.write("Original query:", query)
	        if rewrite_method != "None":
	            st.write("Rewritten query:", rewritten_query)
	            st.text_area("Query rewriting prompt:", rewrite_prompt, height=100)
	            if rewritten_query == query:
	                st.warning("Query rewriting failed. Using original query.")

	        search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
	        try:
	            if not index_name:
	                st.info("Building index for the selected video...")
	                # Same preparation as the "Process" button: select the
	                # embedding model the index name is going to advertise.
	                data_processor.set_embedding_model(embedding_model)
	                index_name = process_single_video(db_handler, data_processor, selected_video_id, embedding_model)
	                if not index_name:
	                    st.error("Failed to build index for the selected video.")
	                    # Leaves only this helper, so the other tabs still render.
	                    return

	            response, final_prompt = rag_system.query(rewritten_query, search_method=search_method_map[search_method], index_name=index_name)

	            st.subheader("RAG System Prompt")
	            if final_prompt:
	                st.text_area("Prompt sent to LLM:", final_prompt, height=300)
	            else:
	                st.warning("No prompt was generated. This might indicate an issue with the RAG system.")

	            st.subheader("Response")
	            if response:
	                st.write(response)
	            else:
	                st.error("No response generated. Please try again or check the system logs for errors.")
	        except ValueError as e:
	            logger.error(f"Error during search: {str(e)}")
	            st.error(f"Error during search: {str(e)}")
	        except Exception as e:
	            logger.error(f"An unexpected error occurred: {str(e)}")
	            st.error(f"An unexpected error occurred: {str(e)}")


	def _render_rag_tab(db_handler, data_processor, rag_system, query_rewriter):
	    """Render the "RAG System" tab and return the selected embedding model name."""
	    st.header("RAG System")
	    embedding_model = st.selectbox("Select embedding model:", ["multi-qa-MiniLM-L6-cos-v1", "all-mpnet-base-v2"])

	    selected_video_id, index_name = _select_video(db_handler)
	    _process_new_video(db_handler, data_processor, embedding_model)
	    _run_search(db_handler, data_processor, rag_system, query_rewriter, embedding_model, selected_video_id, index_name)
	    return embedding_model


	def _show_ground_truth(ground_truth_df, label, file_name):
	    """Display a ground-truth DataFrame together with a CSV download button."""
	    st.dataframe(ground_truth_df)
	    st.download_button(
	        label=label,
	        data=ground_truth_df.to_csv(index=False),
	        file_name=file_name,
	        mime="text/csv",
	    )


	def _render_ground_truth_tab(db_handler, data_processor, embedding_model):
	    """Render the "Ground Truth Generation" tab."""
	    st.header("Ground Truth Generation")

	    videos = db_handler.get_all_videos()
	    if not videos:
	        st.warning("No videos available. Please process some videos first.")
	        return

	    video_df = _video_table(videos)
	    st.dataframe(video_df)
	    selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
	                                     format_func=_title_formatter(video_df),
	                                     key="gt_video_select")

	    if st.button("Generate Ground Truth for Selected Video"):
	        # Videos not indexed yet are processed on the fly; select the model
	        # first, as the "Process" button does.
	        data_processor.set_embedding_model(embedding_model)
	        if ensure_video_processed(db_handler, data_processor, selected_video_id, embedding_model):
	            with st.spinner("Generating ground truth..."):
	                ground_truth_df = generate_ground_truth(db_handler, data_processor, selected_video_id)
	            if ground_truth_df is not None:
	                _show_ground_truth(ground_truth_df, "Download Ground Truth CSV", f"ground_truth_{selected_video_id}.csv")

	    if st.button("Generate Ground Truth for All Videos"):
	        with st.spinner("Processing videos and generating ground truth..."):
	            data_processor.set_embedding_model(embedding_model)
	            for video_id in video_df['youtube_id']:
	                ensure_video_processed(db_handler, data_processor, video_id, embedding_model)
	            ground_truth_df = generate_ground_truth_for_all_videos(db_handler, data_processor)
	        if ground_truth_df is not None:
	            _show_ground_truth(ground_truth_df, "Download Ground Truth CSV (All Videos)", "ground_truth_all_videos.csv")


	def _render_evaluation_tab(rag_system, evaluation_system):
	    """Render the "Evaluation" tab."""
	    st.header("RAG Evaluation")

	    ground_truth_path = 'data/ground-truth-retrieval.csv'
	    try:
	        ground_truth_df = pd.read_csv(ground_truth_path)
	    except (FileNotFoundError, pd.errors.EmptyDataError):
	        ground_truth_df = None
	    # A CSV without rows is as unusable as a missing one: the sample-size
	    # input below would be created with min_value=1 and max_value=0.
	    ground_truth_available = ground_truth_df is not None and not ground_truth_df.empty

	    if ground_truth_available:
	        st.write("Evaluation will be run on the following ground truth data:")
	        st.dataframe(ground_truth_df)
	        st.info("The evaluation will use this ground truth data to assess the performance of the RAG system.")

	        row_count = len(ground_truth_df)
	        sample_size = st.number_input("Enter sample size for evaluation:", min_value=1, max_value=row_count, value=min(200, row_count))

	        if st.button("Run Evaluation"):
	            with st.spinner("Running evaluation..."):
	                evaluation_results = evaluation_system.evaluate_rag(rag_system, ground_truth_path, sample_size, prompt_template)
	            if evaluation_results:
	                st.write("Evaluation Results:")
	                st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))
	        return

	    st.warning("No ground truth data available. Please generate ground truth data first.")
	    st.button("Run Evaluation", disabled=True)

	    st.subheader("Generate Ground Truth")
	    st.write("You need to generate ground truth data before running the evaluation.")
	    if st.button("Go to Ground Truth Generation"):
	        # NOTE(review): nothing in this file reads st.session_state.active_tab,
	        # so this presumably does not switch the visible tab - confirm.
	        st.session_state.active_tab = "Ground Truth Generation"
	        _rerun_app()


	def main():
	    """Entry point of the Streamlit app: check the API key, build the components, render the tabs."""
	    st.title("YouTube Transcript RAG System")

	    check_api_key()

	    components = init_components()
	    # init_components() reports failure as a tuple of five None values, which
	    # is truthy as a tuple; the elements have to be inspected.
	    if not components or any(component is None for component in components):
	        st.stop()
	    db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components

	    tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])

	    with tab1:
	        embedding_model = _render_rag_tab(db_handler, data_processor, rag_system, query_rewriter)

	    with tab2:
	        _render_ground_truth_tab(db_handler, data_processor, embedding_model)

	    with tab3:
	        _render_evaluation_tab(rag_system, evaluation_system)

	# Script entry point: start the app only when the YouTube API initializes.
	if __name__ == "__main__":
	    api_ready = initialize_youtube_api()
	    if api_ready:
	        main()
	    else:
	        logger.error("Failed to initialize YouTube API. Exiting.")
	        sys.exit(1)