Spaces:

ganesh3
/

rag-youtube-assistant

Running

File size: 18,475 Bytes

dbd33b2
 
25b2b2b
dbd33b2
 
 
 
 
25b2b2b
dbd33b2
 
25b2b2b
507c938
 
25b2b2b
507c938
dbd33b2
a61b32e
 
 
dbd33b2
 
507c938
 
 
 
 
 
 
 
 
 
 
 
 
 
a61b32e
25b2b2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbd33b2
25b2b2b
 
 
 
 
 
dbd33b2
25b2b2b
dbd33b2
25b2b2b
 
dbd33b2
25b2b2b
 
dbd33b2
25b2b2b
 
 
 
dbd33b2
25b2b2b
507c938
25b2b2b
 
 
 
dbd33b2
25b2b2b
dbd33b2
25b2b2b
 
 
 
507c938
25b2b2b
 
 
507c938
25b2b2b
a61b32e
507c938
 
 
 
 
 
 
25b2b2b
507c938
 
a61b32e
 
 
 
 
 
 
507c938
 
 
 
 
 
 
 
a61b32e
 
507c938
a61b32e
507c938
 
 
 
25b2b2b
507c938
 
 
 
 
 
 
 
25b2b2b
507c938
 
 
 
 
 
 
25b2b2b
507c938
25b2b2b
507c938
 
 
 
25b2b2b
507c938
 
25b2b2b
507c938
 
25b2b2b
507c938
 
 
 
 
 
25b2b2b
a61b32e
25b2b2b
 
 
 
 
 
 
 
dbd33b2
 
 
25b2b2b
507c938
25b2b2b
 
 
 
 
 
dbd33b2
 
 
 
507c938
25b2b2b
 
507c938
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a61b32e
507c938
 
 
 
 
 
 
dbd33b2
 
507c938
dbd33b2
 
25b2b2b
dbd33b2
 
 
25b2b2b
507c938
 
 
 
dbd33b2
 
 
 
 
25b2b2b
507c938
 
 
 
dbd33b2
 
 
25b2b2b
507c938
 
 
 
 
dbd33b2
 
 
 
 
 
507c938
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25b2b2b
507c938
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbd33b2
 
 
507c938
25b2b2b
 
 
507c938
25b2b2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507c938
 
 
 
25b2b2b
507c938
25b2b2b
507c938
 
dbd33b2
 
 
 
507c938
 
 
 
 
dbd33b2
507c938
 
 
 
dbd33b2
507c938
 
 
 
25b2b2b
507c938
 
 
 
 
 
 
 
 
 
 
 
 
dbd33b2
 
25b2b2b
 
 
dbd33b2

import streamlit as st
import pandas as pd
from transcript_extractor import get_transcript, get_youtube_client, extract_video_id, get_channel_videos, test_api_key, initialize_youtube_api
from data_processor import DataProcessor
from database import DatabaseHandler
from rag import RAGSystem
from query_rewriter import QueryRewriter
from evaluation import EvaluationSystem
from generate_ground_truth import generate_ground_truth, generate_ground_truth_for_all_videos
from sentence_transformers import SentenceTransformer
import os
import sys
import logging

# Configure logging once. ``logging.basicConfig`` does nothing when the root
# logger already has handlers, so the INFO configuration that used to follow
# this call could never take effect; it has been removed as dead code.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

@st.cache_resource
def _build_components():
    """Construct the shared service objects once and cache them.

    Any exception raised by a constructor propagates to the caller. Because
    the function then never returns, nothing is stored in the resource cache
    and the next call retries the construction.
    """
    db_handler = DatabaseHandler()
    data_processor = DataProcessor()
    rag_system = RAGSystem(data_processor)
    query_rewriter = QueryRewriter()
    evaluation_system = EvaluationSystem(data_processor, db_handler)
    logger.info("Components initialized successfully")
    return db_handler, data_processor, rag_system, query_rewriter, evaluation_system


def init_components():
    """Return the application's service objects, reporting failures in the UI.

    Returns:
        A 5-tuple ``(db_handler, data_processor, rag_system, query_rewriter,
        evaluation_system)``. If any component fails to initialize, the error
        is logged and shown with ``st.error`` and a tuple of five ``None``
        values is returned instead.

    The caching lives on the private builder rather than on this function so
    that a failed initialization is not memoized: previously the all-``None``
    tuple was a normal return value and was cached, leaving the app broken
    until the cache was cleared manually.
    """
    try:
        return _build_components()
    except Exception as e:
        logger.error(f"Error initializing components: {str(e)}")
        st.error(f"Error initializing components: {str(e)}")
        st.error("Please check your configuration and ensure all services are running.")
        return None, None, None, None, None


def check_api_key():
    """Validate the YouTube API key and, if it is unusable, prompt for a new one.

    When ``test_api_key()`` succeeds a confirmation is shown and nothing else
    happens. Otherwise the user is asked for a key; once one is entered it is
    exported as ``YOUTUBE_API_KEY`` for this process, appended to ``.env``,
    the cached YouTube client is cleared, the key is re-tested, and the
    script is rerun.
    """
    if test_api_key():
        st.success("YouTube API key is valid and working.")
        return

    st.error("YouTube API key is invalid or not set. Please check your .env file.")
    # Mask the field: an API key is a secret and should not be echoed on screen.
    entered = st.text_input("Enter your YouTube API key:", type="password")
    # Drop stray whitespace/newlines from pasting so they do not end up in .env.
    new_api_key = (entered or "").strip()
    if not new_api_key:
        return

    os.environ['YOUTUBE_API_KEY'] = new_api_key
    with open('.env', 'a', encoding='utf-8') as f:
        f.write(f"\nYOUTUBE_API_KEY={new_api_key}")
    st.success("API key saved. Reinitializing YouTube client...")
    get_youtube_client.cache_clear()  # Clear the cache to force reinitialization
    if test_api_key():
        st.success("YouTube client reinitialized successfully.")
    else:
        st.error("Failed to reinitialize YouTube client. Please check your API key.")

    # ``st.experimental_rerun`` is deprecated and absent from recent Streamlit
    # releases; prefer ``st.rerun`` and fall back only on older versions.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()

# LLM-as-a-judge prompt template.
# It has two replacement fields, {question} and {answer_llm}. The braces of
# the JSON examples are doubled ({{ ... }}), which suggests the template is
# meant to be filled in with str.format -- TODO confirm in EvaluationSystem.
# main() hands this template to evaluation_system.evaluate_rag.
prompt_template = """
You are an expert evaluator for a Youtube transcript assistant.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in the following JSON format:

{{
  "Relevance": "NON_RELEVANT",
  "Explanation": "Your explanation here"
}}

OR

{{
  "Relevance": "PARTLY_RELEVANT",
  "Explanation": "Your explanation here"
}}

OR

{{
  "Relevance": "RELEVANT",
  "Explanation": "Your explanation here"
}}

Ensure your response is a valid JSON object with these exact keys and one of the three exact values for "Relevance".
Do not include any text outside of this JSON object.
"""

def _coerce_count(value):
    """Return *value* as an int, or 0 when it is missing or not numeric."""
    try:
        return int(value)
    except (TypeError, ValueError):
        return 0


def process_single_video(db_handler, data_processor, video_id, embedding_model):
    """Fetch, store and index the transcript of one video.

    Args:
        db_handler: Database facade used to look up and register videos,
            embedding models and Elasticsearch indices.
        data_processor: Object that processes transcripts and builds indices.
        video_id: YouTube ID of the video to process.
        embedding_model: Name of the embedding model; it becomes part of the
            index name and is registered in the database.

    Returns:
        The name of the index for the video (an existing one if the database
        already knows an index for this video), or ``None`` if any step
        fails. Failures are logged and, for most steps, shown via ``st.error``.
    """
    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
    if existing_index:
        # NOTE(review): the lookup is by video ID only, so the index found here
        # may have been built with a different embedding model than the one
        # named in the log message -- confirm against DatabaseHandler.
        logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
        return existing_index

    transcript_data = get_transcript(video_id)
    if transcript_data is None:
        logger.error(f"Failed to retrieve transcript for video {video_id}")
        st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
        return None

    # Process the transcript
    processed_data = data_processor.process_transcript(video_id, transcript_data)
    if processed_data is None:
        logger.error(f"Failed to process transcript for video {video_id}")
        return None

    # Prepare video data for database insertion. The counters are coerced
    # defensively: a present-but-None or non-numeric value would otherwise make
    # int() raise and abort the whole run.
    metadata = transcript_data['metadata']
    video_data = {
        'video_id': video_id,
        'title': metadata.get('title', 'Unknown Title'),
        'author': metadata.get('author', 'Unknown Author'),
        'upload_date': metadata.get('upload_date', 'Unknown Date'),
        'view_count': _coerce_count(metadata.get('view_count', 0)),
        'like_count': _coerce_count(metadata.get('like_count', 0)),
        'comment_count': _coerce_count(metadata.get('comment_count', 0)),
        'video_duration': metadata.get('duration', 'Unknown Duration'),
        'transcript_content': processed_data['content'],  # store the processed transcript text with the video
    }

    try:
        db_handler.add_video(video_data)
    except Exception as e:
        logger.error(f"Error adding video to database: {str(e)}")
        st.error(f"Error adding video {video_id} to database: {str(e)}")
        return None

    index_name = f"video_{video_id}_{embedding_model}".lower()
    try:
        # build_index returns the name actually used, which replaces ours.
        index_name = data_processor.build_index(index_name)
        logger.info(f"Successfully built index: {index_name}")
    except Exception as e:
        logger.error(f"Error building index: {str(e)}")
        st.error(f"Error building index for video {video_id}: {str(e)}")
        return None

    # Register the model and the index. These calls are guarded like the other
    # external calls above so a database error is reported as a failure
    # (None) instead of escaping to the caller as an unhandled exception.
    try:
        embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")

        video_db_record = db_handler.get_video_by_youtube_id(video_id)
        if video_db_record is None:
            logger.error(f"Failed to retrieve video record from database for video {video_id}")
            st.error(f"Failed to retrieve video record from database for video {video_id}")
            return None
        video_db_id = video_db_record[0]

        db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)
    except Exception as e:
        logger.error(f"Error registering index in database: {str(e)}")
        st.error(f"Error registering index for video {video_id}: {str(e)}")
        return None

    logger.info(f"Processed and indexed transcript for video {video_id}")
    st.success(f"Successfully processed and indexed transcript for video {video_id}")
    return index_name

def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
    """Process each video in *video_ids* in order and collect the index names.

    Every ID is passed to ``process_single_video``; falsy results are left
    out. A summary with the number of collected indices is logged and shown
    in the UI.

    Returns:
        The list of truthy index names, in input order.
    """
    # Lazy generator: videos are still processed one after another, in order,
    # as the list comprehension below consumes it.
    outcomes = (
        process_single_video(db_handler, data_processor, current_id, embedding_model)
        for current_id in video_ids
    )
    indices = [name for name in outcomes if name]

    summary = f"Processed and indexed transcripts for {len(indices)} videos"
    logger.info(summary)
    st.success(summary)
    return indices

def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
    """Make sure an index exists for *video_id*, processing the video if needed.

    Returns:
        ``True`` if the database already lists an index for the video or the
        on-demand processing yields one; ``False`` (after showing an error)
        otherwise.
    """
    # Fast path: an index is already registered for this video.
    if db_handler.get_elasticsearch_index_by_youtube_id(video_id):
        return True

    st.warning(f"Video {video_id} has not been processed yet. Processing now...")
    built_index = process_single_video(db_handler, data_processor, video_id, embedding_model)
    if built_index:
        return True

    st.error(f"Failed to process video {video_id}. Please check the logs for more information.")
    return False

# Column names applied to the rows returned by db_handler.get_all_videos().
_VIDEO_COLUMNS = ['youtube_id', 'title', 'channel_name', 'upload_date']
# CSV file the evaluation tab reads its ground-truth questions from.
_GROUND_TRUTH_PATH = 'data/ground-truth-retrieval.csv'
# UI label -> identifier passed to rag_system.query as ``search_method``.
_SEARCH_METHODS = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}


def _rerun_app():
    """Restart the script run with whichever rerun API this Streamlit offers."""
    # ``st.experimental_rerun`` is deprecated and missing from recent releases.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def _title_lookup(video_df):
    """Map each YouTube ID in *video_df* to the title of its first row."""
    first_rows = video_df.drop_duplicates(subset='youtube_id')
    return dict(zip(first_rows['youtube_id'], first_rows['title']))


def _load_ground_truth(path):
    """Return the ground-truth DataFrame at *path*, or None if missing or empty."""
    try:
        ground_truth_df = pd.read_csv(path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return None
    # A file with a header but no rows cannot be sampled from either.
    if len(ground_truth_df) == 0:
        return None
    return ground_truth_df


def _process_input(db_handler, data_processor, embedding_model, input_type, input_value):
    """Process the video(s) named in the "Process New Video" form and report the outcome."""
    input_value = input_value.strip()
    if not input_value:
        st.error("Please enter a URL or ID.")
        return

    data_processor.set_embedding_model(embedding_model)

    if input_type == "Channel URL":
        channel_videos = get_channel_videos(input_value)
        if not channel_videos:
            st.error("Failed to retrieve videos from the channel")
            return
        index_names = process_multiple_videos(db_handler, data_processor, [video['video_id'] for video in channel_videos], embedding_model)
        if not index_names:
            st.error("Failed to process any videos from the channel")
        else:
            st.success(f"Successfully processed {len(index_names)} videos from the channel")
        return

    if input_type == "Video URL":
        video_id = extract_video_id(input_value)
        if not video_id:
            st.error("Failed to extract video ID from the URL")
            return
    else:
        # "YouTube ID": the input is used as the ID as-is.
        video_id = input_value

    if process_single_video(db_handler, data_processor, video_id, embedding_model) is None:
        st.error(f"Failed to process video {video_id}")
    else:
        st.success(f"Successfully processed video {video_id}")


def _run_search(db_handler, data_processor, rag_system, query_rewriter, *,
                query, rewrite_method, search_method,
                selected_video_id, index_name, embedding_model):
    """Rewrite the query if requested, query the RAG system and render the result."""
    rewritten_query = query
    rewrite_prompt = ""
    if rewrite_method == "Chain of Thought":
        rewritten_query, rewrite_prompt = query_rewriter.rewrite_cot(query)
    elif rewrite_method == "ReAct":
        rewritten_query, rewrite_prompt = query_rewriter.rewrite_react(query)

    st.subheader("Query Processing")
    st.write("Original query:", query)
    if rewrite_method != "None":
        st.write("Rewritten query:", rewritten_query)
        st.text_area("Query rewriting prompt:", rewrite_prompt, height=100)
        if rewritten_query == query:
            st.warning("Query rewriting failed. Using original query.")

    try:
        if not index_name:
            st.info("Building index for the selected video...")
            # NOTE(review): unlike the "Process" button, this path does not call
            # data_processor.set_embedding_model first -- confirm that the
            # processor is already configured for `embedding_model` here.
            index_name = process_single_video(db_handler, data_processor, selected_video_id, embedding_model)
            if not index_name:
                st.error("Failed to build index for the selected video.")
                # Leaves only this helper; the remaining tabs still render.
                return

        response, final_prompt = rag_system.query(rewritten_query, search_method=_SEARCH_METHODS[search_method], index_name=index_name)

        st.subheader("RAG System Prompt")
        if final_prompt:
            st.text_area("Prompt sent to LLM:", final_prompt, height=300)
        else:
            st.warning("No prompt was generated. This might indicate an issue with the RAG system.")

        st.subheader("Response")
        if response:
            st.write(response)
        else:
            st.error("No response generated. Please try again or check the system logs for errors.")
    except ValueError as e:
        logger.error(f"Error during search: {str(e)}")
        st.error(f"Error during search: {str(e)}")
    except Exception as e:
        logger.error(f"An unexpected error occurred: {str(e)}")
        st.error(f"An unexpected error occurred: {str(e)}")


def _render_rag_tab(db_handler, data_processor, rag_system, query_rewriter):
    """Render the "RAG System" tab and return the selected embedding model."""
    st.header("RAG System")

    embedding_model = st.selectbox("Select embedding model:", ["multi-qa-MiniLM-L6-cos-v1", "all-mpnet-base-v2"])

    # Bound up front so the Search handler can test them even when the
    # database holds no videos (previously this raised UnboundLocalError).
    selected_video_id = None
    index_name = None

    st.subheader("Select a Video")
    videos = db_handler.get_all_videos()
    if not videos:
        st.warning("No videos available. Please process some videos first.")
    else:
        video_df = pd.DataFrame(videos, columns=_VIDEO_COLUMNS)

        channels = sorted(video_df['channel_name'].unique())
        selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)

        if selected_channel != "All":
            video_df = video_df[video_df['channel_name'] == selected_channel]

        st.dataframe(video_df)
        # One dict lookup per option instead of filtering the frame each time.
        titles = _title_lookup(video_df)
        selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=titles.get)

        index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id)

        if index_name:
            st.success(f"Using index: {index_name}")
        else:
            st.warning("No index found for the selected video and embedding model. The index will be built when you search.")

    st.subheader("Process New Video")
    input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
    input_value = st.text_input("Enter the URL or ID:")

    if st.button("Process"):
        with st.spinner("Processing..."):
            _process_input(db_handler, data_processor, embedding_model, input_type, input_value)

    st.subheader("Query the RAG System")
    query = st.text_input("Enter your query:")
    rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
    search_method = st.radio("Search method:", ["Hybrid", "Text-only", "Embedding-only"])

    if st.button("Search"):
        if not selected_video_id:
            st.error("Please select a video before searching.")
        elif not query.strip():
            st.error("Please enter a query before searching.")
        else:
            with st.spinner("Searching..."):
                _run_search(
                    db_handler, data_processor, rag_system, query_rewriter,
                    query=query,
                    rewrite_method=rewrite_method,
                    search_method=search_method,
                    selected_video_id=selected_video_id,
                    index_name=index_name,
                    embedding_model=embedding_model,
                )

    return embedding_model


def _render_ground_truth_tab(db_handler, data_processor, embedding_model):
    """Render the "Ground Truth Generation" tab for one video or all videos."""
    st.header("Ground Truth Generation")

    videos = db_handler.get_all_videos()
    if not videos:
        st.warning("No videos available. Please process some videos first.")
        return

    video_df = pd.DataFrame(videos, columns=_VIDEO_COLUMNS)

    st.dataframe(video_df)
    titles = _title_lookup(video_df)
    selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
                                     format_func=titles.get,
                                     key="gt_video_select")

    if st.button("Generate Ground Truth for Selected Video"):
        if ensure_video_processed(db_handler, data_processor, selected_video_id, embedding_model):
            with st.spinner("Generating ground truth..."):
                ground_truth_df = generate_ground_truth(db_handler, data_processor, selected_video_id)
                if ground_truth_df is not None:
                    st.dataframe(ground_truth_df)
                    csv = ground_truth_df.to_csv(index=False)
                    st.download_button(
                        label="Download Ground Truth CSV",
                        data=csv,
                        file_name=f"ground_truth_{selected_video_id}.csv",
                        mime="text/csv",
                    )

    if st.button("Generate Ground Truth for All Videos"):
        with st.spinner("Processing videos and generating ground truth..."):
            for video_id in video_df['youtube_id']:
                ensure_video_processed(db_handler, data_processor, video_id, embedding_model)
            ground_truth_df = generate_ground_truth_for_all_videos(db_handler, data_processor)
            if ground_truth_df is not None:
                st.dataframe(ground_truth_df)
                csv = ground_truth_df.to_csv(index=False)
                st.download_button(
                    label="Download Ground Truth CSV (All Videos)",
                    data=csv,
                    file_name="ground_truth_all_videos.csv",
                    mime="text/csv",
                )


def _render_evaluation_tab(rag_system, evaluation_system):
    """Render the "Evaluation" tab, running the LLM-as-a-judge evaluation on demand."""
    st.header("RAG Evaluation")

    ground_truth_df = _load_ground_truth(_GROUND_TRUTH_PATH)
    if ground_truth_df is None:
        st.warning("No ground truth data available. Please generate ground truth data first.")
        st.button("Run Evaluation", disabled=True)

        st.subheader("Generate Ground Truth")
        st.write("You need to generate ground truth data before running the evaluation.")
        if st.button("Go to Ground Truth Generation"):
            st.session_state.active_tab = "Ground Truth Generation"
            _rerun_app()
        return

    st.write("Evaluation will be run on the following ground truth data:")
    st.dataframe(ground_truth_df)
    st.info("The evaluation will use this ground truth data to assess the performance of the RAG system.")

    # _load_ground_truth guarantees at least one row, so min_value <= max_value.
    sample_size = st.number_input("Enter sample size for evaluation:", min_value=1, max_value=len(ground_truth_df), value=min(200, len(ground_truth_df)))

    if st.button("Run Evaluation"):
        with st.spinner("Running evaluation..."):
            evaluation_results = evaluation_system.evaluate_rag(rag_system, _GROUND_TRUTH_PATH, sample_size, prompt_template)
            if evaluation_results:
                st.write("Evaluation Results:")
                st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))


def main():
    """Build the Streamlit page: API-key check, component setup and the three tabs.

    The page is halted with ``st.stop()`` when the components could not be
    initialized. Otherwise the "RAG System", "Ground Truth Generation" and
    "Evaluation" tabs are rendered in that order; the embedding model chosen
    in the first tab is reused by the second.
    """
    st.title("YouTube Transcript RAG System")

    check_api_key()

    components = init_components()
    # init_components signals failure with a tuple of None values, and a
    # non-empty tuple is always truthy, so each element has to be checked.
    if not components or any(component is None for component in components):
        st.stop()
        return
    db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components

    tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])

    with tab1:
        embedding_model = _render_rag_tab(db_handler, data_processor, rag_system, query_rewriter)

    with tab2:
        _render_ground_truth_tab(db_handler, data_processor, embedding_model)

    with tab3:
        _render_evaluation_tab(rag_system, evaluation_system)

if __name__ == "__main__":
    # Start the UI only once the YouTube API could be initialized; otherwise
    # log the failure and exit with a non-zero status.
    api_ready = initialize_youtube_api()
    if api_ready:
        main()
    else:
        logger.error("Failed to initialize YouTube API. Exiting.")
        sys.exit(1)