File size: 18,475 Bytes
dbd33b2
 
25b2b2b
dbd33b2
 
 
 
 
25b2b2b
dbd33b2
 
25b2b2b
507c938
 
25b2b2b
507c938
dbd33b2
a61b32e
 
 
dbd33b2
 
507c938
 
 
 
 
 
 
 
 
 
 
 
 
 
a61b32e
25b2b2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbd33b2
25b2b2b
 
 
 
 
 
dbd33b2
25b2b2b
dbd33b2
25b2b2b
 
dbd33b2
25b2b2b
 
dbd33b2
25b2b2b
 
 
 
dbd33b2
25b2b2b
507c938
25b2b2b
 
 
 
dbd33b2
25b2b2b
dbd33b2
25b2b2b
 
 
 
507c938
25b2b2b
 
 
507c938
25b2b2b
a61b32e
507c938
 
 
 
 
 
 
25b2b2b
507c938
 
a61b32e
 
 
 
 
 
 
507c938
 
 
 
 
 
 
 
a61b32e
 
507c938
a61b32e
507c938
 
 
 
25b2b2b
507c938
 
 
 
 
 
 
 
25b2b2b
507c938
 
 
 
 
 
 
25b2b2b
507c938
25b2b2b
507c938
 
 
 
25b2b2b
507c938
 
25b2b2b
507c938
 
25b2b2b
507c938
 
 
 
 
 
25b2b2b
a61b32e
25b2b2b
 
 
 
 
 
 
 
dbd33b2
 
 
25b2b2b
507c938
25b2b2b
 
 
 
 
 
dbd33b2
 
 
 
507c938
25b2b2b
 
507c938
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a61b32e
507c938
 
 
 
 
 
 
dbd33b2
 
507c938
dbd33b2
 
25b2b2b
dbd33b2
 
 
25b2b2b
507c938
 
 
 
dbd33b2
 
 
 
 
25b2b2b
507c938
 
 
 
dbd33b2
 
 
25b2b2b
507c938
 
 
 
 
dbd33b2
 
 
 
 
 
507c938
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25b2b2b
507c938
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbd33b2
 
 
507c938
25b2b2b
 
 
507c938
25b2b2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507c938
 
 
 
25b2b2b
507c938
25b2b2b
507c938
 
dbd33b2
 
 
 
507c938
 
 
 
 
dbd33b2
507c938
 
 
 
dbd33b2
507c938
 
 
 
25b2b2b
507c938
 
 
 
 
 
 
 
 
 
 
 
 
dbd33b2
 
25b2b2b
 
 
dbd33b2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
import streamlit as st
import pandas as pd
from transcript_extractor import get_transcript, get_youtube_client, extract_video_id, get_channel_videos, test_api_key, initialize_youtube_api
from data_processor import DataProcessor
from database import DatabaseHandler
from rag import RAGSystem
from query_rewriter import QueryRewriter
from evaluation import EvaluationSystem
from generate_ground_truth import generate_ground_truth, generate_ground_truth_for_all_videos
from sentence_transformers import SentenceTransformer
import os
import sys
import logging

# Configure root logging exactly once. logging.basicConfig() does nothing when
# the root logger already has handlers, so a second call with another level
# would be silently ignored; a single call keeps the effective level explicit.
logging.basicConfig(level=logging.DEBUG)

# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

@st.cache_resource
def init_components():
    """Construct the application's service objects.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        A 5-tuple ``(db_handler, data_processor, rag_system, query_rewriter,
        evaluation_system)``. When any constructor raises, the error is logged
        and shown in the UI, and a tuple of five ``None`` values is returned
        instead (note that such a tuple is still truthy).
    """
    try:
        database = DatabaseHandler()
        processor = DataProcessor()
        rag = RAGSystem(processor)
        rewriter = QueryRewriter()
        evaluator = EvaluationSystem(processor, database)
        logger.info("Components initialized successfully")
    except Exception as e:
        failure = f"Error initializing components: {str(e)}"
        logger.error(failure)
        st.error(failure)
        st.error("Please check your configuration and ensure all services are running.")
        return (None,) * 5

    return database, processor, rag, rewriter, evaluator


def check_api_key():
    """Verify the YouTube API key and, if it is unusable, prompt for a new one.

    When ``test_api_key()`` succeeds a confirmation is shown and nothing else
    happens. Otherwise the user is asked for a key; once one is entered it is
    exported to ``os.environ``, appended to ``.env``, the cached YouTube
    client is cleared, the key is re-tested, and the Streamlit script is
    rerun.
    """
    if test_api_key():
        st.success("YouTube API key is valid and working.")
        return

    st.error("YouTube API key is invalid or not set. Please check your .env file.")
    # The key is a credential: mask it on screen, and drop stray whitespace
    # from copy/paste so it is not persisted into the environment or .env.
    entered_key = st.text_input("Enter your YouTube API key:", type="password")
    new_api_key = (entered_key or "").strip()
    if not new_api_key:
        return

    os.environ['YOUTUBE_API_KEY'] = new_api_key
    # Appended rather than rewritten, so earlier entries in .env are kept.
    with open('.env', 'a', encoding='utf-8') as f:
        f.write(f"\nYOUTUBE_API_KEY={new_api_key}")
    st.success("API key saved. Reinitializing YouTube client...")
    get_youtube_client.cache_clear()  # Clear the cache to force reinitialization
    if test_api_key():
        st.success("YouTube client reinitialized successfully.")
    else:
        st.error("Failed to reinitialize YouTube client. Please check your API key.")

    # st.experimental_rerun is the deprecated spelling of st.rerun; prefer the
    # current API and fall back only on Streamlit versions that lack it.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()

# LLM-as-a-judge prompt template.
# Contains the placeholders {question} and {answer_llm}; the braces of the
# JSON examples are doubled, so this is presumably filled in with str.format
# by the consumer (it is passed to EvaluationSystem.evaluate_rag in the
# evaluation tab) -- TODO confirm against evaluation.py.
prompt_template = """
You are an expert evaluator for a Youtube transcript assistant.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in the following JSON format:

{{
  "Relevance": "NON_RELEVANT",
  "Explanation": "Your explanation here"
}}

OR

{{
  "Relevance": "PARTLY_RELEVANT",
  "Explanation": "Your explanation here"
}}

OR

{{
  "Relevance": "RELEVANT",
  "Explanation": "Your explanation here"
}}

Ensure your response is a valid JSON object with these exact keys and one of the three exact values for "Relevance".
Do not include any text outside of this JSON object.
"""

def _metadata_int(metadata, key):
    """Return ``metadata[key]`` as an int, or 0 when it is missing, None or not numeric."""
    try:
        return int(metadata.get(key, 0))
    except (TypeError, ValueError):
        return 0


def process_single_video(db_handler, data_processor, video_id, embedding_model):
    """Fetch, store and index the transcript of one YouTube video.

    If the database already knows an Elasticsearch index for ``video_id`` that
    index name is returned immediately. Otherwise the transcript is fetched
    and processed, the video row is inserted, an index named
    ``video_<video_id>_<embedding_model>`` (lower-cased) is built, and the
    index is registered in the database.

    Args:
        db_handler: Database access object (``DatabaseHandler``).
        data_processor: Transcript processor / index builder (``DataProcessor``).
        video_id: YouTube video ID.
        embedding_model: Name of the embedding model, used in the index name
            and registered alongside the index.

    Returns:
        The index name on success, or ``None`` when any step fails. Failures
        are logged and reported in the Streamlit UI rather than raised.
    """
    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
    if existing_index:
        logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
        return existing_index

    transcript_data = get_transcript(video_id)
    if transcript_data is None:
        logger.error(f"Failed to retrieve transcript for video {video_id}")
        st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
        return None

    # Process the transcript
    processed_data = data_processor.process_transcript(video_id, transcript_data)
    if processed_data is None:
        logger.error(f"Failed to process transcript for video {video_id}")
        return None

    # Prepare video data for database insertion. Counts go through
    # _metadata_int so a None / non-numeric value cannot raise here and abort
    # a whole batch of videos.
    metadata = transcript_data['metadata']
    video_data = {
        'video_id': video_id,
        'title': metadata.get('title', 'Unknown Title'),
        'author': metadata.get('author', 'Unknown Author'),
        'upload_date': metadata.get('upload_date', 'Unknown Date'),
        'view_count': _metadata_int(metadata, 'view_count'),
        'like_count': _metadata_int(metadata, 'like_count'),
        'comment_count': _metadata_int(metadata, 'comment_count'),
        'video_duration': metadata.get('duration', 'Unknown Duration'),
        'transcript_content': processed_data['content'],
    }

    try:
        db_handler.add_video(video_data)
    except Exception as e:
        logger.error(f"Error adding video to database: {str(e)}")
        st.error(f"Error adding video {video_id} to database: {str(e)}")
        return None

    index_name = f"video_{video_id}_{embedding_model}".lower()
    try:
        # build_index returns the name actually used, which replaces ours.
        index_name = data_processor.build_index(index_name)
        logger.info(f"Successfully built index: {index_name}")
    except Exception as e:
        logger.error(f"Error building index: {str(e)}")
        st.error(f"Error building index for video {video_id}: {str(e)}")
        return None

    # Register the index in the database. Guarded like the steps above so a
    # database error is reported for this video instead of propagating.
    try:
        embedding_model_id = db_handler.add_embedding_model(embedding_model, "Description of the model")

        video_db_record = db_handler.get_video_by_youtube_id(video_id)
        if video_db_record is None:
            logger.error(f"Failed to retrieve video record from database for video {video_id}")
            st.error(f"Failed to retrieve video record from database for video {video_id}")
            return None
        video_db_id = video_db_record[0]  # first column of the row is used as the video's DB id

        db_handler.add_elasticsearch_index(video_db_id, index_name, embedding_model_id)
    except Exception as e:
        logger.error(f"Error registering index for video {video_id}: {str(e)}")
        st.error(f"Error registering index for video {video_id}: {str(e)}")
        return None

    logger.info(f"Processed and indexed transcript for video {video_id}")
    st.success(f"Successfully processed and indexed transcript for video {video_id}")
    return index_name

def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
    """Run ``process_single_video`` over ``video_ids`` in order.

    Returns:
        The list of index names for the videos that were processed
        successfully; videos whose processing returned a falsy value are
        left out. The resulting count is logged and shown in the UI.
    """
    outcomes = (
        process_single_video(db_handler, data_processor, current_id, embedding_model)
        for current_id in video_ids
    )
    # The generator is consumed lazily, so videos are still handled one by one.
    indices = [name for name in outcomes if name]

    summary = f"Processed and indexed transcripts for {len(indices)} videos"
    logger.info(summary)
    st.success(summary)
    return indices

def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
    """Make sure ``video_id`` has an index, processing the video if needed.

    Returns:
        ``True`` when an index already exists or was created just now,
        ``False`` when processing the video failed.
    """
    # Fast path: the database already knows an index for this video.
    if db_handler.get_elasticsearch_index_by_youtube_id(video_id):
        return True

    st.warning(f"Video {video_id} has not been processed yet. Processing now...")
    if process_single_video(db_handler, data_processor, video_id, embedding_model):
        return True

    st.error(f"Failed to process video {video_id}. Please check the logs for more information.")
    return False

# Ground-truth file read by the evaluation tab.
_GROUND_TRUTH_CSV = 'data/ground-truth-retrieval.csv'

# Columns used when tabulating the rows returned by db_handler.get_all_videos().
_VIDEO_COLUMNS = ['youtube_id', 'title', 'channel_name', 'upload_date']


def _rerun():
    """Restart the Streamlit script run, preferring st.rerun over the deprecated alias."""
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


def _process_input(db_handler, data_processor, input_type, input_value, embedding_model):
    """Process the video or channel entered in the "Process New Video" form."""
    data_processor.set_embedding_model(embedding_model)
    if input_type == "Video URL":
        video_id = extract_video_id(input_value)
        if not video_id:
            st.error("Failed to extract video ID from the URL")
            return
        index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
        if index_name is None:
            st.error(f"Failed to process video {video_id}")
        else:
            st.success(f"Successfully processed video {video_id}")
    elif input_type == "Channel URL":
        channel_videos = get_channel_videos(input_value)
        if not channel_videos:
            st.error("Failed to retrieve videos from the channel")
            return
        index_names = process_multiple_videos(db_handler, data_processor, [video['video_id'] for video in channel_videos], embedding_model)
        if not index_names:
            st.error("Failed to process any videos from the channel")
        else:
            st.success(f"Successfully processed {len(index_names)} videos from the channel")
    else:
        # "YouTube ID": the input is used as the video ID as-is.
        index_name = process_single_video(db_handler, data_processor, input_value, embedding_model)
        if index_name is None:
            st.error(f"Failed to process video {input_value}")
        else:
            st.success(f"Successfully processed video {input_value}")


def _run_search(db_handler, data_processor, rag_system, query_rewriter, query,
                rewrite_method, search_method, selected_video_id, index_name,
                embedding_model):
    """Rewrite the query if requested, query the RAG system and render the result."""
    rewritten_query = query
    rewrite_prompt = ""
    if rewrite_method == "Chain of Thought":
        rewritten_query, rewrite_prompt = query_rewriter.rewrite_cot(query)
    elif rewrite_method == "ReAct":
        rewritten_query, rewrite_prompt = query_rewriter.rewrite_react(query)

    st.subheader("Query Processing")
    st.write("Original query:", query)
    if rewrite_method != "None":
        st.write("Rewritten query:", rewritten_query)
        st.text_area("Query rewriting prompt:", rewrite_prompt, height=100)
        if rewritten_query == query:
            st.warning("Query rewriting failed. Using original query.")

    search_method_map = {"Hybrid": "hybrid", "Text-only": "text", "Embedding-only": "embedding"}
    try:
        if not index_name:
            st.info("Building index for the selected video...")
            # NOTE(review): unlike the "Process" form, this path does not call
            # data_processor.set_embedding_model first -- confirm whether it should.
            index_name = process_single_video(db_handler, data_processor, selected_video_id, embedding_model)
            if not index_name:
                st.error("Failed to build index for the selected video.")
                return

        response, final_prompt = rag_system.query(rewritten_query, search_method=search_method_map[search_method], index_name=index_name)

        st.subheader("RAG System Prompt")
        if final_prompt:
            st.text_area("Prompt sent to LLM:", final_prompt, height=300)
        else:
            st.warning("No prompt was generated. This might indicate an issue with the RAG system.")

        st.subheader("Response")
        if response:
            st.write(response)
        else:
            st.error("No response generated. Please try again or check the system logs for errors.")
    except ValueError as e:
        logger.error(f"Error during search: {str(e)}")
        st.error(f"Error during search: {str(e)}")
    except Exception as e:
        logger.error(f"An unexpected error occurred: {str(e)}")
        st.error(f"An unexpected error occurred: {str(e)}")


def _render_rag_tab(db_handler, data_processor, rag_system, query_rewriter):
    """Render the "RAG System" tab and return the selected embedding model name."""
    st.header("RAG System")

    embedding_model = st.selectbox("Select embedding model:", ["multi-qa-MiniLM-L6-cos-v1", "all-mpnet-base-v2"])

    st.subheader("Select a Video")
    # Bound up front so the Search handler below can test them even when no
    # videos exist yet (otherwise they would be unbound names).
    selected_video_id = None
    index_name = None
    videos = db_handler.get_all_videos()
    if not videos:
        st.warning("No videos available. Please process some videos first.")
    else:
        video_df = pd.DataFrame(videos, columns=_VIDEO_COLUMNS)

        channels = sorted(video_df['channel_name'].unique())
        selected_channel = st.selectbox("Filter by Channel", ["All"] + channels)

        if selected_channel != "All":
            video_df = video_df[video_df['channel_name'] == selected_channel]

        st.dataframe(video_df)
        selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])

        index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id)

        if index_name:
            st.success(f"Using index: {index_name}")
        else:
            st.warning("No index found for the selected video and embedding model. The index will be built when you search.")

    st.subheader("Process New Video")
    input_type = st.radio("Select input type:", ["Video URL", "Channel URL", "YouTube ID"])
    input_value = st.text_input("Enter the URL or ID:")

    if st.button("Process"):
        with st.spinner("Processing..."):
            _process_input(db_handler, data_processor, input_type, input_value, embedding_model)

    st.subheader("Query the RAG System")
    query = st.text_input("Enter your query:")
    rewrite_method = st.radio("Query rewriting method:", ["None", "Chain of Thought", "ReAct"])
    search_method = st.radio("Search method:", ["Hybrid", "Text-only", "Embedding-only"])

    if st.button("Search"):
        if not selected_video_id:
            st.error("Please select a video before searching.")
        elif not query.strip():
            # An empty query has nothing to retrieve against.
            st.error("Please enter a query before searching.")
        else:
            with st.spinner("Searching..."):
                _run_search(db_handler, data_processor, rag_system, query_rewriter,
                            query, rewrite_method, search_method,
                            selected_video_id, index_name, embedding_model)

    return embedding_model


def _render_ground_truth_tab(db_handler, data_processor, embedding_model):
    """Render the "Ground Truth Generation" tab."""
    st.header("Ground Truth Generation")

    videos = db_handler.get_all_videos()
    if not videos:
        st.warning("No videos available. Please process some videos first.")
        return

    video_df = pd.DataFrame(videos, columns=_VIDEO_COLUMNS)

    st.dataframe(video_df)
    selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(),
                                     format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0],
                                     key="gt_video_select")

    if st.button("Generate Ground Truth for Selected Video"):
        if ensure_video_processed(db_handler, data_processor, selected_video_id, embedding_model):
            with st.spinner("Generating ground truth..."):
                ground_truth_df = generate_ground_truth(db_handler, data_processor, selected_video_id)
                if ground_truth_df is not None:
                    st.dataframe(ground_truth_df)
                    csv = ground_truth_df.to_csv(index=False)
                    st.download_button(
                        label="Download Ground Truth CSV",
                        data=csv,
                        file_name=f"ground_truth_{selected_video_id}.csv",
                        mime="text/csv",
                    )
    if st.button("Generate Ground Truth for All Videos"):
        with st.spinner("Processing videos and generating ground truth..."):
            for video_id in video_df['youtube_id']:
                ensure_video_processed(db_handler, data_processor, video_id, embedding_model)
            ground_truth_df = generate_ground_truth_for_all_videos(db_handler, data_processor)
            if ground_truth_df is not None:
                st.dataframe(ground_truth_df)
                csv = ground_truth_df.to_csv(index=False)
                st.download_button(
                    label="Download Ground Truth CSV (All Videos)",
                    data=csv,
                    file_name="ground_truth_all_videos.csv",
                    mime="text/csv",
                )


def _render_evaluation_tab(rag_system, evaluation_system):
    """Render the "Evaluation" tab."""
    st.header("RAG Evaluation")

    try:
        ground_truth_df = pd.read_csv(_GROUND_TRUTH_CSV)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        ground_truth_df = None

    # A file with no rows is treated like a missing file: the sample-size
    # input below needs at least one row (min_value=1 <= max_value).
    if ground_truth_df is None or ground_truth_df.empty:
        st.warning("No ground truth data available. Please generate ground truth data first.")
        st.button("Run Evaluation", disabled=True)

        st.subheader("Generate Ground Truth")
        st.write("You need to generate ground truth data before running the evaluation.")
        if st.button("Go to Ground Truth Generation"):
            # NOTE(review): nothing visible in this file reads
            # st.session_state.active_tab, so this may not switch tabs.
            st.session_state.active_tab = "Ground Truth Generation"
            _rerun()
        return

    st.write("Evaluation will be run on the following ground truth data:")
    st.dataframe(ground_truth_df)
    st.info("The evaluation will use this ground truth data to assess the performance of the RAG system.")

    sample_size = st.number_input("Enter sample size for evaluation:", min_value=1, max_value=len(ground_truth_df), value=min(200, len(ground_truth_df)))

    if st.button("Run Evaluation"):
        with st.spinner("Running evaluation..."):
            evaluation_results = evaluation_system.evaluate_rag(rag_system, _GROUND_TRUTH_CSV, sample_size, prompt_template)
            if evaluation_results:
                st.write("Evaluation Results:")
                st.dataframe(pd.DataFrame(evaluation_results, columns=['Video ID', 'Question', 'Answer', 'Relevance', 'Explanation']))


def main():
    """Render the Streamlit application.

    Checks the YouTube API key, initializes the shared components, and draws
    the three tabs: RAG querying, ground-truth generation and evaluation.
    Rendering stops early when the components could not be initialized.
    """
    st.title("YouTube Transcript RAG System")

    check_api_key()

    components = init_components()
    # init_components reports failure as a tuple of Nones, which is truthy, so
    # each element has to be checked rather than the tuple itself.
    if not components or any(component is None for component in components):
        st.stop()
    db_handler, data_processor, rag_system, query_rewriter, evaluation_system = components

    tab1, tab2, tab3 = st.tabs(["RAG System", "Ground Truth Generation", "Evaluation"])

    with tab1:
        embedding_model = _render_rag_tab(db_handler, data_processor, rag_system, query_rewriter)

    with tab2:
        _render_ground_truth_tab(db_handler, data_processor, embedding_model)

    with tab3:
        _render_evaluation_tab(rag_system, evaluation_system)

if __name__ == "__main__":
    # Only start the UI once the YouTube API has been initialized; otherwise
    # log the failure and exit with a non-zero status.
    if initialize_youtube_api():
        main()
    else:
        logger.error("Failed to initialize YouTube API. Exiting.")
        sys.exit(1)