ganesh3 committed on
Commit a61b32e · 1 Parent(s): 25b2b2b

fourth commit

app/data_processor.py CHANGED
@@ -93,7 +93,13 @@ class DataProcessor:
             self.embeddings.append(embedding)
 
         logger.info(f"Processed transcript for video {video_id}")
-        return f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
+
+        # Return a dictionary with the processed content and other relevant information
+        return {
+            'content': cleaned_transcript,
+            'metadata': metadata,
+            'index_name': f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
+        }
 
     def build_index(self, index_name):
         if not self.documents:
 
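For context, process_transcript previously returned only the index-name string; callers now receive a dict. A minimal consumption sketch (mirroring the process_single_video change in app/main.py later in this commit; variable names are illustrative):

processed_data = data_processor.process_transcript(video_id, transcript_data)
if processed_data is not None:
    transcript_text = processed_data['content']   # cleaned transcript text
    index_name = processed_data['index_name']     # "video_<id>_<embedding dimension>"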
app/database.py CHANGED
@@ -17,7 +17,13 @@ class DatabaseHandler:
                 youtube_id TEXT UNIQUE,
                 title TEXT,
                 channel_name TEXT,
-                processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                upload_date TEXT,
+                view_count INTEGER,
+                like_count INTEGER,
+                comment_count INTEGER,
+                video_duration TEXT,
+                transcript_content TEXT
             )
         ''')
         cursor.execute('''
@@ -52,7 +58,6 @@ class DatabaseHandler:
     def update_schema(self):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
-            # Check if columns exist, if not, add them
             cursor.execute("PRAGMA table_info(videos)")
             columns = [column[1] for column in cursor.fetchall()]
 
@@ -61,7 +66,8 @@ class DatabaseHandler:
                 ("view_count", "INTEGER"),
                 ("like_count", "INTEGER"),
                 ("comment_count", "INTEGER"),
-                ("video_duration", "TEXT")
+                ("video_duration", "TEXT"),
+                ("transcript_content", "TEXT")
             ]
 
             for col_name, col_type in new_columns:
@@ -75,8 +81,8 @@ class DatabaseHandler:
             cursor = conn.cursor()
             cursor.execute('''
                 INSERT OR REPLACE INTO videos
-                (youtube_id, title, channel_name, upload_date, view_count, like_count, comment_count, video_duration)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                (youtube_id, title, channel_name, upload_date, view_count, like_count, comment_count, video_duration, transcript_content)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 video_data['video_id'],
                 video_data['title'],
@@ -85,7 +91,8 @@ class DatabaseHandler:
                 video_data['view_count'],
                 video_data['like_count'],
                 video_data['comment_count'],
-                video_data['video_duration']
+                video_data['video_duration'],
+                video_data['transcript_content']
             ))
             conn.commit()
             return cursor.lastrowid
@@ -147,22 +154,19 @@ class DatabaseHandler:
         ''')
         return cursor.fetchall()
 
-    def get_elasticsearch_index_by_youtube_id(self, youtube_id, embedding_model):
+    def get_elasticsearch_index_by_youtube_id(self, youtube_id):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
             cursor.execute('''
                 SELECT ei.index_name
                 FROM elasticsearch_indices ei
-                JOIN embedding_models em ON ei.embedding_model_id = em.id
                 JOIN videos v ON ei.video_id = v.id
-                WHERE v.youtube_id = ? AND em.model_name = ?
-            ''', (youtube_id, embedding_model))
+                WHERE v.youtube_id = ?
+            ''', (youtube_id,))
             result = cursor.fetchone()
             return result[0] if result else None
 
     def get_transcript_content(self, youtube_id):
-        # This method assumes you're storing the transcript content in the database
-        # If you're not, you'll need to modify this to retrieve the transcript from wherever it's stored
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
             cursor.execute('''
@@ -173,26 +177,13 @@ class DatabaseHandler:
             result = cursor.fetchone()
             return result[0] if result else None
 
-    # If you're not already storing the transcript content, you'll need to add a method to do so:
-    def add_transcript_content(self, youtube_id, transcript_content):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                UPDATE videos
-                SET transcript_content = ?
-                WHERE youtube_id = ?
-            ''', (transcript_content, youtube_id))
-            conn.commit()
-
-    def get_elasticsearch_index_by_youtube_id(self, youtube_id, embedding_model):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                SELECT ei.index_name
-                FROM elasticsearch_indices ei
-                JOIN embedding_models em ON ei.embedding_model_id = em.id
-                JOIN videos v ON ei.video_id = v.id
-                WHERE v.youtube_id = ? AND em.model_name = ?
-            ''', (youtube_id, embedding_model))
-            result = cursor.fetchone()
-            return result[0] if result else None
+    # This method is no longer needed as transcript is added in add_video
+    # def add_transcript_content(self, youtube_id, transcript_content):
+    #     with sqlite3.connect(self.db_path) as conn:
+    #         cursor = conn.cursor()
+    #         cursor.execute('''
+    #             UPDATE videos
+    #             SET transcript_content = ?
+    #             WHERE youtube_id = ?
+    #         ''', (transcript_content, youtube_id))
+    #         conn.commit()
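The body of the column-migration loop falls outside the hunks above. A minimal sketch of the PRAGMA-then-ALTER pattern it presumably implements (a hypothetical reconstruction, not the committed code):

# Add each column from new_columns only if PRAGMA table_info(videos)
# did not already report it; SQLite has no ADD COLUMN IF NOT EXISTS.
for col_name, col_type in new_columns:
    if col_name not in columns:
        cursor.execute(f"ALTER TABLE videos ADD COLUMN {col_name} {col_type}")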
 
app/evaluation.py CHANGED
@@ -78,7 +78,7 @@ class EvaluationSystem:
         question = row['question']
         video_id = row['video_id']
 
-        index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id, "multi-qa-MiniLM-L6-cos-v1")
+        index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
 
         if not index_name:
             print(f"No index found for video {video_id}. Skipping this question.")
 
app/generate_ground_truth.py CHANGED
@@ -2,7 +2,49 @@ import pandas as pd
 import json
 from tqdm import tqdm
 import ollama
-from transcript_extractor import get_transcript
+from elasticsearch import Elasticsearch
+import sqlite3
+import logging
+import os
+import re
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def extract_model_name(index_name):
+    # Extract the model name from the index name
+    match = re.search(r'video_[^_]+_(.+)$', index_name)
+    if match:
+        return match.group(1)
+    return None
+
+def get_transcript_from_elasticsearch(es, index_name, video_id):
+    try:
+        result = es.search(index=index_name, body={
+            "query": {
+                "match": {
+                    "video_id": video_id
+                }
+            }
+        })
+        if result['hits']['hits']:
+            return result['hits']['hits'][0]['_source']['content']
+    except Exception as e:
+        logger.error(f"Error retrieving transcript from Elasticsearch: {str(e)}")
+    return None
+
+def get_transcript_from_sqlite(db_path, video_id):
+    try:
+        conn = sqlite3.connect(db_path)
+        cursor = conn.cursor()
+        cursor.execute("SELECT transcript_content FROM videos WHERE youtube_id = ?", (video_id,))
+        result = cursor.fetchone()
+        conn.close()
+        if result:
+            return result[0]
+    except Exception as e:
+        logger.error(f"Error retrieving transcript from SQLite: {str(e)}")
+    return None
 
 def generate_questions(transcript):
     prompt_template = """
@@ -30,30 +72,50 @@ def generate_questions(transcript):
         )
         return json.loads(response['message']['content'])
     except Exception as e:
-        print(f"Error generating questions: {str(e)}")
+        logger.error(f"Error generating questions: {str(e)}")
         return None
 
 def generate_ground_truth(db_handler, data_processor, video_id):
-    transcript_data = get_transcript(video_id)
-    if transcript_data and 'transcript' in transcript_data:
-        full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
-        # Process the transcript
-        data_processor.process_transcript(video_id, transcript_data)
-    else:
-        print(f"Failed to retrieve transcript for video {video_id}")
+    es = Elasticsearch([f'http://{os.getenv("ELASTICSEARCH_HOST", "localhost")}:{os.getenv("ELASTICSEARCH_PORT", "9200")}'])
+
+    # Get the index name for the video
+    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+
+    if not index_name:
+        logger.error(f"No Elasticsearch index found for video {video_id}")
+        return None
+
+    # Extract the model name from the index name
+    model_name = extract_model_name(index_name)
+
+    if not model_name:
+        logger.error(f"Could not extract model name from index name: {index_name}")
+        return None
+
+    transcript = None
+    if index_name:
+        transcript = get_transcript_from_elasticsearch(es, index_name, video_id)
+        logger.info(f"Transcript to generate questions using elasticsearch is {transcript}")
+
+    if not transcript:
+        transcript = db_handler.get_transcript_content(video_id)
+        logger.info(f"Transcript to generate questions using textual data is {transcript}")
+
+    if not transcript:
+        logger.error(f"Failed to retrieve transcript for video {video_id}")
         return None
 
-    questions = generate_questions(full_transcript)
+    questions = generate_questions(transcript)
 
     if questions and 'questions' in questions:
         df = pd.DataFrame([(video_id, q) for q in questions['questions']], columns=['video_id', 'question'])
 
         csv_path = 'data/ground-truth-retrieval.csv'
         df.to_csv(csv_path, index=False)
-        print(f"Ground truth data saved to {csv_path}")
+        logger.info(f"Ground truth data saved to {csv_path}")
         return df
     else:
-        print("Failed to generate questions.")
+        logger.error("Failed to generate questions.")
         return None
 
 def generate_ground_truth_for_all_videos(db_handler, data_processor):
@@ -62,23 +124,16 @@ def generate_ground_truth_for_all_videos(db_handler, data_processor):
 
     for video in tqdm(videos, desc="Generating ground truth"):
         video_id = video[0]  # Assuming the video ID is the first element in the tuple
-        transcript_data = get_transcript(video_id)
-        if transcript_data and 'transcript' in transcript_data:
-            full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
-            # Process the transcript
-            data_processor.process_transcript(video_id, transcript_data)
-            questions = generate_questions(full_transcript)
-            if questions and 'questions' in questions:
-                all_questions.extend([(video_id, q) for q in questions['questions']])
-        else:
-            print(f"Failed to retrieve transcript for video {video_id}")
+        df = generate_ground_truth(db_handler, data_processor, video_id)
+        if df is not None:
+            all_questions.extend(df.values.tolist())
 
     if all_questions:
         df = pd.DataFrame(all_questions, columns=['video_id', 'question'])
         csv_path = 'data/ground-truth-retrieval.csv'
         df.to_csv(csv_path, index=False)
-        print(f"Ground truth data for all videos saved to {csv_path}")
+        logger.info(f"Ground truth data for all videos saved to {csv_path}")
         return df
     else:
-        print("Failed to generate questions for any video.")
+        logger.error("Failed to generate questions for any video.")
         return None
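A note on extract_model_name: the pattern r'video_[^_]+_(.+)$' assumes the video ID itself contains no underscore. Illustrative behavior (these index names are examples, not values from the commit):

extract_model_name("video_abc123_multi-qa-MiniLM-L6-cos-v1")
# -> "multi-qa-MiniLM-L6-cos-v1"

# Caveat: YouTube IDs may contain underscores (e.g. zjkBMFhNj_g), in which
# case [^_]+ stops at the first underscore and the tail of the ID leaks
# into the captured "model name":
extract_model_name("video_zjkBMFhNj_g_multi-qa-MiniLM-L6-cos-v1")
# -> "g_multi-qa-MiniLM-L6-cos-v1"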
 
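Taken together, generate_ground_truth now resolves a transcript through a two-step fallback, and generate_ground_truth_for_all_videos simply delegates per video. Condensed flow (identifiers as defined above):

# 1. Prefer the indexed document in Elasticsearch.
transcript = get_transcript_from_elasticsearch(es, index_name, video_id)
# 2. Fall back to the transcript_content column added to the videos table.
if not transcript:
    transcript = db_handler.get_transcript_content(video_id)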
app/main.py CHANGED
@@ -15,6 +15,9 @@ import logging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 @st.cache_resource
 def init_components():
     try:
@@ -31,6 +34,7 @@ def init_components():
         st.error("Please check your configuration and ensure all services are running.")
         return None, None, None, None, None
 
+
 def check_api_key():
     if test_api_key():
         st.success("YouTube API key is valid and working.")
@@ -88,7 +92,7 @@ Do not include any text outside of this JSON object.
 """
 
 def process_single_video(db_handler, data_processor, video_id, embedding_model):
-    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
+    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
     if existing_index:
         logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
         return existing_index
@@ -99,6 +103,13 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
         return None
 
+    # Process the transcript
+    processed_data = data_processor.process_transcript(video_id, transcript_data)
+    if processed_data is None:
+        logger.error(f"Failed to process transcript for video {video_id}")
+        return None
+
+    # Prepare video data for database insertion
     video_data = {
         'video_id': video_id,
         'title': transcript_data['metadata'].get('title', 'Unknown Title'),
@@ -107,8 +118,10 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         'view_count': int(transcript_data['metadata'].get('view_count', 0)),
         'like_count': int(transcript_data['metadata'].get('like_count', 0)),
         'comment_count': int(transcript_data['metadata'].get('comment_count', 0)),
-        'video_duration': transcript_data['metadata'].get('duration', 'Unknown Duration')
+        'video_duration': transcript_data['metadata'].get('duration', 'Unknown Duration'),
+        'transcript_content': processed_data['content']  # Add this line to include the transcript content
     }
+
     try:
         db_handler.add_video(video_data)
     except Exception as e:
@@ -116,13 +129,6 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         st.error(f"Error adding video {video_id} to database: {str(e)}")
         return None
 
-    try:
-        data_processor.process_transcript(video_id, transcript_data)
-    except Exception as e:
-        logger.error(f"Error processing transcript: {str(e)}")
-        st.error(f"Error processing transcript for video {video_id}: {str(e)}")
-        return None
-
     index_name = f"video_{video_id}_{embedding_model}".lower()
     try:
         index_name = data_processor.build_index(index_name)
@@ -158,7 +164,7 @@ def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
     return indices
 
 def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
-    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
+    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
     if not index_name:
         st.warning(f"Video {video_id} has not been processed yet. Processing now...")
         index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
@@ -201,7 +207,7 @@ def main():
     st.dataframe(video_df)
     selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])
 
-    index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id, embedding_model)
+    index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id)
 
     if index_name:
         st.success(f"Using index: {index_name}")
 
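The net effect in process_single_video is a reordering: the transcript is processed first, so its cleaned content can travel into the same add_video insert, and the old post-insert process_transcript try/except is dropped. Condensed flow after this commit (error handling elided; a sketch, not the full function):

processed_data = data_processor.process_transcript(video_id, transcript_data)  # clean + embed
video_data['transcript_content'] = processed_data['content']                   # carry transcript into the row
db_handler.add_video(video_data)                                               # one INSERT OR REPLACE
index_name = data_processor.build_index(f"video_{video_id}_{embedding_model}".lower())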
data/ground-truth-retrieval.csv CHANGED
@@ -1,7 +1,4 @@
 video_id,question
-zjkBMFhNj_g,What are Google Apps Script and its relation to user data security within a domain?
-zjkBMFhNj_g,"How can prompt injection attacks manipulate language models' outputs using shared documents like those managed by Gmail users or Microsoft Office files (Word, Excel)?"
-zjkBMFhNj_g,"In the context of AI-based systems such as large language models (LLMs), how might an attacker exploit these tools to exfiltrate sensitive user data from a Google Doc? Please provide details."
-zjkBMFhNj_g,"Can you explain prompt injection attacks and their potential impact on LLM predictions, including any specific examples provided in the discussion like using 'James Bond' as a trigger phrase for threat detection tasks or title generation?"
-zjkBMFhNj_g,Are there defenses against these types of language model (LLM) security threats similar to traditional cybersecurity measures such as prompt injection attacks and data poisoning? Please elaborate.
-zjkBMFhNj_g,"What does the future hold for LLMs considering their benefits, potential risks including adversarial exploitation like those discussed here, regulatory oversight needs due to privacy concerns (GDPR), mitigation of harmful outputs by these models in various applications?"
+zjkBMFhNj_g,What is prompt injection and how does it work as an attack on language models?
+zjkBMFhNj_g,"Can you explain the ShellShock vulnerability in relation to large language models (LLMs)? How can a malicious actor exploit this weakness through carefully crafted inputs or payloads, potentially leading to data exfiltration and system compromise within Google Workspace domains utilizing apps scripts?"
+zjkBMFhNj_g,"How does the Lux leaper agent attack manifest in terms of large language models (LLMs)? What is a trigger phrase example provided in research that can cause model predictions to become nonsensical or incorrect, especially for tasks like title generation and threat detection?"
 
data/sqlite.db CHANGED
Binary files a/data/sqlite.db and b/data/sqlite.db differ