ganesh3 committed on
Commit a61b32e · 1 Parent(s): 25b2b2b

fourth commit

app/data_processor.py CHANGED
@@ -93,7 +93,13 @@ class DataProcessor:
             self.embeddings.append(embedding)
 
         logger.info(f"Processed transcript for video {video_id}")
-        return f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
+
+        # Return a dictionary with the processed content and other relevant information
+        return {
+            'content': cleaned_transcript,
+            'metadata': metadata,
+            'index_name': f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
+        }
 
     def build_index(self, index_name):
         if not self.documents:
 
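For context, process_transcript previously returned only the index-name string; callers now receive a dict. A minimal consumption sketch (mirroring the process_single_video change in app/main.py later in this commit; variable names are illustrative):

processed_data = data_processor.process_transcript(video_id, transcript_data)
if processed_data is not None:
    transcript_text = processed_data['content']   # cleaned transcript text
    index_name = processed_data['index_name']     # "video_<id>_<embedding dimension>"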
app/database.py CHANGED
@@ -17,7 +17,13 @@ class DatabaseHandler:
                 youtube_id TEXT UNIQUE,
                 title TEXT,
                 channel_name TEXT,
-                processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                upload_date TEXT,
+                view_count INTEGER,
+                like_count INTEGER,
+                comment_count INTEGER,
+                video_duration TEXT,
+                transcript_content TEXT
             )
         ''')
         cursor.execute('''
@@ -52,7 +58,6 @@ class DatabaseHandler:
     def update_schema(self):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
-            # Check if columns exist, if not, add them
             cursor.execute("PRAGMA table_info(videos)")
             columns = [column[1] for column in cursor.fetchall()]
 
@@ -61,7 +66,8 @@ class DatabaseHandler:
                 ("view_count", "INTEGER"),
                 ("like_count", "INTEGER"),
                 ("comment_count", "INTEGER"),
-                ("video_duration", "TEXT")
+                ("video_duration", "TEXT"),
+                ("transcript_content", "TEXT")
             ]
 
             for col_name, col_type in new_columns:
@@ -75,8 +81,8 @@ class DatabaseHandler:
             cursor = conn.cursor()
             cursor.execute('''
                 INSERT OR REPLACE INTO videos
-                (youtube_id, title, channel_name, upload_date, view_count, like_count, comment_count, video_duration)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                (youtube_id, title, channel_name, upload_date, view_count, like_count, comment_count, video_duration, transcript_content)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 video_data['video_id'],
                 video_data['title'],
@@ -85,7 +91,8 @@ class DatabaseHandler:
                 video_data['view_count'],
                 video_data['like_count'],
                 video_data['comment_count'],
-                video_data['video_duration']
+                video_data['video_duration'],
+                video_data['transcript_content']
             ))
             conn.commit()
             return cursor.lastrowid
@@ -147,22 +154,19 @@ class DatabaseHandler:
         ''')
         return cursor.fetchall()
 
-    def get_elasticsearch_index_by_youtube_id(self, youtube_id, embedding_model):
+    def get_elasticsearch_index_by_youtube_id(self, youtube_id):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
             cursor.execute('''
                 SELECT ei.index_name
                 FROM elasticsearch_indices ei
-                JOIN embedding_models em ON ei.embedding_model_id = em.id
                 JOIN videos v ON ei.video_id = v.id
-                WHERE v.youtube_id = ? AND em.model_name = ?
-            ''', (youtube_id, embedding_model))
+                WHERE v.youtube_id = ?
+            ''', (youtube_id,))
             result = cursor.fetchone()
             return result[0] if result else None
 
     def get_transcript_content(self, youtube_id):
-        # This method assumes you're storing the transcript content in the database
-        # If you're not, you'll need to modify this to retrieve the transcript from wherever it's stored
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
             cursor.execute('''
@@ -173,26 +177,13 @@ class DatabaseHandler:
             result = cursor.fetchone()
             return result[0] if result else None
 
-    # If you're not already storing the transcript content, you'll need to add a method to do so:
-    def add_transcript_content(self, youtube_id, transcript_content):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                UPDATE videos
-                SET transcript_content = ?
-                WHERE youtube_id = ?
-            ''', (transcript_content, youtube_id))
-            conn.commit()
-
-    def get_elasticsearch_index_by_youtube_id(self, youtube_id, embedding_model):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                SELECT ei.index_name
-                FROM elasticsearch_indices ei
-                JOIN embedding_models em ON ei.embedding_model_id = em.id
-                JOIN videos v ON ei.video_id = v.id
-                WHERE v.youtube_id = ? AND em.model_name = ?
-            ''', (youtube_id, embedding_model))
-            result = cursor.fetchone()
-            return result[0] if result else None
+    # This method is no longer needed as transcript is added in add_video
+    # def add_transcript_content(self, youtube_id, transcript_content):
+    #     with sqlite3.connect(self.db_path) as conn:
+    #         cursor = conn.cursor()
+    #         cursor.execute('''
+    #             UPDATE videos
+    #             SET transcript_content = ?
+    #             WHERE youtube_id = ?
+    #         ''', (transcript_content, youtube_id))
+    #         conn.commit()
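The body of the column-migration loop falls outside the hunks above. A minimal sketch of the PRAGMA-then-ALTER pattern it presumably implements (a hypothetical reconstruction, not the committed code):

# Add each column from new_columns only if PRAGMA table_info(videos)
# did not already report it; SQLite has no ADD COLUMN IF NOT EXISTS.
for col_name, col_type in new_columns:
    if col_name not in columns:
        cursor.execute(f"ALTER TABLE videos ADD COLUMN {col_name} {col_type}")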
 
app/evaluation.py CHANGED
@@ -78,7 +78,7 @@ class EvaluationSystem:
         question = row['question']
         video_id = row['video_id']
 
-        index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id, "multi-qa-MiniLM-L6-cos-v1")
+        index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)
 
         if not index_name:
             print(f"No index found for video {video_id}. Skipping this question.")
 
app/generate_ground_truth.py CHANGED
@@ -2,7 +2,49 @@ import pandas as pd
 import json
 from tqdm import tqdm
 import ollama
-from transcript_extractor import get_transcript
+from elasticsearch import Elasticsearch
+import sqlite3
+import logging
+import os
+import re
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def extract_model_name(index_name):
+    # Extract the model name from the index name
+    match = re.search(r'video_[^_]+_(.+)$', index_name)
+    if match:
+        return match.group(1)
+    return None
+
+def get_transcript_from_elasticsearch(es, index_name, video_id):
+    try:
+        result = es.search(index=index_name, body={
+            "query": {
+                "match": {
+                    "video_id": video_id
+                }
+            }
+        })
+        if result['hits']['hits']:
+            return result['hits']['hits'][0]['_source']['content']
+    except Exception as e:
+        logger.error(f"Error retrieving transcript from Elasticsearch: {str(e)}")
+    return None
+
+def get_transcript_from_sqlite(db_path, video_id):
+    try:
+        conn = sqlite3.connect(db_path)
+        cursor = conn.cursor()
+        cursor.execute("SELECT transcript_content FROM videos WHERE youtube_id = ?", (video_id,))
+        result = cursor.fetchone()
+        conn.close()
+        if result:
+            return result[0]
+    except Exception as e:
+        logger.error(f"Error retrieving transcript from SQLite: {str(e)}")
+    return None
 
 def generate_questions(transcript):
     prompt_template = """
@@ -30,30 +72,50 @@ def generate_questions(transcript):
         )
         return json.loads(response['message']['content'])
     except Exception as e:
-        print(f"Error generating questions: {str(e)}")
+        logger.error(f"Error generating questions: {str(e)}")
         return None
 
 def generate_ground_truth(db_handler, data_processor, video_id):
-    transcript_data = get_transcript(video_id)
-    if transcript_data and 'transcript' in transcript_data:
-        full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
-        # Process the transcript
-        data_processor.process_transcript(video_id, transcript_data)
-    else:
-        print(f"Failed to retrieve transcript for video {video_id}")
+    es = Elasticsearch([f'http://{os.getenv("ELASTICSEARCH_HOST", "localhost")}:{os.getenv("ELASTICSEARCH_PORT", "9200")}'])
+
+    # Get the index name for the video
+    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+
+    if not index_name:
+        logger.error(f"No Elasticsearch index found for video {video_id}")
+        return None
+
+    # Extract the model name from the index name
+    model_name = extract_model_name(index_name)
+
+    if not model_name:
+        logger.error(f"Could not extract model name from index name: {index_name}")
+        return None
+
+    transcript = None
+    if index_name:
+        transcript = get_transcript_from_elasticsearch(es, index_name, video_id)
+        logger.info(f"Transcript to generate questions using elasticsearch is {transcript}")
+
+    if not transcript:
+        transcript = db_handler.get_transcript_content(video_id)
+        logger.info(f"Transcript to generate questions using textual data is {transcript}")
+
+    if not transcript:
+        logger.error(f"Failed to retrieve transcript for video {video_id}")
         return None
 
-    questions = generate_questions(full_transcript)
+    questions = generate_questions(transcript)
 
     if questions and 'questions' in questions:
         df = pd.DataFrame([(video_id, q) for q in questions['questions']], columns=['video_id', 'question'])
 
         csv_path = 'data/ground-truth-retrieval.csv'
         df.to_csv(csv_path, index=False)
-        print(f"Ground truth data saved to {csv_path}")
+        logger.info(f"Ground truth data saved to {csv_path}")
         return df
     else:
-        print("Failed to generate questions.")
+        logger.error("Failed to generate questions.")
         return None
 
 def generate_ground_truth_for_all_videos(db_handler, data_processor):
@@ -62,23 +124,16 @@ def generate_ground_truth_for_all_videos(db_handler, data_processor):
 
     for video in tqdm(videos, desc="Generating ground truth"):
         video_id = video[0]  # Assuming the video ID is the first element in the tuple
-        transcript_data = get_transcript(video_id)
-        if transcript_data and 'transcript' in transcript_data:
-            full_transcript = " ".join([entry['text'] for entry in transcript_data['transcript']])
-            # Process the transcript
-            data_processor.process_transcript(video_id, transcript_data)
-            questions = generate_questions(full_transcript)
-            if questions and 'questions' in questions:
-                all_questions.extend([(video_id, q) for q in questions['questions']])
-        else:
-            print(f"Failed to retrieve transcript for video {video_id}")
+        df = generate_ground_truth(db_handler, data_processor, video_id)
+        if df is not None:
+            all_questions.extend(df.values.tolist())
 
     if all_questions:
         df = pd.DataFrame(all_questions, columns=['video_id', 'question'])
         csv_path = 'data/ground-truth-retrieval.csv'
         df.to_csv(csv_path, index=False)
-        print(f"Ground truth data for all videos saved to {csv_path}")
+        logger.info(f"Ground truth data for all videos saved to {csv_path}")
         return df
     else:
-        print("Failed to generate questions for any video.")
+        logger.error("Failed to generate questions for any video.")
         return None
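A note on extract_model_name: the pattern r'video_[^_]+_(.+)$' assumes the video ID itself contains no underscore. Illustrative behavior (these index names are examples, not values from the commit):

extract_model_name("video_abc123_multi-qa-MiniLM-L6-cos-v1")
# -> "multi-qa-MiniLM-L6-cos-v1"

# Caveat: YouTube IDs may contain underscores (e.g. zjkBMFhNj_g), in which
# case [^_]+ stops at the first underscore and the tail of the ID leaks
# into the captured "model name":
extract_model_name("video_zjkBMFhNj_g_multi-qa-MiniLM-L6-cos-v1")
# -> "g_multi-qa-MiniLM-L6-cos-v1"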
 
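Taken together, generate_ground_truth now resolves a transcript through a two-step fallback, and generate_ground_truth_for_all_videos simply delegates per video. Condensed flow (identifiers as defined above):

# 1. Prefer the indexed document in Elasticsearch.
transcript = get_transcript_from_elasticsearch(es, index_name, video_id)
# 2. Fall back to the transcript_content column added to the videos table.
if not transcript:
    transcript = db_handler.get_transcript_content(video_id)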
app/main.py CHANGED
@@ -15,6 +15,9 @@ import logging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)
 
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 @st.cache_resource
 def init_components():
     try:
@@ -31,6 +34,7 @@ def init_components():
         st.error("Please check your configuration and ensure all services are running.")
         return None, None, None, None, None
 
+
 def check_api_key():
     if test_api_key():
         st.success("YouTube API key is valid and working.")
@@ -88,7 +92,7 @@ Do not include any text outside of this JSON object.
 """
 
 def process_single_video(db_handler, data_processor, video_id, embedding_model):
-    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
+    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
     if existing_index:
         logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
         return existing_index
@@ -99,6 +103,13 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
         return None
 
+    # Process the transcript
+    processed_data = data_processor.process_transcript(video_id, transcript_data)
+    if processed_data is None:
+        logger.error(f"Failed to process transcript for video {video_id}")
+        return None
+
+    # Prepare video data for database insertion
     video_data = {
         'video_id': video_id,
         'title': transcript_data['metadata'].get('title', 'Unknown Title'),
@@ -107,8 +118,10 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         'view_count': int(transcript_data['metadata'].get('view_count', 0)),
         'like_count': int(transcript_data['metadata'].get('like_count', 0)),
         'comment_count': int(transcript_data['metadata'].get('comment_count', 0)),
-        'video_duration': transcript_data['metadata'].get('duration', 'Unknown Duration')
+        'video_duration': transcript_data['metadata'].get('duration', 'Unknown Duration'),
+        'transcript_content': processed_data['content']  # Add this line to include the transcript content
     }
+
     try:
         db_handler.add_video(video_data)
     except Exception as e:
@@ -116,13 +129,6 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         st.error(f"Error adding video {video_id} to database: {str(e)}")
         return None
 
-    try:
-        data_processor.process_transcript(video_id, transcript_data)
-    except Exception as e:
-        logger.error(f"Error processing transcript: {str(e)}")
-        st.error(f"Error processing transcript for video {video_id}: {str(e)}")
-        return None
-
     index_name = f"video_{video_id}_{embedding_model}".lower()
     try:
         index_name = data_processor.build_index(index_name)
@@ -158,7 +164,7 @@ def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
     return indices
 
 def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
-    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id, embedding_model)
+    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
     if not index_name:
         st.warning(f"Video {video_id} has not been processed yet. Processing now...")
         index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
@@ -201,7 +207,7 @@ def main():
     st.dataframe(video_df)
     selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])
 
-    index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id, embedding_model)
+    index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id)
 
     if index_name:
         st.success(f"Using index: {index_name}")
 
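The net effect in process_single_video is a reordering: the transcript is processed first, so its cleaned content can travel into the same add_video insert, and the old post-insert process_transcript try/except is dropped. Condensed flow after this commit (error handling elided; a sketch, not the full function):

processed_data = data_processor.process_transcript(video_id, transcript_data)  # clean + embed
video_data['transcript_content'] = processed_data['content']                   # carry transcript into the row
db_handler.add_video(video_data)                                               # one INSERT OR REPLACE
index_name = data_processor.build_index(f"video_{video_id}_{embedding_model}".lower())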
data/ground-truth-retrieval.csv CHANGED
@@ -1,7 +1,4 @@
 video_id,question
-zjkBMFhNj_g,What are Google Apps Script and its relation to user data security within a domain?
-zjkBMFhNj_g,"How can prompt injection attacks manipulate language models' outputs using shared documents like those managed by Gmail users or Microsoft Office files (Word, Excel)?"
-zjkBMFhNj_g,"In the context of AI-based systems such as large language models (LLMs), how might an attacker exploit these tools to exfiltrate sensitive user data from a Google Doc? Please provide details."
-zjkBMFhNj_g,"Can you explain prompt injection attacks and their potential impact on LLM predictions, including any specific examples provided in the discussion like using 'James Bond' as a trigger phrase for threat detection tasks or title generation?"
-zjkBMFhNj_g,Are there defenses against these types of language model (LLM) security threats similar to traditional cybersecurity measures such as prompt injection attacks and data poisoning? Please elaborate.
-zjkBMFhNj_g,"What does the future hold for LLMs considering their benefits, potential risks including adversarial exploitation like those discussed here, regulatory oversight needs due to privacy concerns (GDPR), mitigation of harmful outputs by these models in various applications?"
+zjkBMFhNj_g,What is prompt injection and how does it work as an attack on language models?
+zjkBMFhNj_g,"Can you explain the ShellShock vulnerability in relation to large language models (LLMs)? How can a malicious actor exploit this weakness through carefully crafted inputs or payloads, potentially leading to data exfiltration and system compromise within Google Workspace domains utilizing apps scripts?"
+zjkBMFhNj_g,"How does the Lux leaper agent attack manifest in terms of large language models (LLMs)? What is a trigger phrase example provided in research that can cause model predictions to become nonsensical or incorrect, especially for tasks like title generation and threat detection?"
 
data/sqlite.db CHANGED
Binary files a/data/sqlite.db and b/data/sqlite.db differ