forth commit

Files changed:

- app/data_processor.py +7 -1
- app/database.py +26 -35
- app/evaluation.py +1 -1
- app/generate_ground_truth.py +79 -24
- app/main.py +17 -11
- data/ground-truth-retrieval.csv +3 -6
- data/sqlite.db +0 -0
app/data_processor.py
CHANGED

@@ -93,7 +93,13 @@ class DataProcessor:
         self.embeddings.append(embedding)

         logger.info(f"Processed transcript for video {video_id}")
-
+
+        # Return a dictionary with the processed content and other relevant information
+        return {
+            'content': cleaned_transcript,
+            'metadata': metadata,
+            'index_name': f"video_{video_id}_{self.embedding_model.get_sentence_embedding_dimension()}"
+        }

     def build_index(self, index_name):
         if not self.documents:
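Note on the hunk above: process_transcript previously ended without a return value; it now returns the cleaned transcript together with metadata and an index name. A minimal sketch of how a caller might consume the new return value, assuming db_handler, data_processor, video_id, transcript_data, and video_data are set up as in app/main.py:

# Sketch: consuming the dictionary process_transcript now returns.
# Assumes objects initialized as in app/main.py.
processed = data_processor.process_transcript(video_id, transcript_data)
if processed is not None:
    # 'index_name' embeds the embedding dimension, e.g. "video_<id>_384"
    print(processed['index_name'])
    video_data['transcript_content'] = processed['content']
    db_handler.add_video(video_data)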
app/database.py
CHANGED

@@ -17,7 +17,13 @@ class DatabaseHandler:
                 youtube_id TEXT UNIQUE,
                 title TEXT,
                 channel_name TEXT,
-                processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+                processed_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+                upload_date TEXT,
+                view_count INTEGER,
+                like_count INTEGER,
+                comment_count INTEGER,
+                video_duration TEXT,
+                transcript_content TEXT
             )
         ''')
         cursor.execute('''
@@ -52,7 +58,6 @@ class DatabaseHandler:
     def update_schema(self):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
-            # Check if columns exist, if not, add them
             cursor.execute("PRAGMA table_info(videos)")
             columns = [column[1] for column in cursor.fetchall()]

@@ -61,7 +66,8 @@ class DatabaseHandler:
                 ("view_count", "INTEGER"),
                 ("like_count", "INTEGER"),
                 ("comment_count", "INTEGER"),
-                ("video_duration", "TEXT")
+                ("video_duration", "TEXT"),
+                ("transcript_content", "TEXT")
             ]

             for col_name, col_type in new_columns:
@@ -75,8 +81,8 @@ class DatabaseHandler:
             cursor = conn.cursor()
             cursor.execute('''
                 INSERT OR REPLACE INTO videos
-                (youtube_id, title, channel_name, upload_date, view_count, like_count, comment_count, video_duration)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                (youtube_id, title, channel_name, upload_date, view_count, like_count, comment_count, video_duration, transcript_content)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 video_data['video_id'],
                 video_data['title'],
@@ -85,7 +91,8 @@ class DatabaseHandler:
                 video_data['view_count'],
                 video_data['like_count'],
                 video_data['comment_count'],
-                video_data['video_duration']
+                video_data['video_duration'],
+                video_data['transcript_content']
             ))
             conn.commit()
             return cursor.lastrowid
@@ -147,22 +154,19 @@ class DatabaseHandler:
         ''')
         return cursor.fetchall()

-    def get_elasticsearch_index_by_youtube_id(self, youtube_id
+    def get_elasticsearch_index_by_youtube_id(self, youtube_id):
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
             cursor.execute('''
                 SELECT ei.index_name
                 FROM elasticsearch_indices ei
-                JOIN embedding_models em ON ei.embedding_model_id = em.id
                 JOIN videos v ON ei.video_id = v.id
-                WHERE v.youtube_id = ?
-            ''', (youtube_id,
+                WHERE v.youtube_id = ?
+            ''', (youtube_id,))
             result = cursor.fetchone()
             return result[0] if result else None

     def get_transcript_content(self, youtube_id):
-        # This method assumes you're storing the transcript content in the database
-        # If you're not, you'll need to modify this to retrieve the transcript from wherever it's stored
         with sqlite3.connect(self.db_path) as conn:
             cursor = conn.cursor()
             cursor.execute('''
@@ -173,26 +177,13 @@ class DatabaseHandler:
             result = cursor.fetchone()
             return result[0] if result else None

-    #
-    def add_transcript_content(self, youtube_id, transcript_content):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                UPDATE videos
-                SET transcript_content = ?
-                WHERE youtube_id = ?
-            ''', (transcript_content, youtube_id))
-            conn.commit()
-
-    def get_elasticsearch_index_by_youtube_id(self, youtube_id, embedding_model):
-        with sqlite3.connect(self.db_path) as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                SELECT ei.index_name
-                FROM elasticsearch_indices ei
-                JOIN embedding_models em ON ei.embedding_model_id = em.id
-                JOIN videos v ON ei.video_id = v.id
-                WHERE v.youtube_id = ? AND em.model_name = ?
-            ''', (youtube_id, embedding_model))
-            result = cursor.fetchone()
-            return result[0] if result else None
+    # This method is no longer needed as transcript is added in add_video
+    # def add_transcript_content(self, youtube_id, transcript_content):
+    #     with sqlite3.connect(self.db_path) as conn:
+    #         cursor = conn.cursor()
+    #         cursor.execute('''
+    #             UPDATE videos
+    #             SET transcript_content = ?
+    #             WHERE youtube_id = ?
+    #         ''', (transcript_content, youtube_id))
+    #         conn.commit()
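Note on add_video above: INSERT OR REPLACE resolves a youtube_id conflict by deleting the old row and inserting a fresh one, so the integer primary key changes on every re-insert; if elasticsearch_indices rows store videos.id by value, as the JOINs above suggest, a re-insert can leave them pointing at a vanished id. A self-contained sketch of the behavior (table trimmed to two data columns; AUTOINCREMENT is an assumption here, the real schema may differ):

# Sketch: INSERT OR REPLACE semantics behind add_video, on a trimmed table
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""CREATE TABLE videos (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    youtube_id TEXT UNIQUE,
    transcript_content TEXT
)""")
ins = "INSERT OR REPLACE INTO videos (youtube_id, transcript_content) VALUES (?, ?)"
conn.execute(ins, ("zjkBMFhNj_g", "first version"))
conn.execute(ins, ("zjkBMFhNj_g", "second version"))  # replaces the existing row
print(conn.execute("SELECT id, transcript_content FROM videos").fetchall())
# -> [(2, 'second version')]: one row survives, but with a new id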
app/evaluation.py
CHANGED

@@ -78,7 +78,7 @@ class EvaluationSystem:
             question = row['question']
             video_id = row['video_id']

-            index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id
+            index_name = self.db_handler.get_elasticsearch_index_by_youtube_id(video_id)

             if not index_name:
                 print(f"No index found for video {video_id}. Skipping this question.")
app/generate_ground_truth.py
CHANGED

@@ -2,7 +2,49 @@ import pandas as pd
 import json
 from tqdm import tqdm
 import ollama
-from
+from elasticsearch import Elasticsearch
+import sqlite3
+import logging
+import os
+import re
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def extract_model_name(index_name):
+    # Extract the model name from the index name
+    match = re.search(r'video_[^_]+_(.+)$', index_name)
+    if match:
+        return match.group(1)
+    return None
+
+def get_transcript_from_elasticsearch(es, index_name, video_id):
+    try:
+        result = es.search(index=index_name, body={
+            "query": {
+                "match": {
+                    "video_id": video_id
+                }
+            }
+        })
+        if result['hits']['hits']:
+            return result['hits']['hits'][0]['_source']['content']
+    except Exception as e:
+        logger.error(f"Error retrieving transcript from Elasticsearch: {str(e)}")
+    return None
+
+def get_transcript_from_sqlite(db_path, video_id):
+    try:
+        conn = sqlite3.connect(db_path)
+        cursor = conn.cursor()
+        cursor.execute("SELECT transcript_content FROM videos WHERE youtube_id = ?", (video_id,))
+        result = cursor.fetchone()
+        conn.close()
+        if result:
+            return result[0]
+    except Exception as e:
+        logger.error(f"Error retrieving transcript from SQLite: {str(e)}")
+    return None

 def generate_questions(transcript):
     prompt_template = """
@@ -30,30 +72,50 @@ def generate_questions(transcript):
         )
         return json.loads(response['message']['content'])
     except Exception as e:
-
+        logger.error(f"Error generating questions: {str(e)}")
         return None

 def generate_ground_truth(db_handler, data_processor, video_id):
-
-
-
-
-
-
-
+    es = Elasticsearch([f'http://{os.getenv("ELASTICSEARCH_HOST", "localhost")}:{os.getenv("ELASTICSEARCH_PORT", "9200")}'])
+
+    # Get the index name for the video
+    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
+
+    if not index_name:
+        logger.error(f"No Elasticsearch index found for video {video_id}")
+        return None
+
+    # Extract the model name from the index name
+    model_name = extract_model_name(index_name)
+
+    if not model_name:
+        logger.error(f"Could not extract model name from index name: {index_name}")
+        return None
+
+    transcript = None
+    if index_name:
+        transcript = get_transcript_from_elasticsearch(es, index_name, video_id)
+        logger.info(f"Transcript to generate questions using elasticsearch is {transcript}")
+
+    if not transcript:
+        transcript = db_handler.get_transcript_content(video_id)
+        logger.info(f"Transcript to generate questions using textual data is {transcript}")
+
+    if not transcript:
+        logger.error(f"Failed to retrieve transcript for video {video_id}")
         return None

-    questions = generate_questions(
+    questions = generate_questions(transcript)

     if questions and 'questions' in questions:
         df = pd.DataFrame([(video_id, q) for q in questions['questions']], columns=['video_id', 'question'])

         csv_path = 'data/ground-truth-retrieval.csv'
         df.to_csv(csv_path, index=False)
-
+        logger.info(f"Ground truth data saved to {csv_path}")
         return df
     else:
-
+        logger.error("Failed to generate questions.")
         return None

 def generate_ground_truth_for_all_videos(db_handler, data_processor):
@@ -62,23 +124,16 @@ def generate_ground_truth_for_all_videos(db_handler, data_processor):

     for video in tqdm(videos, desc="Generating ground truth"):
         video_id = video[0]  # Assuming the video ID is the first element in the tuple
-
-        if
-
-            # Process the transcript
-            data_processor.process_transcript(video_id, transcript_data)
-            questions = generate_questions(full_transcript)
-            if questions and 'questions' in questions:
-                all_questions.extend([(video_id, q) for q in questions['questions']])
-            else:
-                print(f"Failed to retrieve transcript for video {video_id}")
+        df = generate_ground_truth(db_handler, data_processor, video_id)
+        if df is not None:
+            all_questions.extend(df.values.tolist())

     if all_questions:
         df = pd.DataFrame(all_questions, columns=['video_id', 'question'])
         csv_path = 'data/ground-truth-retrieval.csv'
         df.to_csv(csv_path, index=False)
-
+        logger.info(f"Ground truth data for all videos saved to {csv_path}")
         return df
     else:
-
+        logger.error("Failed to generate questions for any video.")
         return None
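Note on extract_model_name above: the regex treats everything between "video_" and the first following underscore as the video ID, but YouTube IDs can themselves contain underscores (zjkBMFhNj_g in this repo's data), in which case the split lands too early. A small self-contained sketch of both cases (the model-name suffix is illustrative):

# Sketch: behavior of the extract_model_name regex from this file
import re

def extract_model_name(index_name):
    match = re.search(r'video_[^_]+_(.+)$', index_name)
    return match.group(1) if match else None

print(extract_model_name("video_abc123_all-minilm-l6-v2"))
# -> 'all-minilm-l6-v2' (ID without underscores: model name recovered)
print(extract_model_name("video_zjkBMFhNj_g_all-minilm-l6-v2"))
# -> 'g_all-minilm-l6-v2' (underscore inside the ID shifts the split)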
app/main.py
CHANGED

@@ -15,6 +15,9 @@ import logging
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger(__name__)

+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 @st.cache_resource
 def init_components():
     try:
@@ -31,6 +34,7 @@ def init_components():
         st.error("Please check your configuration and ensure all services are running.")
         return None, None, None, None, None

+
 def check_api_key():
     if test_api_key():
         st.success("YouTube API key is valid and working.")
@@ -88,7 +92,7 @@ Do not include any text outside of this JSON object.
 """

 def process_single_video(db_handler, data_processor, video_id, embedding_model):
-    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id
+    existing_index = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
     if existing_index:
         logger.info(f"Video {video_id} has already been processed with {embedding_model}. Using existing index: {existing_index}")
         return existing_index
@@ -99,6 +103,13 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         st.error(f"Failed to retrieve transcript for video {video_id}. Please check if the video ID is correct and the video has captions available.")
         return None

+    # Process the transcript
+    processed_data = data_processor.process_transcript(video_id, transcript_data)
+    if processed_data is None:
+        logger.error(f"Failed to process transcript for video {video_id}")
+        return None
+
+    # Prepare video data for database insertion
     video_data = {
         'video_id': video_id,
         'title': transcript_data['metadata'].get('title', 'Unknown Title'),
@@ -107,8 +118,10 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         'view_count': int(transcript_data['metadata'].get('view_count', 0)),
         'like_count': int(transcript_data['metadata'].get('like_count', 0)),
         'comment_count': int(transcript_data['metadata'].get('comment_count', 0)),
-        'video_duration': transcript_data['metadata'].get('duration', 'Unknown Duration')
+        'video_duration': transcript_data['metadata'].get('duration', 'Unknown Duration'),
+        'transcript_content': processed_data['content']  # Add this line to include the transcript content
     }
+
     try:
         db_handler.add_video(video_data)
     except Exception as e:
@@ -116,13 +129,6 @@ def process_single_video(db_handler, data_processor, video_id, embedding_model):
         st.error(f"Error adding video {video_id} to database: {str(e)}")
         return None

-    try:
-        data_processor.process_transcript(video_id, transcript_data)
-    except Exception as e:
-        logger.error(f"Error processing transcript: {str(e)}")
-        st.error(f"Error processing transcript for video {video_id}: {str(e)}")
-        return None
-
     index_name = f"video_{video_id}_{embedding_model}".lower()
     try:
         index_name = data_processor.build_index(index_name)
@@ -158,7 +164,7 @@ def process_multiple_videos(db_handler, data_processor, video_ids, embedding_model):
     return indices

 def ensure_video_processed(db_handler, data_processor, video_id, embedding_model):
-    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id
+    index_name = db_handler.get_elasticsearch_index_by_youtube_id(video_id)
     if not index_name:
         st.warning(f"Video {video_id} has not been processed yet. Processing now...")
         index_name = process_single_video(db_handler, data_processor, video_id, embedding_model)
@@ -201,7 +207,7 @@ def main():
         st.dataframe(video_df)
         selected_video_id = st.selectbox("Select a Video", video_df['youtube_id'].tolist(), format_func=lambda x: video_df[video_df['youtube_id'] == x]['title'].iloc[0])

-        index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id
+        index_name = db_handler.get_elasticsearch_index_by_youtube_id(selected_video_id)

         if index_name:
             st.success(f"Using index: {index_name}")
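Note on the logging hunk above: logging.basicConfig is a no-op once the root logger has handlers, so the second call added here does not actually move the level from DEBUG to INFO. A minimal sketch:

# Sketch: why the second basicConfig call leaves DEBUG in effect
import logging

logging.basicConfig(level=logging.DEBUG)
logging.basicConfig(level=logging.INFO)  # ignored: root logger already configured
logging.getLogger(__name__).debug("still emitted at DEBUG")
# logging.basicConfig(level=logging.INFO, force=True)  # would switch (Python 3.8+)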
data/ground-truth-retrieval.csv
CHANGED

@@ -1,7 +1,4 @@
 video_id,question
-zjkBMFhNj_g,What
-zjkBMFhNj_g,"
-zjkBMFhNj_g,"
-zjkBMFhNj_g,"Can you explain prompt injection attacks and their potential impact on LLM predictions, including any specific examples provided in the discussion like using 'James Bond' as a trigger phrase for threat detection tasks or title generation?"
-zjkBMFhNj_g,Are there defenses against these types of language model (LLM) security threats similar to traditional cybersecurity measures such as prompt injection attacks and data poisoning? Please elaborate.
-zjkBMFhNj_g,"What does the future hold for LLMs considering their benefits, potential risks including adversarial exploitation like those discussed here, regulatory oversight needs due to privacy concerns (GDPR), mitigation of harmful outputs by these models in various applications?"
+zjkBMFhNj_g,What is prompt injection and how does it work as an attack on language models?
+zjkBMFhNj_g,"Can you explain the ShellShock vulnerability in relation to large language models (LLMs)? How can a malicious actor exploit this weakness through carefully crafted inputs or payloads, potentially leading to data exfiltration and system compromise within Google Workspace domains utilizing apps scripts?"
+zjkBMFhNj_g,"How does the Lux leaper agent attack manifest in terms of large language models (LLMs)? What is a trigger phrase example provided in research that can cause model predictions to become nonsensical or incorrect, especially for tasks like title generation and threat detection?"
data/sqlite.db
CHANGED

Binary files a/data/sqlite.db and b/data/sqlite.db differ