# rag-youtube-assistant/app/generate_ground_truth.py
# Hosting-page metadata (apparently): author ganesh3, commit dbd33b2
# ("first modification"), file size 2.26 kB.
import os
import pandas as pd
import json
from youtube_transcript_api import YouTubeTranscriptApi
from tqdm import tqdm
import requests
# Address of the Ollama server. Both parts can be overridden through
# environment variables; both are kept as strings (the port included).
OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'localhost')
OLLAMA_PORT = os.environ.get('OLLAMA_PORT', '11434')
def get_transcript(video_id):
    """Fetch a YouTube video's transcript as a single string.

    Args:
        video_id: Video identifier handed to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``'text'`` field of every transcript entry, joined with single
        spaces, or ``None`` when any step fails (the error is printed).
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        pieces = []
        for segment in segments:
            pieces.append(segment['text'])
        return " ".join(pieces)
    except Exception as err:
        # Best-effort lookup: report the problem and let the caller decide
        # what to do with a missing transcript.
        print(f"Error extracting transcript for video {video_id}: {str(err)}")
        return None
def generate_questions(transcript):
    """Ask the Ollama model for ten questions about a video transcript.

    The transcript is inserted into a fixed prompt and sent to the
    ``/api/generate`` endpoint at ``OLLAMA_HOST``:``OLLAMA_PORT`` using the
    ``phi3.5`` model. The model is instructed to answer with a JSON object
    of the form ``{"questions": [...]}``.

    Args:
        transcript: Transcript text to build the questions from.

    Returns:
        The parsed JSON object (a dict holding a ``'questions'`` list), or
        ``None`` if the request fails, the server answers with a non-200
        status, or the model output is not the expected JSON. Every failure
        is reported with a printed message.
    """
    prompt_template = """
You are an AI assistant tasked with generating questions based on a YouTube video transcript.
Formulate 10 questions that a user might ask based on the provided transcript.
Make the questions specific to the content of the transcript.
The questions should be complete and not too short. Use as few words as possible from the transcript.
The transcript:
{transcript}
Provide the output in parsable JSON without using code blocks:
{{"questions": ["question1", "question2", ..., "question10"]}}
""".strip()
    prompt = prompt_template.format(transcript=transcript)

    url = f'http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/generate'
    payload = {
        'model': 'phi3.5',
        'prompt': prompt,
        # The endpoint streams newline-delimited JSON chunks by default,
        # which response.json() cannot parse; ask for one complete object.
        'stream': False,
        # Have the server constrain the model output to valid JSON.
        'format': 'json',
    }

    try:
        response = requests.post(url, json=payload)
    except requests.RequestException as e:
        # Unreachable server, DNS failure, etc.: follow the same
        # "print and return None" convention as the HTTP error path.
        print(f"Error: request to {url} failed: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        questions = json.loads(response.json()['response'])
    except (KeyError, TypeError, ValueError) as e:
        # ValueError covers JSON decoding errors for both the HTTP body and
        # the model's own text; KeyError/TypeError cover a missing or
        # non-string 'response' field.
        print(f"Error: could not parse questions from model output: {e}")
        return None

    # Callers index the result with ['questions'], so reject other shapes
    # here instead of letting them fail later.
    if not isinstance(questions, dict) or not isinstance(questions.get('questions'), list):
        print("Error: model output does not contain a 'questions' list.")
        return None

    return questions
def main(video_id="zjkBMFhNj_g", output_path='data/ground-truth-retrieval.csv'):
    """Build a ground-truth question CSV for one YouTube video.

    Fetches the video's transcript, asks the model for questions about it,
    and writes one ``(video_id, question)`` row per question.

    Args:
        video_id: YouTube video to process. Defaults to the video the
            script was originally hard-wired to.
        output_path: Destination CSV file. Its parent directory is created
            if it does not exist.

    Returns:
        None. Progress and failures are reported with printed messages.
    """
    transcript = get_transcript(video_id)
    if not transcript:
        print("Failed to generate ground truth data due to transcript retrieval error.")
        return

    questions = generate_questions(transcript)
    # Accept only a dict carrying a non-empty 'questions' list; anything
    # else (None, missing key, wrong type, no questions) counts as failure
    # rather than raising or writing an empty file.
    question_list = questions.get('questions') if isinstance(questions, dict) else None
    if not isinstance(question_list, list) or not question_list:
        print("Failed to generate questions.")
        return

    df = pd.DataFrame(
        [(video_id, q) for q in question_list],
        columns=['video_id', 'question'],
    )

    # A bare filename has no directory component; makedirs('') would raise.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Ground truth data saved to {output_path}")


if __name__ == "__main__":
    main()