Spaces:

ganesh3
/

rag-youtube-assistant

Running

App Files Files Community

rag-youtube-assistant / app /generate_ground_truth.py

ganesh3

first modification

dbd33b2 9 months ago

raw

history blame

2.26 kB

	import os
	import pandas as pd
	import json
	from youtube_transcript_api import YouTubeTranscriptApi
	from tqdm import tqdm
	import requests

	# Location of the Ollama server; each value can be overridden through the
	# environment variable of the same name and is kept as a string.
	OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'localhost')
	OLLAMA_PORT = os.environ.get('OLLAMA_PORT', '11434')

	def get_transcript(video_id):
	    """Return the transcript of a YouTube video as a single string.

	    The ``'text'`` field of every transcript entry is joined with single
	    spaces. Any exception raised while fetching or joining is reported on
	    stdout and turned into a ``None`` return value.
	    """
	    try:
	        segments = YouTubeTranscriptApi.get_transcript(video_id)
	        pieces = []
	        for segment in segments:
	            pieces.append(segment['text'])
	        return " ".join(pieces)
	    except Exception as e:
	        # Best-effort: callers treat None as "no transcript available".
	        print(f"Error extracting transcript for video {video_id}: {str(e)}")
	        return None

	def generate_questions(transcript):
	    """Ask the Ollama ``phi3.5`` model for questions about a transcript.

	    The transcript is embedded in a fixed prompt and posted to the
	    ``/api/generate`` endpoint at ``OLLAMA_HOST:OLLAMA_PORT``.

	    Args:
	        transcript: Transcript text to insert into the prompt.

	    Returns:
	        The decoded model output, a dict whose ``"questions"`` entry is a
	        list, or ``None`` when the request fails, the server answers with
	        a non-200 status, or the model output is not JSON of that shape.
	        Every failure is reported on stdout.
	    """
	    prompt_template = """
	You are an AI assistant tasked with generating questions based on a YouTube video transcript.
	Formulate 10 questions that a user might ask based on the provided transcript.
	Make the questions specific to the content of the transcript.
	The questions should be complete and not too short. Use as few words as possible from the transcript.

	The transcript:

	{transcript}

	Provide the output in parsable JSON without using code blocks:

	{{"questions": ["question1", "question2", ..., "question10"]}}
	""".strip()

	    prompt = prompt_template.format(transcript=transcript)

	    try:
	        response = requests.post(
	            f'http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/generate',
	            json={
	                'model': 'phi3.5',
	                'prompt': prompt,
	                # The endpoint streams newline-delimited JSON chunks by
	                # default, which response.json() cannot decode; request a
	                # single response object instead.
	                'stream': False,
	                # Ask the server to constrain the model output to valid JSON.
	                'format': 'json',
	            },
	        )
	    except requests.RequestException as e:
	        # Unreachable server, DNS failure, etc.: report and signal failure
	        # the same way as a bad status code.
	        print(f"Error: request to Ollama failed - {e}")
	        return None

	    if response.status_code != 200:
	        print(f"Error: {response.status_code} - {response.text}")
	        return None

	    try:
	        parsed = json.loads(response.json()['response'])
	    except (ValueError, KeyError, TypeError) as e:
	        # ValueError covers JSON decoding errors from both the HTTP body
	        # and the model's own text; KeyError/TypeError cover a body that
	        # lacks a usable 'response' field.
	        print(f"Error: could not parse model output as JSON - {e}")
	        return None

	    # The caller indexes result['questions'], so reject any other shape here.
	    if not isinstance(parsed, dict) or not isinstance(parsed.get('questions'), list):
	        print("Error: model output does not contain a 'questions' list")
	        return None

	    return parsed

	def main(video_id="zjkBMFhNj_g"):
	    """Generate ground-truth retrieval questions for one video and save them.

	    Fetches the transcript, asks the model for questions, and writes one
	    ``(video_id, question)`` row per question to
	    ``data/ground-truth-retrieval.csv``. Failures at either step are
	    reported on stdout and end the run without writing a file.

	    Args:
	        video_id: YouTube video id to process; defaults to the id the
	            script was originally written for.
	    """
	    transcript = get_transcript(video_id)
	    if not transcript:
	        print("Failed to generate ground truth data due to transcript retrieval error.")
	        return

	    questions = generate_questions(transcript)
	    # Model output is untrusted: accept only a dict holding a list of
	    # questions, rather than raising KeyError/TypeError on any other shape.
	    question_list = questions.get('questions') if isinstance(questions, dict) else None
	    if not questions or not isinstance(question_list, list):
	        print("Failed to generate questions.")
	        return

	    df = pd.DataFrame(
	        [(video_id, q) for q in question_list],
	        columns=['video_id', 'question'],
	    )

	    os.makedirs('data', exist_ok=True)
	    df.to_csv('data/ground-truth-retrieval.csv', index=False)
	    print("Ground truth data saved to data/ground-truth-retrieval.csv")

	# Run the ground-truth generation only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	main()