Spaces:
Running
Running
import os | |
import pandas as pd | |
import json | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from tqdm import tqdm | |
import requests | |
# Address of the Ollama server. Each part is read from the environment and
# falls back to the local default when the variable is not set.
OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'localhost')
OLLAMA_PORT = os.environ.get('OLLAMA_PORT', '11434')
def get_transcript(video_id):
    """Return the transcript of a YouTube video as a single string.

    The ``'text'`` field of every transcript entry is joined with single
    spaces.

    Args:
        video_id: Identifier of the video passed to
            ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The joined transcript text, or ``None`` if anything goes wrong while
        fetching or assembling it (the error is printed).
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(segment['text'] for segment in segments)
    except Exception as err:
        # Best effort: report the problem and let the caller handle None.
        print(f"Error extracting transcript for video {video_id}: {str(err)}")
        return None
def generate_questions(transcript, model='phi3.5'):
    """Ask an Ollama model for ten questions about a video transcript.

    The prompt is posted to the ``/api/generate`` endpoint of the server at
    ``OLLAMA_HOST:OLLAMA_PORT``.

    Args:
        transcript: Transcript text to base the questions on.
        model: Name of the Ollama model to query. Defaults to ``'phi3.5'``.

    Returns:
        The decoded JSON object produced by the model, shaped like
        ``{"questions": [...]}``. ``None`` is returned (after printing the
        reason) when the server cannot be reached, answers with a non-200
        status, or the model output is not a JSON object holding a
        ``"questions"`` list.
    """
    prompt_template = """
You are an AI assistant tasked with generating questions based on a YouTube video transcript.
Formulate 10 questions that a user might ask based on the provided transcript.
Make the questions specific to the content of the transcript.
The questions should be complete and not too short. Use as few words as possible from the transcript.
The transcript:
{transcript}
Provide the output in parsable JSON without using code blocks:
{{"questions": ["question1", "question2", ..., "question10"]}}
""".strip()
    prompt = prompt_template.format(transcript=transcript)

    url = f'http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/generate'
    payload = {
        'model': model,
        'prompt': prompt,
        # /api/generate streams newline-delimited JSON chunks by default,
        # which response.json() cannot decode; request one complete document.
        'stream': False,
        # Constrain the model to emit valid JSON so the reply can be parsed.
        'format': 'json',
    }

    try:
        response = requests.post(url, json=payload)
    except requests.RequestException as e:
        # An unreachable server is reported the same way as any other
        # failure: print the reason and return None.
        print(f"Error contacting Ollama at {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        questions = json.loads(response.json()['response'])
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers both a non-JSON HTTP body and non-JSON model text.
        print(f"Error parsing model output: {e}")
        return None

    # Callers index result['questions'], so reject any other shape here.
    if not isinstance(questions, dict) or not isinstance(questions.get('questions'), list):
        print("Error: model output does not contain a 'questions' list")
        return None
    return questions
def main(video_id="zjkBMFhNj_g"):
    """Build the ground-truth question CSV for one YouTube video.

    Fetches the transcript with ``get_transcript``, asks
    ``generate_questions`` for questions about it, and writes one
    ``(video_id, question)`` row per question to
    ``data/ground-truth-retrieval.csv`` (creating ``data/`` if needed).
    A message is printed on success and on each failure path.

    Args:
        video_id: YouTube video to process. Defaults to the id that was
            previously hard-coded in this script.
    """
    transcript = get_transcript(video_id)
    if not transcript:
        print("Failed to generate ground truth data due to transcript retrieval error.")
        return

    questions = generate_questions(transcript)
    # The model's reply is free-form JSON, so confirm it is an object with a
    # list under "questions" before indexing into it; anything else is
    # treated as a generation failure instead of raising KeyError/TypeError.
    question_list = questions.get('questions') if isinstance(questions, dict) else None
    if not isinstance(question_list, list):
        print("Failed to generate questions.")
        return

    df = pd.DataFrame(
        [(video_id, q) for q in question_list],
        columns=['video_id', 'question'],
    )
    os.makedirs('data', exist_ok=True)
    df.to_csv('data/ground-truth-retrieval.csv', index=False)
    print("Ground truth data saved to data/ground-truth-retrieval.csv")


if __name__ == "__main__":
    main()