File size: 2,263 Bytes
dbd33b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import pandas as pd
import json
from youtube_transcript_api import YouTubeTranscriptApi
from tqdm import tqdm
import requests

# Network location of the Ollama server. Each value is read from the
# environment variable of the same name, falling back to the default shown.
OLLAMA_HOST = os.environ.get('OLLAMA_HOST', 'localhost')
OLLAMA_PORT = os.environ.get('OLLAMA_PORT', '11434')

def get_transcript(video_id):
    """Return the transcript of a YouTube video as a single string.

    Args:
        video_id: Identifier handed to ``YouTubeTranscriptApi.get_transcript``.

    Returns:
        The ``'text'`` value of every transcript entry joined with single
        spaces, or ``None`` if any exception was raised while fetching or
        joining (the error is printed, not propagated).
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        texts = (segment['text'] for segment in segments)
        return " ".join(texts)
    except Exception as e:
        # Best-effort: report the problem and let the caller handle None.
        print(f"Error extracting transcript for video {video_id}: {str(e)}")
        return None

def generate_questions(transcript):
    """Ask the Ollama ``phi3.5`` model for ten questions about a transcript.

    The transcript is inserted into a fixed prompt and posted to the
    ``/api/generate`` endpoint at ``OLLAMA_HOST``:``OLLAMA_PORT``.

    Args:
        transcript: Transcript text to substitute into the prompt.

    Returns:
        The JSON value decoded from the model's answer (the prompt asks for
        the shape ``{"questions": [...]}``), or ``None`` when the request
        cannot be sent, the server replies with a non-200 status, or the
        reply cannot be decoded. Failures are printed rather than raised.
    """
    prompt_template = """
    You are an AI assistant tasked with generating questions based on a YouTube video transcript.
    Formulate 10 questions that a user might ask based on the provided transcript.
    Make the questions specific to the content of the transcript.
    The questions should be complete and not too short. Use as few words as possible from the transcript.

    The transcript:

    {transcript}

    Provide the output in parsable JSON without using code blocks:

    {{"questions": ["question1", "question2", ..., "question10"]}}
    """.strip()

    prompt = prompt_template.format(transcript=transcript)

    try:
        response = requests.post(
            f'http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/generate',
            json={
                'model': 'phi3.5',
                'prompt': prompt,
                # The endpoint streams newline-delimited JSON chunks by
                # default, which response.json() cannot parse; ask for one
                # complete response object instead.
                'stream': False,
                # Have the server constrain the model output to valid JSON.
                'format': 'json',
            },
        )
    except requests.RequestException as e:
        # Server unreachable, connection dropped, etc.
        print(f"Error contacting Ollama: {str(e)}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    try:
        # The HTTP body is JSON whose 'response' field holds the model's
        # text, which is itself expected to be a JSON document.
        return json.loads(response.json()['response'])
    except (ValueError, KeyError, TypeError) as e:
        # Malformed body, missing 'response' field, or non-JSON model text.
        print(f"Error parsing model output: {str(e)}")
        return None

def main(video_id="zjkBMFhNj_g", output_path='data/ground-truth-retrieval.csv'):
    """Generate questions for one video and save them as a CSV file.

    Fetches the transcript, asks the model for questions, and writes one
    ``(video_id, question)`` row per question.

    Args:
        video_id: YouTube video identifier to process. Defaults to the video
            this script was originally written for.
        output_path: Destination CSV path; its parent directory is created
            if it does not exist.

    Returns:
        None. Progress and failure messages are printed.
    """
    transcript = get_transcript(video_id)
    if not transcript:
        print("Failed to generate ground truth data due to transcript retrieval error.")
        return

    questions = generate_questions(transcript)
    # The model is asked for {"questions": [...]} but may answer with another
    # shape; validate before indexing so a bad answer is reported, not raised.
    question_list = questions.get('questions') if isinstance(questions, dict) else None
    if not question_list:
        print("Failed to generate questions.")
        return

    df = pd.DataFrame(
        [(video_id, q) for q in question_list],
        columns=['video_id', 'question'],
    )

    # A bare filename has an empty dirname; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    df.to_csv(output_path, index=False)
    print(f"Ground truth data saved to {output_path}")
        
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()