File size: 4,501 Bytes
a6eae86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import gradio as gr
import torch
import yt_dlp
import os
import subprocess
import json
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import time
import langdetect
import uuid

HF_TOKEN = os.environ.get("HF_TOKEN")
print("Starting the program...")

model_path = "Qwen/Qwen2.5-7B-Instruct"
print(f"Loading model {model_path}...")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True).cuda()
model = model.eval()
print("Model successfully loaded.")

def generate_unique_filename(extension):
    return f"{uuid.uuid4()}{extension}"

def cleanup_files(*files):
    for file in files:
        if file and os.path.exists(file):
            os.remove(file)
            print(f"Removed file: {file}")

def download_youtube_audio(url):
    print(f"Downloading audio from YouTube: {url}")
    output_path = generate_unique_filename(".wav")
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'outtmpl': output_path,
        'keepvideo': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    
    if os.path.exists(output_path + ".wav"):
        os.rename(output_path + ".wav", output_path)
    
    return output_path

@spaces.GPU(duration=90)
def transcribe_audio(file_path):
    print(f"Starting transcription of file: {file_path}")
    temp_audio = None
    if file_path.endswith(('.mp4', '.avi', '.mov', '.flv')):
        print("Video file detected. Extracting audio using ffmpeg...")
        temp_audio = generate_unique_filename(".wav")
        command = ["ffmpeg", "-i", file_path, "-q:a", "0", "-map", "a", temp_audio]
        subprocess.run(command, check=True)
        file_path = temp_audio
    
    output_file = generate_unique_filename(".json")
    command = [
        "insanely-fast-whisper",
        "--file-name", file_path,
        "--device-id", "0",
        "--model-name", "openai/whisper-large-v3",
        "--task", "transcribe",
        "--timestamp", "chunk",
        "--transcript-path", output_file
    ]
    subprocess.run(command, check=True)
    
    with open(output_file, "r") as f:
        transcription = json.load(f)
    
    result = transcription.get("text", " ".join([chunk["text"] for chunk in transcription.get("chunks", [])]))
    
    cleanup_files(output_file)
    if temp_audio:
        cleanup_files(temp_audio)
    
    return result

def generate_summary_stream(transcription):
    detected_language = langdetect.detect(transcription)
    prompt = f"""Summarize the following video transcription in 150-300 words in {detected_language}:
    {transcription[:300000]}..."""
    
    response, history = model.chat(tokenizer, prompt, history=[])
    return response

def process_youtube(url):
    if not url:
        return "Please enter a YouTube URL.", None
    audio_file = download_youtube_audio(url)
    transcription = transcribe_audio(audio_file)
    cleanup_files(audio_file)
    return transcription, None

def process_uploaded_video(video_path):
    transcription = transcribe_audio(video_path)
    return transcription, None

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎥 Video Transcription and Smart Summary
    Upload a video or provide a YouTube link to get a transcription and AI-generated summary.
    """)
    
    with gr.Tabs():
        with gr.TabItem("📤 Video Upload"):
            video_input = gr.Video()
            video_button = gr.Button("🚀 Process Video")
        
        with gr.TabItem("🔗 YouTube Link"):
            url_input = gr.Textbox(placeholder="https://www.youtube.com/watch?v=...")
            url_button = gr.Button("🚀 Process URL")
    
    transcription_output = gr.Textbox(label="📝 Transcription", lines=10, show_copy_button=True)
    summary_output = gr.Textbox(label="📊 Summary", lines=10, show_copy_button=True)
    summary_button = gr.Button("📝 Generate Summary")
    
    video_button.click(process_uploaded_video, inputs=[video_input], outputs=[transcription_output, summary_output])
    url_button.click(process_youtube, inputs=[url_input], outputs=[transcription_output, summary_output])
    summary_button.click(generate_summary_stream, inputs=[transcription_output], outputs=[summary_output])

demo.launch()