import gradio as gr
import torch
import yt_dlp
import os
import subprocess
import json
import time
import langdetect
import uuid
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load Hugging Face Token
HF_TOKEN = os.getenv("HF_TOKEN")
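# Note: the token is optional here; Qwen/Qwen2.5-7B-Instruct is a public checkpoint,
# so HF_TOKEN would only be needed (passed as token=... to from_pretrained) for gated models.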

print("Starting the program...")
model_path = "Qwen/Qwen2.5-7B-Instruct"

# **Efficient Model Loading**
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# 8-bit quantization needs bitsandbytes on a CUDA device; fall back to full precision on CPU
bnb_config = BitsAndBytesConfig(load_in_8bit=True) if device == "cuda" else None

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    quantization_config=bnb_config,  # Load in 8-bit to save memory
    device_map="auto",  # 8-bit models cannot be moved with .to(), so let accelerate place the weights
    trust_remote_code=True,
).eval()
print("Model successfully loaded.")

def generate_unique_filename(extension):
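    """Return a random file name with the given extension, e.g. '3f2a...-....wav'."""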
    return f"{uuid.uuid4()}{extension}"

def cleanup_files(*files):
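    """Delete the given temporary files if they exist."""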
    for file in files:
        if file and os.path.exists(file):
            os.remove(file)
            print(f"Removed file: {file}")

def download_youtube_audio(url):
    """Downloads audio from a YouTube video and converts it to WAV format."""
    print(f"Downloading audio from YouTube: {url}")
    base_name = str(uuid.uuid4())
    output_path = base_name + ".wav"

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        # Let yt-dlp choose the download extension; the postprocessor then renames the file to .wav
        'outtmpl': base_name + '.%(ext)s',
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except Exception as e:
        return f"Error downloading audio: {str(e)}"

    return output_path if os.path.exists(output_path) else "Download Failed"

def transcribe_audio(file_path):
    """Transcribes audio using `insanely-fast-whisper` and handles large files efficiently."""
    print(f"Starting transcription of file: {file_path}")
    temp_audio = None

    if file_path.endswith(('.mp4', '.avi', '.mov', '.flv')):
        print("Video file detected. Extracting audio using ffmpeg...")
        temp_audio = generate_unique_filename(".wav")
        command = ["ffmpeg", "-i", file_path, "-q:a", "0", "-map", "a", temp_audio]
        subprocess.run(command, check=True)
        file_path = temp_audio  # Use extracted audio file

    output_file = generate_unique_filename(".json")
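    # Shell out to the insanely-fast-whisper CLI: it runs Whisper large-v3 on GPU 0
    # and writes the transcript to output_file as a single JSON document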
    command = [
        "insanely-fast-whisper",
        "--file-name", file_path,
        "--device-id", "0",
        "--model-name", "openai/whisper-large-v3",
        "--task", "transcribe",
        "--timestamp", "chunk",
        "--transcript-path", output_file
    ]

    try:
        subprocess.run(command, check=True)
    except Exception as e:
        return f"Error in transcription: {str(e)}"
    
    # insanely-fast-whisper writes a single JSON document with "text" and "chunks" keys;
    # load it once and join the chunk texts (falling back to the full "text" field)
    result = []
    try:
        with open(output_file, "r") as f:
            data = json.load(f)
        for chunk in data.get("chunks", []):
            result.append(chunk.get("text", ""))
        if not result:
            result.append(data.get("text", ""))
    except Exception as e:
        return f"Error reading transcription file: {str(e)}"

    cleanup_files(output_file)
    if temp_audio:
        cleanup_files(temp_audio)
    
    return " ".join(result)[:500000]  # Limit transcription size

def generate_summary_stream(transcription):
    """Summarizes the transcription efficiently to avoid memory overflow."""
    if not transcription:
        return "No transcription available to summarize."

    detected_language = langdetect.detect(transcription[:1000])  # Detect using a smaller portion

    # Use smaller chunks so each prompt stays well within the context window
    chunk_size = 2000
    transcript_chunks = [transcription[i:i + chunk_size] for i in range(0, len(transcription), chunk_size)]
    summary_result = []

    for chunk in transcript_chunks[:3]:  # Process only the first 3 chunks to avoid OOM
        prompt = f"Summarize the following video transcription in 150-300 words in {detected_language}:\n{chunk}"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        output_ids = model.generate(**inputs, max_new_tokens=300)  # Cap the generated tokens, not the total length
        # Decode only the newly generated tokens, skipping the prompt that was fed in
        response = tokenizer.decode(output_ids[0][inputs.input_ids.shape[-1]:], skip_special_tokens=True)
        summary_result.append(response)

    return "\n\n".join(summary_result)

def process_youtube(url):
    """Handles YouTube video processing: downloads audio, transcribes it, and cleans up."""
    if not url:
        return "Please enter a YouTube URL.", None

    audio_file = download_youtube_audio(url)
    if "Error" in audio_file or audio_file == "Download Failed":
        return audio_file, None

    transcription = transcribe_audio(audio_file)
    cleanup_files(audio_file)  # Clean up the downloaded file
    return transcription, None

def process_uploaded_video(video_path):
    """Processes an uploaded video file for transcription."""
    if not video_path:
        return "Please upload a video file.", None

    transcription = transcribe_audio(video_path)
    return transcription, None

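# Gradio UI: two input tabs (file upload and YouTube URL) feed the shared transcription box;
# the summary button then runs the LLM over whatever text is in that box.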
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎥 Video Transcription and Smart Summary
    Upload a video or provide a YouTube link to get a transcription and AI-generated summary.
    """)

    with gr.Tabs():
        with gr.TabItem("📤 Video Upload"):
            video_input = gr.Video()
            video_button = gr.Button("🚀 Process Video")
        
        with gr.TabItem("🔗 YouTube Link"):
            url_input = gr.Textbox(placeholder="https://www.youtube.com/watch?v=...")
            url_button = gr.Button("🚀 Process URL")

    transcription_output = gr.Textbox(label="📝 Transcription", lines=10, show_copy_button=True)
    summary_output = gr.Textbox(label="📊 Summary", lines=10, show_copy_button=True)
    summary_button = gr.Button("📝 Generate Summary")

    video_button.click(process_uploaded_video, inputs=[video_input], outputs=[transcription_output, summary_output])
    url_button.click(process_youtube, inputs=[url_input], outputs=[transcription_output, summary_output])
    summary_button.click(generate_summary_stream, inputs=[transcription_output], outputs=[summary_output])

demo.launch()
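
# To run locally (assumptions: ffmpeg is installed separately and this file is saved as app.py):
#   pip install gradio torch transformers accelerate bitsandbytes yt-dlp langdetect insanely-fast-whisper
#   python app.py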