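"""Gradio demo app: transcribe microphone recordings, uploaded audio files, and
YouTube videos with openai/whisper-large-v3."""
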
import torch
import gradio as gr
import yt_dlp as youtube_dl
import librosa  # used to load and resample audio; decoding mp3/m4a requires ffmpeg
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import tempfile
import os
import time

# Constants
MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
FILE_LIMIT_MB = 25  # File size limit in MB
YT_LENGTH_LIMIT_S = 3600  # 1 hour YouTube file limit

# Device configuration (first GPU if available, otherwise CPU)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load Whisper model and processor
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)

def transcribe_audio(inputs):
    """Transcribe an audio file (given as a filepath) using the Whisper model."""
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Check file size (max 25 MB)
    if os.path.getsize(inputs) > FILE_LIMIT_MB * 1024 * 1024:
        raise gr.Error(f"File size exceeds {FILE_LIMIT_MB}MB limit.")

    # Load the audio and resample it to the 16 kHz rate Whisper expects
    waveform, _ = librosa.load(inputs, sr=16000)

    # Convert the waveform into log-mel spectrogram input features
    input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features.to(device)

    # Generate token ids and decode them into text
    predicted_ids = model.generate(input_features, max_length=448)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
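
# NOTE: BATCH_SIZE above is not used by transcribe_audio(), which handles a single
# (up to 30-second) input per call. A sketch of how it is typically used for
# long-form audio, via the transformers ASR pipeline with 30-second chunking
# (an alternative to the manual generate() call above, not wired into this app):
#
#   from transformers import pipeline
#   asr_pipe = pipeline(
#       "automatic-speech-recognition",
#       model=MODEL_NAME,
#       chunk_length_s=30,
#       device=device,
#   )
#   transcription = asr_pipe(filepath, batch_size=BATCH_SIZE)["text"]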

def _return_yt_html_embed(yt_url):
    """Return YouTube embed HTML for display."""
    video_id = yt_url.split("?v=")[-1]
    html_embed = f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'
    return html_embed

def download_yt_audio(yt_url, filename):
    """Download audio from a YouTube URL."""
    info_loader = youtube_dl.YoutubeDL()
    
    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(f"Download error: {str(err)}")
    
    # Check the video length against the limit
    file_length_s = int(info.get("duration", 0))

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube video length is {yt_length_limit_hms}, but this video is {file_length_hms} long.")
    
    # Download the video
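    # The format string prefers the smallest video stream combined with the best
    # m4a audio, since only the audio track matters for transcription.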
    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except youtube_dl.utils.ExtractorError as err:
            raise gr.Error(f"Error while downloading video: {str(err)}")

def yt_transcribe(yt_url):
    """Transcribe the audio track of a YouTube video using the Whisper model."""
    html_embed = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)

        # Transcribe while the temporary file still exists
        transcription = transcribe_audio(filepath)

    return html_embed, transcription

# Create Gradio interface
demo = gr.Blocks()

# Microphone transcription interface
mf_transcribe = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Transcription (Microphone)",
    description="Transcribe audio from your microphone. File size limit is 25MB."
)

# File upload transcription interface
file_transcribe = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Transcription (File)",
    description="Upload an audio file to transcribe. File size limit is 25MB."
)

# YouTube video transcription interface
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.inputs.Textbox(lines=1, placeholder="Paste YouTube URL", label="YouTube URL"),
    ],
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Free Transcript Maker",
    description="Upload an audio file (WAV, MP3, etc.) up to 25MB to get its transcription. The transcript will be displayed and available for download. Please use responsibly."
)

with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

demo.launch(enable_queue=True)