File size: 4,648 Bytes
9ba2a1c
a67942c
9ba2a1c
 
 
bce555f
9ba2a1c
 
a5bf333
9ba2a1c
c3e624b
9ba2a1c
bce555f
3b5175f
9ba2a1c
 
bce555f
9ba2a1c
bce555f
9ba2a1c
 
bce555f
9ba2a1c
 
 
57bf973
9ba2a1c
 
a67942c
bce555f
 
 
 
 
 
 
 
 
 
 
51499e8
9334a23
 
 
891b8fc
a67942c
891b8fc
 
 
9334a23
891b8fc
 
 
 
9334a23
891b8fc
9334a23
 
 
 
 
891b8fc
 
 
bce555f
9334a23
 
 
bce555f
9ba2a1c
bce555f
 
 
 
 
 
 
42f6a29
9ba2a1c
 
bce555f
 
 
 
9ba2a1c
bce555f
9ba2a1c
 
bce555f
9ba2a1c
 
 
 
bce555f
9ba2a1c
bce555f
 
 
 
9ba2a1c
bce555f
9ba2a1c
bce555f
 
 
 
 
9ba2a1c
bce555f
 
 
 
 
 
9ba2a1c
 
 
bce555f
9ba2a1c
 
 
 
 
a5bf333
 
9ba2a1c
 
42f6a29
bce555f
 
9ba2a1c
 
 
 
 
bce555f
a5bf333
9ba2a1c
 
42f6a29
bce555f
 
9ba2a1c
 
 
bce555f
9ba2a1c
a5bf333
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from urllib.parse import urlparse, parse_qs

import tempfile
import time
import os
import numpy as np

# Constants
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"  # model id handed to pipeline() below
BATCH_SIZE = 8  # batch_size forwarded on every pipe(...) call
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 60 * 60  # 1 hour limit

# Device selection: CUDA device index 0 when a GPU is visible, otherwise the CPU
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Load Whisper pipeline once at import time; every request handler reuses `pipe`
pipe = pipeline(
    "automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)

# Extract YouTube Video ID
def _extract_yt_video_id(yt_url):
    parsed_url = urlparse(yt_url)
    return parse_qs(parsed_url.query).get("v", [""])[0]

# Embed YouTube Video in HTML
def _return_yt_html_embed(yt_url):
    """Build the HTML snippet that embeds the YouTube player for ``yt_url``.

    Args:
        yt_url: YouTube URL supplied by the user.

    Returns:
        A ``<center><iframe ...></iframe></center>`` string whose ``src``
        points at ``https://www.youtube.com/embed/<video id>``.

    Raises:
        gr.Error: if no video ID can be extracted from ``yt_url``.
    """
    # Function-scope import: `quote` is only needed here and is not part of
    # the module-level imports.
    from urllib.parse import quote

    video_id = _extract_yt_video_id(yt_url)
    if not video_id:
        raise gr.Error("Invalid YouTube URL. Please check and try again.")

    # The ID is taken verbatim from user input and ends up inside an HTML
    # attribute. Percent-encode it so characters such as '"', '<' or '>'
    # cannot break out of the attribute; IDs made of letters, digits, '-'
    # and '_' pass through unchanged.
    safe_video_id = quote(video_id, safe="")
    return f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{safe_video_id}"></iframe> </center>'

# Transcription function (Fix applied)
def transcribe(audio_file, task):
    """Run the Whisper pipeline on an audio file and return the recognised text.

    Args:
        audio_file: Path of the audio file to process, or None when nothing
            was submitted.
        task: Value forwarded to generation as ``generate_kwargs["task"]``
            (the UI offers "transcribe" and "translate").

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: if ``audio_file`` is None.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")

    # Pull the whole file in as bytes; ffmpeg_read works on a byte payload.
    with open(audio_file, "rb") as audio_handle:
        raw_bytes = audio_handle.read()

    # Decode at the sampling rate the pipeline's feature extractor expects.
    target_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, target_rate)

    # Dict form accepted by the pipeline: samples plus their sampling rate.
    pipeline_input = {
        "raw": np.array(waveform),
        "sampling_rate": pipe.feature_extractor.sampling_rate,
    }

    transcription = pipe(
        pipeline_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return transcription["text"]
# Download YouTube audio
def download_yt_audio(yt_url, filename):
    """Download the audio track of a YouTube video to ``filename`` as MP3.

    The video's metadata is fetched first (without downloading) so that
    videos longer than ``YT_LENGTH_LIMIT_S`` seconds are rejected before any
    media is transferred.

    Args:
        yt_url: URL of the video to download.
        filename: Output template handed to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: if the reported duration exceeds ``YT_LENGTH_LIMIT_S``, or
            if yt-dlp raises a ``DownloadError`` (its message is surfaced to
            the user).
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": filename,
        "postprocessors": [
            {"key": "FFmpegExtractAudio", "preferredcodec": "mp3", "preferredquality": "192"}
        ],
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(yt_url, download=False)
            # "duration" can be absent *or* present with a None value, and may
            # be a float. `or 0` covers both missing cases (a plain
            # .get(..., 0) would return None and crash the comparison below);
            # int() keeps the h/m/s message free of ".0" artefacts.
            file_length_s = int(info.get("duration") or 0)  # Duration in seconds
            if file_length_s > YT_LENGTH_LIMIT_S:
                raise gr.Error(f"Maximum YouTube length is 1 hour. Your video is {file_length_s // 3600}h {file_length_s % 3600 // 60}m {file_length_s % 60}s.")
            ydl.download([yt_url])
        except youtube_dl.utils.DownloadError as err:
            # Chain the original exception so the traceback keeps the cause.
            raise gr.Error(str(err)) from err

# YouTube transcription function
def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Download a YouTube video's audio and run the Whisper pipeline on it.

    Args:
        yt_url: URL of the YouTube video.
        task: Value forwarded to generation as ``generate_kwargs["task"]``
            (the UI offers "transcribe" and "translate").
        max_filesize: Largest accepted size of the downloaded audio file,
            in megabytes.

    Returns:
        A ``(html_embed, text)`` tuple: the HTML snippet embedding the video
        player and the ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: if the URL is invalid, the download fails, the video is too
            long (all raised by the helpers called here), or the downloaded
            file exceeds ``max_filesize``.
    """
    html_embed_str = _return_yt_html_embed(yt_url)

    # The temporary directory (and the downloaded file in it) is removed as
    # soon as the audio has been decoded into memory.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "audio.mp3")
        download_yt_audio(yt_url, filepath)

        if os.path.getsize(filepath) > max_filesize * 1024 * 1024:
            raise gr.Error(f"File too large! Max allowed size is {max_filesize}MB.")

        sampling_rate = pipe.feature_extractor.sampling_rate
        with open(filepath, "rb") as f:
            audio_array = ffmpeg_read(f.read(), sampling_rate)

    # The ASR pipeline takes the audio dict itself, with the samples under
    # "raw" (or "array") next to "sampling_rate". Wrapping that dict under an
    # "input_features" key, as was done before, fails the pipeline's input
    # validation. No attention mask is supplied: one entry per raw audio
    # sample is not something the pipeline's dict input defines.
    inputs = {
        "raw": audio_array,
        "sampling_rate": sampling_rate,
    }

    text = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task, "forced_decoder_ids": None},
        return_timestamps=True,
    )["text"]

    return html_embed_str, text

# Gradio UI
# Top-level container; the two interfaces below are mounted into it as tabs.
demo = gr.Blocks()

# Tab 1: transcribe an audio file. The audio component uses type="filepath",
# matching transcribe(), which opens its first argument as a path.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
    flagging_mode="never",
)

# Tab 2: transcribe a YouTube video. Only two inputs are wired up, so the
# function's max_filesize parameter keeps its default value.
# NOTE: this assignment rebinds the module-level name `yt_transcribe` from the
# function defined above to the Interface object. `fn=yt_transcribe` is
# evaluated before the rebinding, so the Interface still wraps the function,
# but after this statement the name no longer refers to the function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste YouTube URL here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task")
    ],
    outputs=["html", "text"],
    title="Whisper Large V3: Transcribe YouTube",
    description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
    flagging_mode="never",
)

# Mount both interfaces as tabs inside the container.
with demo:
    gr.TabbedInterface([file_transcribe, yt_transcribe], ["Audio file", "YouTube"])

# Runs unconditionally at module level, so importing this file also starts the app.
demo.launch()