mrmuminov committed on
Commit
bce555f
·
verified ·
1 Parent(s): a5bf333

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -71
app.py CHANGED
@@ -1,21 +1,24 @@
1
  import torch
2
-
3
  import gradio as gr
4
  import yt_dlp as youtube_dl
5
  from transformers import pipeline
6
  from transformers.pipelines.audio_utils import ffmpeg_read
 
7
 
8
  import tempfile
9
  import time
10
  import os
11
 
 
12
  MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
13
  BATCH_SIZE = 8
14
  FILE_LIMIT_MB = 1000
15
- YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
16
 
 
17
  device = 0 if torch.cuda.is_available() else "cpu"
18
 
 
19
  pipe = pipeline(
20
  task="automatic-speech-recognition",
21
  model=MODEL_NAME,
@@ -23,89 +26,84 @@ pipe = pipeline(
23
  device=device,
24
  )
25
 
 
 
 
 
 
 
 
 
 
 
 
26
 
 
27
  def transcribe(inputs, task):
28
  if inputs is None:
29
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
30
 
31
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
32
- return text
33
-
34
-
35
- def _return_yt_html_embed(yt_url):
36
- video_id = yt_url.split("?v=")[-1]
37
- HTML_str = (
38
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
39
- " </center>"
40
- )
41
- return HTML_str
42
 
 
43
  def download_yt_audio(yt_url, filename):
44
- info_loader = youtube_dl.YoutubeDL()
45
-
46
- try:
47
- info = info_loader.extract_info(yt_url, download=False)
48
- except youtube_dl.utils.DownloadError as err:
49
- raise gr.Error(str(err))
50
-
51
- file_length = info["duration_string"]
52
- file_h_m_s = file_length.split(":")
53
- file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
54
-
55
- if len(file_h_m_s) == 1:
56
- file_h_m_s.insert(0, 0)
57
- if len(file_h_m_s) == 2:
58
- file_h_m_s.insert(0, 0)
59
- file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
60
-
61
- if file_length_s > YT_LENGTH_LIMIT_S:
62
- yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
63
- file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
64
- raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
65
-
66
- ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
67
 
68
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
69
  try:
 
 
 
 
70
  ydl.download([yt_url])
71
- except youtube_dl.utils.ExtractorError as err:
72
  raise gr.Error(str(err))
73
 
74
-
75
  def yt_transcribe(yt_url, task, max_filesize=75.0):
76
  html_embed_str = _return_yt_html_embed(yt_url)
77
 
78
  with tempfile.TemporaryDirectory() as tmpdirname:
79
- filepath = os.path.join(tmpdirname, "video.mp4")
80
  download_yt_audio(yt_url, filepath)
 
 
 
 
81
  with open(filepath, "rb") as f:
82
- inputs = f.read()
83
 
84
- inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
85
- inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
 
 
 
86
 
87
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
 
 
 
 
 
88
 
89
  return html_embed_str, text
90
 
91
-
92
  demo = gr.Blocks()
93
 
94
- mf_transcribe = gr.Interface(
95
- fn=transcribe,
96
- inputs=[
97
- gr.Audio(type="filepath"),
98
- gr.Radio(["transcribe", "translate"], label="Task"),
99
- ],
100
- outputs="text",
101
- theme="huggingface",
102
- title="Whisper Large V3: Transcribe Audio",
103
- description=(
104
- "Whisper Large V3 fine-tuned for Uzbek language by Dataprizma"
105
- ),
106
- allow_flagging="never",
107
- )
108
-
109
  file_transcribe = gr.Interface(
110
  fn=transcribe,
111
  inputs=[
@@ -113,30 +111,24 @@ file_transcribe = gr.Interface(
113
  gr.Radio(["transcribe", "translate"], label="Task"),
114
  ],
115
  outputs="text",
116
- theme="huggingface",
117
  title="Whisper Large V3: Transcribe Audio",
118
- description=(
119
- "Whisper Large V3 fine-tuned for Uzbek language by Dataprizma"
120
- ),
121
- allow_flagging="never",
122
  )
123
 
124
  yt_transcribe = gr.Interface(
125
  fn=yt_transcribe,
126
  inputs=[
127
- gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
128
  gr.Radio(["transcribe", "translate"], label="Task")
129
  ],
130
  outputs=["html", "text"],
131
- theme="huggingface",
132
  title="Whisper Large V3: Transcribe YouTube",
133
- description=(
134
- "Whisper Large V3 fine-tuned for Uzbek language by Dataprizma"
135
- ),
136
- allow_flagging="never",
137
  )
138
 
139
  with demo:
140
- gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
141
 
142
  demo.launch()
 
1
  import torch
 
2
  import gradio as gr
3
  import yt_dlp as youtube_dl
4
  from transformers import pipeline
5
  from transformers.pipelines.audio_utils import ffmpeg_read
6
+ from urllib.parse import urlparse, parse_qs
7
 
8
  import tempfile
9
  import time
10
  import os
11
 
12
+ # Constants
13
  MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
14
  BATCH_SIZE = 8
15
  FILE_LIMIT_MB = 1000
16
+ YT_LENGTH_LIMIT_S = 3600 # 1 hour limit
17
 
18
+ # Device selection
19
  device = 0 if torch.cuda.is_available() else "cpu"
20
 
21
+ # Load Whisper pipeline
22
  pipe = pipeline(
23
  task="automatic-speech-recognition",
24
  model=MODEL_NAME,
 
26
  device=device,
27
  )
28
 
29
+ # Extract YouTube Video ID
30
+ def _extract_yt_video_id(yt_url):
31
+ parsed_url = urlparse(yt_url)
32
+ return parse_qs(parsed_url.query).get("v", [""])[0]
33
+
34
# Embed YouTube Video in HTML
def _return_yt_html_embed(yt_url):
    """Build the HTML snippet that embeds the YouTube player for *yt_url*.

    Args:
        yt_url: YouTube URL supplied by the user.

    Returns:
        An HTML string containing a centered 500x320 ``<iframe>`` pointing at
        ``https://www.youtube.com/embed/<video id>``.

    Raises:
        gr.Error: if no video ID can be extracted from *yt_url*.
    """
    from urllib.parse import quote  # local import: only this helper needs it

    video_id = _extract_yt_video_id(yt_url)
    if not video_id:
        raise gr.Error("Invalid YouTube URL. Please check and try again.")

    # The ID comes from user input and has been percent-decoded by the URL
    # parser, so it may contain quotes or angle brackets. Percent-quote it
    # before placing it inside the src attribute so it cannot break out of
    # the attribute and inject markup. Well-formed IDs are left unchanged.
    safe_video_id = quote(video_id, safe="")
    return f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{safe_video_id}"></iframe> </center>'
40
 
41
# Transcription function
def transcribe(inputs, task):
    """Run the speech-recognition pipeline on a submitted audio input.

    Args:
        inputs: audio handed over by the Gradio audio component (presumably a
            file path; the component definition is not visible here), or
            None when nothing was submitted.
        task: value forwarded to generation as ``task``; the UI offers
            "transcribe" and "translate".

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: if *inputs* is None.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Hand the audio to the pipeline as-is. Wrapping it in
    # {"input_features": ...} is not an accepted input form: a dict input
    # must carry "sampling_rate" together with "raw" or "array".
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task, "forced_decoder_ids": None},
        return_timestamps=True,
    )
    return result["text"]
 
 
 
54
 
55
# Download YouTube audio
def download_yt_audio(yt_url, filename):
    """Download the audio of *yt_url* to *filename* using yt-dlp.

    The video metadata is fetched first so that over-long videos are rejected
    before anything is downloaded.

    Args:
        yt_url: URL of the video to fetch.
        filename: output template passed to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: if the reported duration exceeds ``YT_LENGTH_LIMIT_S``
            seconds, or if yt-dlp raises ``DownloadError`` while fetching
            metadata or downloading.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": filename,
        "postprocessors": [
            {"key": "FFmpegExtractAudio", "preferredcodec": "mp3", "preferredquality": "192"}
        ],
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err)) from err

        # "duration" may be missing or explicitly None (e.g. no known length);
        # treat both as 0 so the comparison below cannot raise TypeError.
        duration = (info or {}).get("duration") or 0
        if duration > YT_LENGTH_LIMIT_S:
            # Whole seconds only, so the message never shows "1.0h".
            file_length_s = int(duration)
            raise gr.Error(f"Maximum YouTube length is 1 hour. Your video is {file_length_s // 3600}h {file_length_s % 3600 // 60}m {file_length_s % 60}s.")

        try:
            ydl.download([yt_url])
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err)) from err
74
 
75
# YouTube transcription function
def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Download a YouTube video's audio and transcribe it.

    Args:
        yt_url: YouTube URL supplied by the user.
        task: value forwarded to generation as ``task``; the UI offers
            "transcribe" and "translate".
        max_filesize: maximum size of the downloaded file, in megabytes.

    Returns:
        A ``(html_embed_str, text)`` tuple: the player embed HTML for the
        video and the ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: if the URL is invalid, the download fails or is too long
            (raised by the helpers), or the downloaded file exceeds
            *max_filesize*.
    """
    html_embed_str = _return_yt_html_embed(yt_url)
    sampling_rate = pipe.feature_extractor.sampling_rate

    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "audio.mp3")
        download_yt_audio(yt_url, filepath)

        if os.path.getsize(filepath) > max_filesize * 1024 * 1024:
            raise gr.Error(f"File too large! Max allowed size is {max_filesize}MB.")

        # Decode to a waveform at the rate the feature extractor expects; the
        # decoded audio lives in memory, so the temp dir can be dropped after.
        with open(filepath, "rb") as f:
            audio = ffmpeg_read(f.read(), sampling_rate)

    # A dict input to the pipeline must carry exactly the waveform and its
    # sampling rate. The previous {"input_features": {...}} wrapper and the
    # hand-built per-sample "attention_mask" are not accepted input forms.
    inputs = {"array": audio, "sampling_rate": sampling_rate}

    text = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task, "forced_decoder_ids": None},
        return_timestamps=True,
    )["text"]

    return html_embed_str, text
103
 
104
+ # Gradio UI
105
  demo = gr.Blocks()
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  file_transcribe = gr.Interface(
108
  fn=transcribe,
109
  inputs=[
 
111
  gr.Radio(["transcribe", "translate"], label="Task"),
112
  ],
113
  outputs="text",
 
114
  title="Whisper Large V3: Transcribe Audio",
115
+ description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
116
+ flagging_mode="never",
 
 
117
  )
118
 
119
  yt_transcribe = gr.Interface(
120
  fn=yt_transcribe,
121
  inputs=[
122
+ gr.Textbox(lines=1, placeholder="Paste YouTube URL here", label="YouTube URL"),
123
  gr.Radio(["transcribe", "translate"], label="Task")
124
  ],
125
  outputs=["html", "text"],
 
126
  title="Whisper Large V3: Transcribe YouTube",
127
+ description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
128
+ flagging_mode="never",
 
 
129
  )
130
 
131
  with demo:
132
+ gr.TabbedInterface([file_transcribe, yt_transcribe], ["Audio file", "YouTube"])
133
 
134
  demo.launch()