AhmadHazem commited on
Commit
d914cf2
·
verified ·
1 Parent(s): 531134a

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -159
app.py CHANGED
@@ -1,159 +1,140 @@
1
- import time
2
- import spaces
3
- import torch
4
-
5
- import gradio as gr
6
- import yt_dlp as youtube_dl
7
- from transformers import pipeline, MarianMTModel, MarianTokenizer
8
- from transformers.pipelines.audio_utils import ffmpeg_read
9
-
10
- import tempfile
11
- import os
12
-
13
- MODEL_NAME = "openai/whisper-large-v3-turbo"
14
- BATCH_SIZE = 8
15
- FILE_LIMIT_MB = 1000
16
- YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
17
-
18
- device = 0 if torch.cuda.is_available() else "cpu"
19
-
20
- pipe = pipeline(
21
- task="automatic-speech-recognition",
22
- model=MODEL_NAME,
23
- chunk_length_s=30,
24
- device=device,
25
- )
26
-
27
- model_name_translate = "Helsinki-NLP/opus-mt-en-ar"
28
- tokenizer_translation = MarianTokenizer.from_pretrained(model_name_translate)
29
- model_translate = MarianMTModel.from_pretrained(model_name_translate)
30
-
31
-
32
- def translate(sentence):
33
- batch = tokenizer_translation([sentence], return_tensors="pt")
34
- generated_ids = model_translate.generate(batch["input_ids"])
35
- text = tokenizer_translation.batch_decode(generated_ids, skip_special_tokens=True)[0]
36
- return text
37
-
38
- @spaces.GPU
39
- def transcribe(inputs, task):
40
- if inputs is None:
41
- raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
42
-
43
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
44
- text = translate(text)
45
- return text
46
-
47
-
48
- def _return_yt_html_embed(yt_url):
49
- video_id = yt_url.split("?v=")[-1]
50
- HTML_str = (
51
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
52
- " </center>"
53
- )
54
- return HTML_str
55
-
56
- def download_yt_audio(yt_url, filename):
57
- info_loader = youtube_dl.YoutubeDL()
58
-
59
- try:
60
- info = info_loader.extract_info(yt_url, download=False)
61
- except youtube_dl.utils.DownloadError as err:
62
- raise gr.Error(str(err))
63
-
64
- file_length = info["duration_string"]
65
- file_h_m_s = file_length.split(":")
66
- file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
67
-
68
- if len(file_h_m_s) == 1:
69
- file_h_m_s.insert(0, 0)
70
- if len(file_h_m_s) == 2:
71
- file_h_m_s.insert(0, 0)
72
- file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
73
-
74
- if file_length_s > YT_LENGTH_LIMIT_S:
75
- yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
76
- file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
77
- raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
78
-
79
- ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
80
-
81
- with youtube_dl.YoutubeDL(ydl_opts) as ydl:
82
- try:
83
- ydl.download([yt_url])
84
- except youtube_dl.utils.ExtractorError as err:
85
- raise gr.Error(str(err))
86
-
87
- @spaces.GPU
88
- def yt_transcribe(yt_url, task, max_filesize=75.0):
89
- html_embed_str = _return_yt_html_embed(yt_url)
90
-
91
- with tempfile.TemporaryDirectory() as tmpdirname:
92
- filepath = os.path.join(tmpdirname, "video.mp4")
93
- download_yt_audio(yt_url, filepath)
94
- with open(filepath, "rb") as f:
95
- inputs = f.read()
96
-
97
- inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
98
- inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
99
-
100
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
101
- text = translate(text)
102
- return html_embed_str, text
103
-
104
-
105
- demo = gr.Blocks(theme=gr.themes.Ocean())
106
-
107
- mf_transcribe = gr.Interface(
108
- fn=transcribe,
109
- inputs=[
110
- gr.Audio(sources="microphone", type="filepath"),
111
- gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
112
- ],
113
- outputs="text",
114
- title="Whisper Large V3 Turbo: Transcribe Audio",
115
- description=(
116
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
117
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
118
- " of arbitrary length."
119
- ),
120
- allow_flagging="never",
121
- )
122
-
123
- file_transcribe = gr.Interface(
124
- fn=transcribe,
125
- inputs=[
126
- gr.Audio(sources="upload", type="filepath", label="Audio file"),
127
- gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
128
- ],
129
- outputs="text",
130
- title="Whisper Large V3: Transcribe Audio",
131
- description=(
132
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
133
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
134
- " of arbitrary length."
135
- ),
136
- allow_flagging="never",
137
- )
138
-
139
- yt_transcribe = gr.Interface(
140
- fn=yt_transcribe,
141
- inputs=[
142
- gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
143
- gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
144
- ],
145
- outputs=["html", "text"],
146
- title="Whisper Large V3: Transcribe YouTube",
147
- description=(
148
- "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
149
- f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
150
- " arbitrary length."
151
- ),
152
- allow_flagging="never",
153
- )
154
-
155
- with demo:
156
- gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
157
-
158
- demo.queue().launch(ssr_mode=False)
159
-
 
1
+ import time
2
+ import spaces
3
+ import torch
4
+
5
+ import gradio as gr
6
+ import yt_dlp as youtube_dl
7
+ from transformers import pipeline, MarianMTModel, MarianTokenizer
8
+ from transformers.pipelines.audio_utils import ffmpeg_read
9
+
10
+ import tempfile
11
+ import os
12
+
13
# --- Configuration ----------------------------------------------------------
MODEL_NAME = "openai/whisper-large-v3-turbo"  # ASR checkpoint used by `pipe`
BATCH_SIZE = 8  # audio chunks transcribed per forward pass
FILE_LIMIT_MB = 1000  # NOTE(review): declared but never enforced in this file
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Use the first CUDA device when available, otherwise run on CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Whisper ASR pipeline; 30 s chunking enables long-form transcription.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# English -> Arabic MarianMT translation model and its tokenizer, loaded once
# at import time and shared by `translate`.
model_name_translate = "Helsinki-NLP/opus-mt-en-ar"
tokenizer_translation = MarianTokenizer.from_pretrained(model_name_translate)
model_translate = MarianMTModel.from_pretrained(model_name_translate)
30
+
31
@spaces.GPU
def translate(sentence):
    """Translate a single English sentence into Arabic via MarianMT.

    Args:
        sentence: English source text.

    Returns:
        The Arabic translation as a string.
    """
    encoded = tokenizer_translation([sentence], return_tensors="pt")
    output_ids = model_translate.generate(encoded["input_ids"])
    decoded = tokenizer_translation.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
37
+
38
@spaces.GPU
def transcribe(inputs, task="transcribe"):
    """Transcribe an audio file with Whisper, then translate the text to Arabic.

    Args:
        inputs: Path to the audio file (Gradio ``type="filepath"``), or None
            when the user submitted without recording/uploading anything.
        task: Whisper generation task ("transcribe" or "translate"). Defaults
            to "transcribe" because the interfaces below pass only the audio
            component — without a default, every UI call raised TypeError for
            the missing second positional argument.

    Returns:
        The Arabic translation of the recognized speech.

    Raises:
        gr.Error: If no audio was provided.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    # The ASR output is English; convert it to Arabic before returning.
    text = translate(text)
    return text
46
+
47
+
48
+ def _return_yt_html_embed(yt_url):
49
+ video_id = yt_url.split("?v=")[-1]
50
+ HTML_str = (
51
+ f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
52
+ " </center>"
53
+ )
54
+ return HTML_str
55
+
56
def download_yt_audio(yt_url, filename):
    """Download a YouTube video's media to *filename*, enforcing a length cap.

    Args:
        yt_url: URL of the YouTube video.
        filename: Destination path for the downloaded file.

    Raises:
        gr.Error: If metadata extraction fails, the video is longer than
            YT_LENGTH_LIMIT_S seconds, or the download itself fails.
    """
    info_loader = youtube_dl.YoutubeDL()

    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err))

    # Prefer the canonical numeric duration (seconds); fall back to parsing
    # the "H:MM:SS" / "M:SS" / "S" duration_string when it is missing.
    file_length_s = info.get("duration")
    if file_length_s is None:
        parts = [int(p) for p in info["duration_string"].split(":")]
        while len(parts) < 3:  # left-pad to [hours, minutes, seconds]
            parts.insert(0, 0)
        file_length_s = parts[0] * 3600 + parts[1] * 60 + parts[2]

    if file_length_s > YT_LENGTH_LIMIT_S:
        # "%H:%M:%S" — the original "%HH:%MM:%SS" rendered e.g. "01H:00M:00S".
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except youtube_dl.utils.DownloadError as err:
            # yt-dlp wraps download failures in DownloadError; the original
            # caught ExtractorError, which download() does not surface, so
            # real failures escaped as raw tracebacks instead of gr.Error.
            raise gr.Error(str(err))
86
+
87
@spaces.GPU
def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Transcribe a YouTube video's audio and translate the text to Arabic.

    NOTE(review): currently unreachable from the UI — the TabbedInterface at
    the bottom of this file only mounts the microphone and file tabs.

    Args:
        yt_url: URL of the YouTube video.
        task: Whisper generation task ("transcribe" or "translate").
        max_filesize: Unused; kept for signature compatibility.

    Returns:
        Tuple of (HTML embed snippet for the video, Arabic translation text).
    """
    html_embed_str = _return_yt_html_embed(yt_url)

    # Download into a temp dir that is deleted as soon as the bytes are read.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            inputs = f.read()

    # Decode the container to a waveform at the model's sampling rate, then
    # wrap it in the dict shape the ASR pipeline expects.
    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    text = translate(text)
    return html_embed_str, text
103
+
104
+
105
# Root Blocks container; the tabbed interfaces below are mounted inside it.
demo = gr.Blocks(theme=gr.themes.Ocean())
106
+
107
# Microphone tab. NOTE(review): only the audio component is passed to
# `transcribe`, which also declares a `task` parameter — ensure it has a
# default value or every submission will fail.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
    ],
    outputs="text",
    title="Real-Time Speech Translation From English to Arabic",
    description=(
        # Fixed: the two fragments previously joined without a space
        # ("generationthen") and misspelled the Helsinki-NLP model name.
        "Real-time speech translation from English to Arabic. This demo uses Whisper for speech recognition,"
        " then a Helsinki-NLP model fine-tuned on a translation dataset for the translation step."
    ),
    allow_flagging="never",
)
120
+
121
# File-upload tab. NOTE(review): only the audio component is passed to
# `transcribe`, which also declares a `task` parameter — ensure it has a
# default value or every submission will fail.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
    ],
    outputs="text",
    title="Real-Time Speech Translation From English to Arabic",
    description=(
        # Fixed: the two fragments previously joined without a space
        # ("generationthen") and misspelled the Helsinki-NLP model name.
        "Real-time speech translation from English to Arabic. This demo uses Whisper for speech recognition,"
        " then a Helsinki-NLP model fine-tuned on a translation dataset for the translation step."
    ),
    allow_flagging="never",
)
134
+
135
+
136
# Mount the microphone and file tabs inside the Blocks container.
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])

# Enable request queuing and launch; SSR is disabled explicitly.
demo.queue().launch(ssr_mode=False)
140
+