Ngoufack commited on
Commit
e500a6a
·
1 Parent(s): 4e80514

mise à jour

Browse files
Files changed (2) hide show
  1. app.py +19 -12
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import spaces
2
  import torch
 
3
 
4
  import gradio as gr
5
  import yt_dlp as youtube_dl
@@ -9,7 +10,7 @@ from transformers.pipelines.audio_utils import ffmpeg_read
9
  import tempfile
10
  import os
11
 
12
- MODEL_NAME = "openai/whisper-medium"
13
  BATCH_SIZE = 8
14
  FILE_LIMIT_MB = 1000
15
  YT_LENGTH_LIMIT_S = 600 # limit to 10-minute YouTube files
@@ -23,14 +24,17 @@ pipe = pipeline(
23
  device=device,
24
  )
25
 
 
26
 
27
  @spaces.GPU
28
  def transcribe(inputs, task):
29
  if inputs is None:
30
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
31
-
32
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
33
- return text
 
 
34
 
35
 
36
  def _return_yt_html_embed(yt_url):
@@ -64,7 +68,10 @@ def download_yt_audio(yt_url, filename):
64
  file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
65
  raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
66
 
67
- ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
 
 
 
68
 
69
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
70
  try:
@@ -77,17 +84,17 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
77
  html_embed_str = _return_yt_html_embed(yt_url)
78
 
79
  with tempfile.TemporaryDirectory() as tmpdirname:
80
- filepath = os.path.join(tmpdirname, "video.mp4")
81
  download_yt_audio(yt_url, filepath)
82
- with open(filepath, "rb") as f:
83
- inputs = f.read()
84
 
85
- inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
86
- inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
87
 
88
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
 
 
89
 
90
- return html_embed_str, text
91
 
92
 
93
  demo = gr.Blocks()
 
1
  import spaces
2
  import torch
3
+ from faster_whisper import WhisperModel
4
 
5
  import gradio as gr
6
  import yt_dlp as youtube_dl
 
10
  import tempfile
11
  import os
12
 
13
+ MODEL_NAME = "large-v3"
14
  BATCH_SIZE = 8
15
  FILE_LIMIT_MB = 1000
16
  YT_LENGTH_LIMIT_S = 600 # limit to 10-minute YouTube files
 
24
  device=device,
25
  )
26
 
27
+ model = WhisperModel(MODEL_NAME, device=device, compute_type="float16" if device == "cuda" else "int8")
28
 
29
@spaces.GPU
def transcribe(inputs, task):
    """Transcribe an uploaded or recorded audio file with faster-whisper.

    Parameters
    ----------
    inputs : str | None
        Path to the audio file supplied by the Gradio audio component;
        ``None`` when the user submitted nothing.
    task : str
        ``"transcribe"`` or ``"translate"``; forwarded to the model so the
        UI task selector is actually honoured.

    Returns
    -------
    str
        The concatenated transcription text.

    Raises
    ------
    gr.Error
        If no audio file was submitted.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    # BUG FIX: the original called model.transcribe(input, ...) — the builtin
    # `input` function, not the `inputs` argument — which crashes at runtime.
    # Also: forward `task` (previously ignored), and drop `batch_size=`,
    # which WhisperModel.transcribe does not accept (batching needs
    # faster_whisper.BatchedInferencePipeline) — TODO confirm against the
    # pinned faster-whisper version.
    segments, _info = model.transcribe(
        inputs,
        task=task,
        beam_size=5,
        vad_filter=True,
        word_timestamps=False,
    )
    # `segments` is a lazy generator; joining consumes it and runs inference.
    return " ".join(segment.text for segment in segments)
38
 
39
 
40
  def _return_yt_html_embed(yt_url):
 
68
  file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
69
  raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
70
 
71
+ ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",'postprocessors': [{
72
+ 'key': 'FFmpegExtractAudio',
73
+ 'preferredcodec': 'mp3',
74
+ }]}
75
 
76
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
77
  try:
 
84
  html_embed_str = _return_yt_html_embed(yt_url)
85
 
86
  with tempfile.TemporaryDirectory() as tmpdirname:
87
+ filepath = os.path.join(tmpdirname, "video.mp3")
88
  download_yt_audio(yt_url, filepath)
 
 
89
 
90
+ #inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
91
+ #inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
92
 
93
+ #text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
94
+ segments, info = model.transcribe(filepath, beam_size=5,batch_size=BATCH_SIZE, vad_filter=True, word_timestamps=False)
95
+ transcription = " ".join([segment.text for segment in segments])
96
 
97
+ return html_embed_str, transcription
98
 
99
 
100
  demo = gr.Blocks()
requirements.txt CHANGED
@@ -3,4 +3,5 @@ yt-dlp
3
  torch
4
  torchvision
5
  torchaudio
6
- nemo_toolkit
 
 
3
  torch
4
  torchvision
5
  torchaudio
6
+ nemo_toolkit
7
+ faster-whisper