Ngoufack committed
Commit cca4b72 · 1 Parent(s): ee3af18

hotfix 2.0

Files changed (2)
  1. README.md +2 -4
  2. app.py +35 -33
README.md CHANGED
@@ -1,13 +1,11 @@
 ---
 title: Verbalens
 emoji: 📚
-colorFrom: purple
-colorTo: purple
+colorFrom: indigo
+colorTo: red
 sdk: gradio
 sdk_version: 5.20.0
 app_file: app.py
 pinned: false
 short_description: this is a first prototype of verbalens using whisper and nem
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
app.py CHANGED
@@ -1,33 +1,36 @@
 import spaces
 import torch
-from faster_whisper import WhisperModel
 
 import gradio as gr
 import yt_dlp as youtube_dl
-
+from transformers import pipeline
 from transformers.pipelines.audio_utils import ffmpeg_read
 
 import tempfile
 import os
 
-MODEL_NAME = "large-v3"
+MODEL_NAME = "openai/whisper-large-v3-turbo"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
-YT_LENGTH_LIMIT_S = 600 # limit to 1 hour YouTube files
+YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
 
 device = 0 if torch.cuda.is_available() else "cpu"
 
-model = WhisperModel(MODEL_NAME, device=device, compute_type="float16" if device == "cuda" else "int8")
+pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=MODEL_NAME,
+    chunk_length_s=30,
+    device=device,
+)
+
 
 @spaces.GPU
 def transcribe(inputs, task):
     if inputs is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-    segments, info = model.transcribe(input, beam_size=5, vad_filter=True, word_timestamps=False)
-    transcription = " ".join([segment.text for segment in segments])
-
-    #text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return transcription
+
+    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
+    return text
 
 
 def _return_yt_html_embed(yt_url):
@@ -61,10 +64,7 @@ def download_yt_audio(yt_url, filename):
         file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
         raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
 
-    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",'postprocessors': [{
-        'key': 'FFmpegExtractAudio',
-        'preferredcodec': 'mp3',
-        }]}
+    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
 
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         try:
@@ -77,32 +77,32 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
 
     with tempfile.TemporaryDirectory() as tmpdirname:
-        filepath = os.path.join(tmpdirname, "audio.mp3")
+        filepath = os.path.join(tmpdirname, "video.mp4")
         download_yt_audio(yt_url, filepath)
+        with open(filepath, "rb") as f:
+            inputs = f.read()
 
-    #inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
-    #inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
+    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
 
-    #text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    segments, info = model.transcribe(filepath, beam_size=5, vad_filter=True, word_timestamps=False)
-    transcription = " ".join([segment.text for segment in segments])
+    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
 
-    return html_embed_str, transcription
+    return html_embed_str, text
 
 
-demo = gr.Blocks()
+demo = gr.Blocks(theme=gr.themes.Ocean())
 
 mf_transcribe = gr.Interface(
     fn=transcribe,
     inputs=[
         gr.Audio(sources="microphone", type="filepath"),
-        gr.Radio(["transcribe", "analysis"], label="Task", value="transcribe"),
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs="text",
-    title="VerbaLens Project: Demo 1",
+    title="Whisper Large V3 Turbo: Transcribe Audio",
     description=(
         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}), Nemo diarization and Transformers to transcribe audio files"
+        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
     ),
     allow_flagging="never",
@@ -115,11 +115,11 @@ file_transcribe = gr.Interface(
         gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
     ],
     outputs="text",
-    title="Verbalens Project: Demo 1 prototype",
+    title="Whisper Large V3: Transcribe Audio",
     description=(
-        "Transcribe long-form videos with the click of a button! Demo uses the checkpoint"
-        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}), Nemo Diarization and Transformers to transcribe video files of"
-        " arbitrary length."
+        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
+        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
+        " of arbitrary length."
     ),
     allow_flagging="never",
 )
@@ -128,12 +128,14 @@ yt_transcribe = gr.Interface(
     fn=yt_transcribe,
     inputs=[
         gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
-        gr.Radio(["transcribe", "analysis"], label="Task", value="transcribe")
+        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
     ],
     outputs=["html", "text"],
-    title="Verbalens Project: Demo 1 prototype",
+    title="Whisper Large V3: Transcribe YouTube",
     description=(
-        "Transcribe long-form videos with the click of a button! Demo uses the checkpoint Nemo Diarization and Transformers to transcribe video files of arbitrary length."
+        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
+        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
+        " arbitrary length."
     ),
     allow_flagging="never",
 )
@@ -141,4 +143,4 @@ yt_transcribe = gr.Interface(
 with demo:
     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
 
-demo.queue().launch()
+demo.queue().launch(ssr_mode=False)
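
For context on the change above: the hotfix drops the faster-whisper model object and returns to the 🤗 Transformers ASR pipeline. Below is a minimal standalone sketch of that pipeline call; the checkpoint name and arguments mirror the diff, while "sample.wav" is a placeholder path used only for illustration.

import torch
from transformers import pipeline

MODEL_NAME = "openai/whisper-large-v3-turbo"  # checkpoint introduced in the diff
device = 0 if torch.cuda.is_available() else "cpu"

# Build the ASR pipeline; chunk_length_s splits long audio into 30 s windows.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Mirror the call in app.py: batched inference with timestamps enabled.
result = pipe("sample.wav", batch_size=8, generate_kwargs={"task": "transcribe"}, return_timestamps=True)
print(result["text"])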