pere committed on
Commit
e1a5899
·
1 Parent(s): f4d4476

update test

Browse files
Files changed (1) hide show
  1. app.py +24 -20
app.py CHANGED
@@ -2,7 +2,7 @@ import time
2
  import os
3
 
4
  import torch
5
- import yt_dlp
6
  import gradio as gr
7
  import pytube as pt
8
  import spaces
@@ -25,9 +25,16 @@ print(f"Using device: {device}")
25
 
26
  @spaces.GPU(duration=60 * 2)
27
  def pipe(file, return_timestamps=False):
 
 
 
 
 
28
  asr = pipeline(
29
  task="automatic-speech-recognition",
30
  model=MODEL_NAME,
 
 
31
  chunk_length_s=30,
32
  device=device,
33
  token=auth_token,
@@ -39,6 +46,7 @@ def pipe(file, return_timestamps=False):
39
  task="transcribe",
40
  no_timestamps=not return_timestamps,
41
  )
 
42
  return asr(file, return_timestamps=return_timestamps, batch_size=24)
43
 
44
  def transcribe(file, return_timestamps=False):
@@ -65,18 +73,14 @@ def _return_yt_html_embed(yt_url):
65
  return HTML_str
66
 
67
 
68
- @spaces.GPU
69
- def yt_transcribe(yt_url, task):
70
  html_embed_str = _return_yt_html_embed(yt_url)
 
 
71
 
72
- with tempfile.TemporaryDirectory() as tmpdirname:
73
- filepath = os.path.join(tmpdirname, "audio.mp3")
74
- download_yt_audio(yt_url, filepath)
75
-
76
- inputs = ffmpeg_read(filepath, pipe.feature_extractor.sampling_rate)
77
- inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
78
 
79
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
80
  return html_embed_str, text
81
 
82
 
@@ -85,11 +89,11 @@ demo = gr.Blocks()
85
  mf_transcribe = gr.Interface(
86
  fn=transcribe,
87
  inputs=[
88
- gr.Audio(sources="microphone", type="filepath"),
89
- gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
90
  ],
91
  outputs="text",
92
- title="NB-Whisper Demo",
93
  description=(
94
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
95
  f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
@@ -101,27 +105,27 @@ mf_transcribe = gr.Interface(
101
  yt_transcribe = gr.Interface(
102
  fn=yt_transcribe,
103
  inputs=[
104
- gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
105
- gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
106
  ],
 
107
  outputs=["html", "text"],
108
  title="Whisper Demo: Transcribe YouTube",
109
  description=(
110
  "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
111
- f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
112
  " arbitrary length."
113
  ),
114
  allow_flagging="never",
115
  )
116
 
117
-
118
  with demo:
119
  gr.TabbedInterface([
120
  mf_transcribe,
121
- yt_transcribe
122
  ], [
123
- "Transkriber Lyd",
124
- "Transkriber YouTube"
125
  ])
126
 
127
  demo.launch(share=share).queue()
 
2
  import os
3
 
4
  import torch
5
+
6
  import gradio as gr
7
  import pytube as pt
8
  import spaces
 
25
 
26
  @spaces.GPU(duration=60 * 2)
27
  def pipe(file, return_timestamps=False):
28
+ # model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True)
29
+ # model.to(device)
30
+ # processor = WhisperProcessor.from_pretrained(MODEL_NAME)
31
+ # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
32
+ # model.generation_config.cache_implementation = "static"
33
  asr = pipeline(
34
  task="automatic-speech-recognition",
35
  model=MODEL_NAME,
36
+ # tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
37
+ # feature_extractor=AutoFeatureExtractor.from_pretrained(MODEL_NAME),
38
  chunk_length_s=30,
39
  device=device,
40
  token=auth_token,
 
46
  task="transcribe",
47
  no_timestamps=not return_timestamps,
48
  )
49
+ # asr.model.config.no_timestamps_token_id = asr.tokenizer.encode("<|notimestamps|>", add_special_tokens=False)[0]
50
  return asr(file, return_timestamps=return_timestamps, batch_size=24)
51
 
52
  def transcribe(file, return_timestamps=False):
 
73
  return HTML_str
74
 
75
 
76
+ def yt_transcribe(yt_url, return_timestamps=False):
77
+ yt = pt.YouTube(yt_url)
78
  html_embed_str = _return_yt_html_embed(yt_url)
79
+ stream = yt.streams.filter(only_audio=True)[0]
80
+ stream.download(filename="audio.mp3")
81
 
82
+ text = transcribe("audio.mp3", return_timestamps=return_timestamps)
 
 
 
 
 
83
 
 
84
  return html_embed_str, text
85
 
86
 
 
89
  mf_transcribe = gr.Interface(
90
  fn=transcribe,
91
  inputs=[
92
+ gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
93
+ gr.components.Checkbox(label="Return timestamps"),
94
  ],
95
  outputs="text",
96
+ title="NB-Whisper",
97
  description=(
98
  "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the the fine-tuned"
99
  f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
 
105
  yt_transcribe = gr.Interface(
106
  fn=yt_transcribe,
107
  inputs=[
108
+ gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
109
+ gr.components.Checkbox(label="Return timestamps"),
110
  ],
111
+ examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
112
  outputs=["html", "text"],
113
  title="Whisper Demo: Transcribe YouTube",
114
  description=(
115
  "Transcribe long-form YouTube videos with the click of a button! Demo uses the the fine-tuned checkpoint:"
116
+ f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
117
  " arbitrary length."
118
  ),
119
  allow_flagging="never",
120
  )
121
 
 
122
  with demo:
123
  gr.TabbedInterface([
124
  mf_transcribe,
125
+ # yt_transcribe
126
  ], [
127
+ "Transcribe Audio",
128
+ # "Transcribe YouTube"
129
  ])
130
 
131
  demo.launch(share=share).queue()