pere committed on
Commit
99d9b3e
·
1 Parent(s): ac1dacb

update test

Browse files
Files changed (1) hide show
  1. app.py +21 -25
app.py CHANGED
@@ -4,7 +4,6 @@ import os
4
  import torch
5
 
6
  import gradio as gr
7
- import pytube as pt
8
  import spaces
9
  from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
10
  from huggingface_hub import model_info
@@ -14,6 +13,7 @@ try:
14
  except ImportError:
15
  FLASH_ATTENTION = False
16
 
 
17
 
18
  MODEL_NAME = "NbAiLab/nb-whisper-large"
19
  lang = "no"
@@ -25,16 +25,9 @@ print(f"Using device: {device}")
25
 
26
  @spaces.GPU(duration=60 * 2)
27
  def pipe(file, return_timestamps=False):
28
- # model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True)
29
- # model.to(device)
30
- # processor = WhisperProcessor.from_pretrained(MODEL_NAME)
31
- # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
32
- # model.generation_config.cache_implementation = "static"
33
  asr = pipeline(
34
  task="automatic-speech-recognition",
35
  model=MODEL_NAME,
36
- # tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
37
- # feature_extractor=AutoFeatureExtractor.from_pretrained(MODEL_NAME),
38
  chunk_length_s=30,
39
  device=device,
40
  token=auth_token,
@@ -46,7 +39,6 @@ def pipe(file, return_timestamps=False):
46
  task="transcribe",
47
  no_timestamps=not return_timestamps,
48
  )
49
- # asr.model.config.no_timestamps_token_id = asr.tokenizer.encode("<|notimestamps|>", add_special_tokens=False)[0]
50
  return asr(file, return_timestamps=return_timestamps, batch_size=24)
51
 
52
  def transcribe(file, return_timestamps=False):
@@ -63,7 +55,6 @@ def transcribe(file, return_timestamps=False):
63
  text = "\n".join(text)
64
  return text
65
 
66
-
67
  def _return_yt_html_embed(yt_url):
68
  video_id = yt_url.split("?v=")[-1]
69
  HTML_str = (
@@ -72,18 +63,26 @@ def _return_yt_html_embed(yt_url):
72
  )
73
  return HTML_str
74
 
75
-
76
  def yt_transcribe(yt_url, return_timestamps=False):
77
- yt = pt.YouTube(yt_url)
78
  html_embed_str = _return_yt_html_embed(yt_url)
79
- stream = yt.streams.filter(only_audio=True)[0]
80
- stream.download(filename="audio.mp3")
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  text = transcribe("audio.mp3", return_timestamps=return_timestamps)
83
 
84
  return html_embed_str, text
85
 
86
-
87
  demo = gr.Blocks()
88
 
89
  mf_transcribe = gr.Interface(
@@ -102,7 +101,7 @@ mf_transcribe = gr.Interface(
102
  allow_flagging="never",
103
  )
104
 
105
- yt_transcribe = gr.Interface(
106
  fn=yt_transcribe,
107
  inputs=[
108
  gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
@@ -120,12 +119,9 @@ yt_transcribe = gr.Interface(
120
  )
121
 
122
  with demo:
123
- gr.TabbedInterface([
124
- mf_transcribe,
125
- yt_transcribe
126
- ], [
127
- "Transcribe Audio",
128
- "Transcribe YouTube"
129
- ])
130
-
131
- demo.launch(share=share).queue()
 
4
  import torch
5
 
6
  import gradio as gr
 
7
  import spaces
8
  from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
9
  from huggingface_hub import model_info
 
13
  except ImportError:
14
  FLASH_ATTENTION = False
15
 
16
+ import yt_dlp # Added import for yt-dlp
17
 
18
  MODEL_NAME = "NbAiLab/nb-whisper-large"
19
  lang = "no"
 
25
 
26
  @spaces.GPU(duration=60 * 2)
27
  def pipe(file, return_timestamps=False):
 
 
 
 
 
28
  asr = pipeline(
29
  task="automatic-speech-recognition",
30
  model=MODEL_NAME,
 
 
31
  chunk_length_s=30,
32
  device=device,
33
  token=auth_token,
 
39
  task="transcribe",
40
  no_timestamps=not return_timestamps,
41
  )
 
42
  return asr(file, return_timestamps=return_timestamps, batch_size=24)
43
 
44
  def transcribe(file, return_timestamps=False):
 
55
  text = "\n".join(text)
56
  return text
57
 
 
58
  def _return_yt_html_embed(yt_url):
59
  video_id = yt_url.split("?v=")[-1]
60
  HTML_str = (
 
63
  )
64
  return HTML_str
65
 
 
66
  def yt_transcribe(yt_url, return_timestamps=False):
 
67
  html_embed_str = _return_yt_html_embed(yt_url)
68
+
69
+ ydl_opts = {
70
+ 'format': 'bestaudio/best',
71
+ 'outtmpl': 'audio.%(ext)s',
72
+ 'postprocessors': [{
73
+ 'key': 'FFmpegExtractAudio',
74
+ 'preferredcodec': 'mp3',
75
+ 'preferredquality': '192',
76
+ }],
77
+ 'quiet': True,
78
+ }
79
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
80
+ ydl.download([yt_url])
81
 
82
  text = transcribe("audio.mp3", return_timestamps=return_timestamps)
83
 
84
  return html_embed_str, text
85
 
 
86
  demo = gr.Blocks()
87
 
88
  mf_transcribe = gr.Interface(
 
101
  allow_flagging="never",
102
  )
103
 
104
+ yt_transcribe_interface = gr.Interface(
105
  fn=yt_transcribe,
106
  inputs=[
107
  gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
 
119
  )
120
 
121
  with demo:
122
+ gr.TabbedInterface(
123
+ [mf_transcribe, yt_transcribe_interface],
124
+ ["Transcribe Audio", "Transcribe YouTube"]
125
+ )
126
+
127
+ demo.launch(share=share).queue()