bayartsogt committed on
Commit
ffa1767
·
1 Parent(s): 7a1ec76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -83
app.py CHANGED
@@ -1,102 +1,111 @@
1
- import torch
2
-
3
  import gradio as gr
4
- import pytube as pt
 
 
 
5
  from transformers import pipeline
6
- from huggingface_hub import model_info
7
-
8
MODEL_NAME = "bayartsogt/whisper-small-mn-12"  # HACK: per the original author, this assignment must stay on line 8 of app.py
lang = "mn"

# Evaluation notes kept from the original file:
# | model_id | WER | Keep Characters |
# | bayartsogt/whisper-small-mn-7 | 32.6469 | " абвгдеёжзийклмноөпрстуүфхцчшъыьэюя.,?!" |

# Use the first CUDA device when one is available, otherwise run on CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# chunk_length_s=12 was picked by the original author after a small experiment.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=12,
    device=device,
)

# Pin the decoder prompt so generation always transcribes in `lang`.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=lang,
    task="transcribe",
)
25
 
26
def transcribe(microphone, file_upload):
    """Transcribe a microphone recording or an uploaded audio file.

    Args:
        microphone: Path of the recorded audio, or None when nothing was recorded.
        file_upload: Path of the uploaded audio, or None when nothing was uploaded.

    Returns:
        The transcription text. When both sources are given, the microphone
        recording is used and a warning line is prepended. When neither is
        given, an error message string is returned instead.
    """
    has_recording = microphone is not None
    has_upload = file_upload is not None

    # Guard clause: nothing to transcribe.
    if not has_recording and not has_upload:
        return "ERROR: You have to either use the microphone or upload an audio file"

    if has_recording and has_upload:
        notice = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    else:
        notice = ""

    # The microphone recording takes precedence over the upload.
    source = microphone if has_recording else file_upload
    transcript = pipe(source)["text"]
    return notice + transcript
42
-
43
-
44
- def _return_yt_html_embed(yt_url):
45
- video_id = yt_url.split("?v=")[-1]
46
- HTML_str = (
47
- f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
48
- " </center>"
49
- )
50
- return HTML_str
51
-
52
-
53
def yt_transcribe(yt_url):
    """Download the audio track of a YouTube video and transcribe it.

    The audio is downloaded into a per-call temporary directory instead of a
    fixed ``audio.mp3`` in the working directory, so simultaneous requests do
    not overwrite each other's file and nothing is left behind afterwards.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(html_embed_str, text)`` tuple: the HTML snippet embedding the
        video and the transcription text.
    """
    import tempfile

    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    # Take the first audio-only stream offered for the video.
    stream = yt.streams.filter(only_audio=True)[0]

    with tempfile.TemporaryDirectory() as workdir:
        audio_path = stream.download(output_path=workdir, filename="audio.mp3")
        # Transcribe before leaving the block: the directory is deleted on exit.
        text = pipe(audio_path)["text"]

    return html_embed_str, text
62
-
63
-
64
demo = gr.Blocks()

# Tab 1: transcribe a microphone recording or an uploaded audio file.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Demo: Transcribe Audio",
    description=(
        # Fixed the duplicated word ("the the") in the user-facing text.
        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the fine-tuned"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)

# Tab 2: transcribe the audio track of a YouTube video.
# NOTE: this rebinds the name `yt_transcribe` from the function to the
# interface; the function object is captured by `fn=` before the rebinding.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL")],
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Whisper Demo: Transcribe YouTube",
    description=(
        # Fixed the duplicated word ("the the") in the user-facing text.
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the fine-tuned checkpoint:"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

# Show both interfaces as tabs of one app.
with demo:
    gr.TabbedInterface([mf_transcribe, yt_transcribe], ["Transcribe Audio", "Transcribe YouTube"])

demo.launch(enable_queue=True)
 
 
 
1
  import gradio as gr
2
+ import numpy as np
3
+ import time
4
+ from pyannote.audio import Pipeline
5
+ import librosa, torch
6
  from transformers import pipeline
7
+ from utils import second_to_timecode, download_from_youtube
8
MODEL_NAME = 'bayartsogt/whisper-small-mn-8'
lang = 'mn'

# Audio is resampled to this rate when it is loaded.
SAMPLE_RATE = 16_000
# Chunk length (seconds) handed to the ASR pipeline.
chunk_length_s = 9
# Speaker turns longer than this many seconds are split further with VAD.
vad_activation_min_duration = 9

# Use the first CUDA device when one is available, otherwise run on CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

######## LOAD MODELS FROM HUB ########
# Both pyannote pipelines are fetched with the locally stored auth token.
dia_model = Pipeline.from_pretrained(
    "pyannote/speaker-diarization",
    use_auth_token=True,
)
vad_model = Pipeline.from_pretrained(
    "pyannote/voice-activity-detection",
    use_auth_token=True,
)
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=chunk_length_s,
    device=device,
)
# Pin the decoder prompt so generation always transcribes in `lang`.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
    language=lang,
    task="transcribe",
)

print("----------> Loaded models <-----------")
23
+
24
def _resolve_audio_path(youtube_link, microphone, file_upload):
    """Check that exactly one audio source was given and return its local path."""
    # Truthiness is used for all three sources so that None and '' both mean
    # "not provided"; the original mixed `!= ''` and `is not None`.
    if bool(youtube_link) + bool(microphone) + bool(file_upload) != 1:
        raise ValueError(
            f"Only one of the source should be given youtube_link={youtube_link}, microphone={microphone}, file_upload={file_upload}"
        )
    if microphone:
        return microphone
    if file_upload:
        return file_upload
    return download_from_youtube(youtube_link)


def _sbv_cue(start_sec, end_sec, text):
    """Format one subtitle cue: a 'start,end' timecode line, the text, then a blank line."""
    return f"{second_to_timecode(start_sec)},{second_to_timecode(end_sec)}\n{text}\n\n"


def generator(youtube_link, microphone, file_upload, num_speakers, max_duration, history):
    """Transcribe one audio source incrementally and emit a SubViewer file.

    The audio is split into speaker turns by the diarization model. Turns
    longer than ``vad_activation_min_duration`` seconds are split again with
    the voice-activity model. Every resulting piece is transcribed and
    appended to the running transcript as a timecoded cue.

    Args:
        youtube_link: YouTube URL, or empty/None when not used.
        microphone: Path of a microphone recording, or None.
        file_upload: Path of an uploaded audio file, or None.
        num_speakers: Number of speakers passed to the diarization model;
            converted to int, and empty/0 is passed on as None.
        max_duration: Maximum number of seconds of audio to load.
        history: Transcript carried over in the session state, or None.

    Yields:
        ``(transcript, transcript, file)`` tuples matching the interface's
        text, state and file outputs. ``file`` is None until the final
        yield, which carries the path of the written ``transcript.sbv``.

    Raises:
        ValueError: If zero or more than one audio source is given.
    """
    import os
    import tempfile

    path = _resolve_audio_path(youtube_link, microphone, file_upload)
    history = history or ""

    waveform, sampling_rate = librosa.load(path, sr=SAMPLE_RATE, mono=True, duration=max_duration)
    print(waveform.shape, sampling_rate)
    # Add a leading dimension of size 1 before handing the audio to pyannote.
    waveform_tensor = torch.unsqueeze(torch.tensor(waveform), 0).to(device)

    # Number inputs arrive as floats (or None when cleared); pass an int or None.
    speaker_count = int(num_speakers) if num_speakers else None
    dia_result = dia_model(
        {"waveform": waveform_tensor, "sample_rate": sampling_rate},
        num_speakers=speaker_count,
    )

    for speech_turn, _track, speaker in dia_result.itertracks(yield_label=True):
        print(f"{speech_turn.start:4.1f} {speech_turn.end:4.1f} {speaker}")
        turn_first = int(sampling_rate * speech_turn.start)
        turn_last = int(sampling_rate * speech_turn.end)
        turn_seconds = speech_turn.end - speech_turn.start

        if turn_seconds > vad_activation_min_duration:
            print(f'audio duration {turn_seconds} sec ----> activating VAD')
            vad_output = vad_model({
                'waveform': waveform_tensor[:, turn_first:turn_last],
                'sample_rate': sampling_rate,
            })
            # VAD times are relative to the turn, so shift them back to the
            # full recording: (start_sec, end_sec, first_sample, last_sample).
            spans = [
                (
                    speech_turn.start + vad_turn.start,
                    speech_turn.start + vad_turn.end,
                    turn_first + int(sampling_rate * vad_turn.start),
                    turn_first + int(sampling_rate * vad_turn.end),
                )
                for vad_turn in vad_output.get_timeline().support()
            ]
        else:
            spans = [(speech_turn.start, speech_turn.end, turn_first, turn_last)]

        for start_sec, end_sec, first, last in spans:
            clip = waveform[first:last]
            if clip.size == 0:
                # Rounding can produce an empty slice; there is nothing to transcribe.
                continue
            prediction = pipe(clip)['text']
            history += _sbv_cue(start_sec, end_sec, prediction)
            yield history, history, None

    # https://support.google.com/youtube/answer/2734698?hl=en#zippy=%2Cbasic-file-formats%2Csubrip-srt-example%2Csubviewer-sbv-example
    # Write into a fresh directory so concurrent sessions do not overwrite
    # each other's transcript, and force UTF-8 because the text is Cyrillic.
    file_name = os.path.join(tempfile.mkdtemp(), 'transcript.sbv')
    with open(file_name, 'w', encoding='utf-8') as fp:
        fp.write(history)

    yield history, history, file_name
82
+
83
# Single-page interface: three alternative audio sources, two numeric
# settings and the session state in; transcript text, state and file out.
demo = gr.Interface(
    generator,
    inputs=[
        gr.inputs.Textbox(
            lines=1,
            placeholder="Paste the URL to a YouTube video here",
            label="YouTube URL",
            optional=True,
        ),
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
        gr.inputs.Audio(source="upload", type="filepath", optional=True),
        gr.Number(value=1, label="Number of Speakers"),
        gr.Number(value=120, label="Maximum Duration (Seconds)"),
        'state',
    ],
    outputs=['text', 'state', 'file'],
    layout="horizontal",
    theme="huggingface",
    title="Transcribe Mongolian Whisper 🇲🇳",
    # The parts are joined with " | ", giving the same text as the original
    # chain of concatenated literals.
    description=" | ".join([
        "Transcribe Youtube Video / Microphone / Uploaded File in Mongolian Whisper Model.",
        "You can upload SubView file (`.sbv`) [to your youtube video](https://support.google.com/youtube/answer/2734698?hl=en#zippy=%2Cbasic-file-formats).",
        "Please REFRESH 🔄 the page after you transcribed!",
        "🐦 [@_tsogoo_](https://twitter.com/_tsogoo_)",
        "🤗 [@bayartsogt](https://huggingface.co/bayartsogt)",
    ]),
    allow_flagging="never",
)

# define queue - required for generators
demo.queue()

demo.launch()