import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
import tempfile
import os
import locale
import whisper
import datetime
import subprocess
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import wave
import contextlib
from sklearn.cluster import AgglomerativeClustering
import numpy as np
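# Demo configuration: device selection, size/length limits, default speaker count, and Whisper model size.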
device = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
COMPUTE_TYPE = "float32"
YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
num_speakers = 2
language = None
model_size = 'tiny'
model_name = model_size
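# Force UTF-8 as the preferred encoding; works around locale errors in some hosted environments.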
def getpreferredencoding(do_setlocale = True):
return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
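# Speaker-embedding model (ECAPA, via SpeechBrain) runs on CPU; the Whisper model is moved to the selected device.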
embedding_model = PretrainedSpeakerEmbedding(
"speechbrain/spkrec-ecapa-voxceleb",
device=torch.device("cpu"))
model = whisper.load_model(model_size).to(device)
audio = Audio()
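# Compute a 192-dimensional speaker embedding for one Whisper segment by cropping its time span from the audio file.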
def segment_embedding(segment,duration,path):
start = segment["start"]
# Whisper overshoots the end timestamp in the last segment
end = min(duration, segment["end"])
clip = Segment(start, end)
waveform, sample_rate = audio.crop(path, clip)
# Convert waveform to single channel
waveform = waveform.mean(dim=0, keepdim=True)
return embedding_model(waveform.unsqueeze(0))
def time(secs):
return datetime.timedelta(seconds=round(secs))
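# Pipeline for microphone/uploaded audio: convert to WAV, transcribe with Whisper, embed each segment,
# cluster the embeddings into num_speakers speakers, and format a speaker-labelled transcript.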
@spaces.GPU
def transcribe(path, task):
if path is None:
raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    if not path.endswith(".wav"):
        # Convert the input to WAV so the wave module and pyannote can read it.
        subprocess.call(["ffmpeg", "-y", "-i", path, "audio.wav"])
        path = "audio.wav"
    result = model.transcribe(path, task=task, fp16=False)
segments = result["segments"]
print(segments)
with contextlib.closing(wave.open(path,'r')) as f:
frames = f.getnframes()
rate = f.getframerate()
duration = frames / float(rate)
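    # Embed every transcribed segment, then cluster the embeddings into num_speakers groups.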
embeddings = np.zeros(shape=(len(segments), 192))
for i, segment in enumerate(segments):
embeddings[i] = segment_embedding(segment,duration=duration,path=path)
embeddings = np.nan_to_num(embeddings)
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_
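    # Label each segment with its cluster and emit a speaker header whenever the speaker changes.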
output_text=""
for i in range(len(segments)):
segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
for (i, segment) in enumerate(segments):
if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
output_text += "\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n'
output_text += segment["text"][1:] + ' '
return output_text
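# Build an HTML iframe embed for the given YouTube URL (naive '?v=' parsing).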
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
return f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'
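# Download the best available audio stream with yt-dlp and convert it to WAV via ffmpeg.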
def download_yt_audio(yt_url, filename):
ydl_opts = {
"format": "bestaudio/best",
"outtmpl": filename,
"postprocessors": [{
"key": "FFmpegExtractAudio",
"preferredcodec": "wav",
"preferredquality": "192",
}],
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
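        # Sketch (added here as an assumption): enforce the YT_LENGTH_LIMIT_S cap defined above
        # by checking the duration yt-dlp reports before downloading anything.
        info = ydl.extract_info(yt_url, download=False)
        if info.get("duration", 0) > YT_LENGTH_LIMIT_S:
            raise gr.Error("Maximum YouTube length is 1 hour; the requested video is longer.")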
ydl.download([yt_url])
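# YouTube tab: embed the video and transcribe its audio from a temporary download (no diarization on this path).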
@spaces.GPU
def yt_transcribe(yt_url, task):
html_embed_str = _return_yt_html_embed(yt_url)
with tempfile.TemporaryDirectory() as tmpdirname:
filepath = os.path.join(tmpdirname, "audio.wav")
download_yt_audio(yt_url, filepath)
        result = model.transcribe(filepath, task=task, fp16=False)
return html_embed_str, result["text"]
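# Gradio UI: three tabbed interfaces (microphone, file upload, YouTube URL) built on the same Whisper model.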
demo = gr.Blocks(theme=gr.themes.Ocean())
mf_transcribe = gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(sources="microphone", type="filepath"),
gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
],
outputs="text",
title="VerbaLens Demo 1 : Prototype",
description="Transcribe long-form microphone or audio inputs using WhisperX.",
allow_flagging="never",
)
file_transcribe = gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(sources="upload", type="filepath", label="Audio file"),
gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
],
outputs="text",
title="VerbaLens Demo 1 : Prototype",
description="Transcribe uploaded audio files using WhisperX.",
allow_flagging="never",
)
yt_transcribe = gr.Interface(
fn=yt_transcribe,
inputs=[
gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
],
outputs=["html", "text"],
title="VerbaLens Demo 1 : Prototyping",
description="Transcribe YouTube videos using WhisperX.",
allow_flagging="never",
)
with demo:
gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
demo.queue().launch(ssr_mode=False)