import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
import tempfile
import os
import locale
import whisper
import datetime
import subprocess
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import wave
import contextlib
from sklearn.cluster import AgglomerativeClustering
import numpy as np
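
# Runtime and model configuration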
device = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 4  # unused with openai-whisper; kept for a future batched backend
FILE_LIMIT_MB = 100  # not currently enforced
COMPUTE_TYPE = "float32"  # unused with openai-whisper; kept for a faster-whisper backend
YT_LENGTH_LIMIT_S = 600  # limit YouTube downloads to 10 minutes
num_speakers = 2
language = "French"
model_size = "tiny"
model_name = model_size
# Work around environments (e.g. Colab) whose reported locale is not UTF-8,
# which can break subprocess output decoding.
def getpreferredencoding(do_setlocale=True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding
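# The ECAPA-TDNN speaker-embedding model (192-dim embeddings) stays on CPU;
# only Whisper is moved to the GPU when one is available.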
embedding_model = PretrainedSpeakerEmbedding(
"speechbrain/spkrec-ecapa-voxceleb",
device=torch.device("cpu"))
model = whisper.load_model(model_size).to(device)
audio = Audio()
def segment_embedding(segment, duration, path):
start = segment["start"]
# Whisper overshoots the end timestamp in the last segment
end = min(duration, segment["end"])
clip = Segment(start, end)
waveform, sample_rate = audio.crop(path, clip)
# Convert waveform to single channel
waveform = waveform.mean(dim=0, keepdim=True)
return embedding_model(waveform.unsqueeze(0))
def format_timestamp(secs):
    # Currently unused helper; renamed so it does not shadow the standard time module
    return datetime.timedelta(seconds=round(secs))
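# Pipeline: Whisper produces timestamped segments, each segment gets a speaker
# embedding, and agglomerative clustering assigns the segments to num_speakers speakers.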
@spaces.GPU
def transcribe(path, task):
    if path is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    if not path.lower().endswith(".wav"):
        # Convert to WAV; ffmpeg's -y (overwrite) flag must precede the output file
        subprocess.call(["ffmpeg", "-y", "-i", path, "audio.wav"])
        path = "audio.wav"
    result = model.transcribe(path, fp16=False, task=task, language=language)
    segments = result["segments"]
with contextlib.closing(wave.open(path,'r')) as f:
frames = f.getnframes()
rate = f.getframerate()
duration = frames / float(rate)
    embeddings = np.zeros(shape=(len(segments), 192))  # ECAPA embeddings are 192-dim
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment, duration=duration, path=path)
embeddings = np.nan_to_num(embeddings)
clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
labels = clustering.labels_
output_text=""
for i in range(len(segments)):
segments[i]["speaker"] = '**SPEAKER ' + str(labels[i] + 1) + "**"
for (i, segment) in enumerate(segments):
if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
output_text += " "+segment["speaker"] + ' : '
output_text += segment["text"][1:] + ' <br> '
return output_text
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
return f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'
def download_yt_audio(yt_url, filename):
    # Download the best audio stream and have ffmpeg convert it to WAV
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": filename,
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192",
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        # Enforce the length limit before downloading
        info = ydl.extract_info(yt_url, download=False)
        if info.get("duration", 0) > YT_LENGTH_LIMIT_S:
            raise gr.Error(f"Maximum YouTube length is {YT_LENGTH_LIMIT_S} seconds, got a video of {info['duration']} seconds.")
        ydl.download([yt_url])
@spaces.GPU
def yt_transcribe(yt_url, task):
    html_embed_str = _return_yt_html_embed(yt_url)
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "audio.wav")
        download_yt_audio(yt_url, filepath)
        # Transcribe the downloaded file; openai-whisper's transcribe() takes a
        # file path and has no batch_size argument
        result = model.transcribe(filepath, fp16=False, task=task, language=language)
    return html_embed_str, result["text"]
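# Gradio UI: one tab per input source (microphone, file, YouTube)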
demo = gr.Blocks(theme=gr.themes.Ocean())
mf_transcribe = gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(sources="microphone", type="filepath"),
gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
],
outputs="text",
title="VerbaLens Demo 1 : Prototype",
description="Transcribe long-form microphone or audio inputs using WhisperX.",
allow_flagging="never",
)
file_transcribe = gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(sources="upload", type="filepath", label="Audio file"),
gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
],
    outputs=gr.Markdown(label="Markdown output", height=500),
title="VerbaLens Demo 1 : Prototype",
description="Transcribe uploaded audio files using WhisperX.",
allow_flagging="never",
)
# Use a distinct name so the Interface does not shadow the yt_transcribe function
yt_transcribe_demo = gr.Interface(
    fn=yt_transcribe,
inputs=[
gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
],
outputs=["html", "text"],
title="VerbaLens Demo 1 : Prototyping",
description="Transcribe YouTube videos using WhisperX.",
allow_flagging="never",
)
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe_demo], ["Microphone", "Audio file", "YouTube"])
demo.queue().launch(ssr_mode=False)