Spaces:

Ngoufack
/

verbalens

Running

App Files Files Community

verbalens / app.py

Ngoufack

sad

6a7764f 4 months ago

raw

history blame contribute delete

5.13 kB

	import spaces
	import torch
	import gradio as gr
	import yt_dlp as youtube_dl
	import tempfile
	import os
	import locale
	import whisper
	import datetime
	import subprocess
	import pyannote.audio
	from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
	from pyannote.audio import Audio
	from pyannote.core import Segment
	import wave
	import contextlib
	from sklearn.cluster import AgglomerativeClustering
	import numpy as np

	# Device the Whisper model is moved to: the GPU when torch can see one, else the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	BATCH_SIZE = 4
	FILE_LIMIT_MB = 100
	COMPUTE_TYPE = "float32"
	YT_LENGTH_LIMIT_S = 600 # 600 seconds is 10 minutes (this comment used to say "1 hour"); NOTE(review): confirm the intended limit

	# Number of clusters requested when grouping segment embeddings into speakers.
	num_speakers = 2
	language = "French"
	# Whisper checkpoint name handed to whisper.load_model().
	model_size = 'tiny'
	model_name = model_size

	def getpreferredencoding(do_setlocale=True):
	    """Return the fixed encoding name "UTF-8".

	    Stand-in for ``locale.getpreferredencoding``; ``do_setlocale`` is accepted
	    only so the call signature matches and is otherwise ignored.
	    """
	    encoding_name = "UTF-8"
	    return encoding_name

	# Replace locale.getpreferredencoding with the stub defined above so that any
	# later call reports "UTF-8".
	locale.getpreferredencoding = getpreferredencoding
	# Speaker-embedding model, explicitly placed on the CPU.
	embedding_model = PretrainedSpeakerEmbedding(
	"speechbrain/spkrec-ecapa-voxceleb",
	device=torch.device("cpu"))
	# Whisper checkpoint selected by `model_size`, moved to `device`.
	model = whisper.load_model(model_size).to(device)
	# pyannote Audio helper; segment_embedding() uses its crop() method.
	audio = Audio()

	def segment_embedding(segment,duration,path):
	    """Return the speaker embedding for one transcription segment.

	    Args:
	        segment: Mapping with "start" and "end" times for the segment.
	        duration: Length of the audio file; used as an upper bound for "end".
	        path: Audio file the clip is cropped from.

	    Returns:
	        Whatever ``embedding_model`` returns for the cropped, channel-averaged clip.
	    """
	    clip_start = segment["start"]
	    # Whisper can report an end time past the end of the file for the last
	    # segment, so cap it at the file duration.
	    clip_end = min(duration, segment["end"])
	    window = Segment(clip_start, clip_end)

	    samples, _sample_rate = audio.crop(path, window)

	    # Average over dim 0 to get a single channel
	    # (assumes crop() returns (channel, time) -- TODO confirm).
	    mono = samples.mean(dim=0, keepdim=True)
	    batch = mono.unsqueeze(0)
	    return embedding_model(batch)

	def time(secs):
	    """Convert a number of seconds to a ``datetime.timedelta`` rounded to whole seconds."""
	    whole_seconds = round(secs)
	    return datetime.timedelta(seconds=whole_seconds)

	def _assign_speaker_labels(embeddings):
	    """Cluster segment embeddings into speaker labels (0-based integers)."""
	    count = len(embeddings)
	    # Agglomerative clustering needs at least two samples and no more clusters
	    # than samples; with fewer segments, give each segment its own speaker.
	    if count < max(2, num_speakers):
	        return np.arange(count)
	    return AgglomerativeClustering(num_speakers).fit(embeddings).labels_


	def _format_transcript(segments):
	    """Join segments into one string, prefixing each change of speaker."""
	    pieces = []
	    previous_speaker = None
	    for segment in segments:
	        if segment["speaker"] != previous_speaker:
	            pieces.append(" " + segment["speaker"] + ' : ')
	            previous_speaker = segment["speaker"]
	        text = segment["text"]
	        # Drop only a leading space; slicing unconditionally would eat the
	        # first real character of text that does not start with one.
	        if text.startswith(" "):
	            text = text[1:]
	        pieces.append(text + ' <br> ')
	    return "".join(pieces)


	@spaces.GPU
	def transcribe(path, task):
	    """Transcribe an audio file with Whisper and tag each segment with a speaker.

	    Args:
	        path: Path of the submitted audio file, or None when nothing was submitted.
	        task: "transcribe" or "translate"; forwarded to Whisper. A falsy value
	            falls back to "transcribe".

	    Returns:
	        A string in which every segment ends with " <br> " and each change of
	        speaker is introduced by " SPEAKER n : ". Empty when Whisper finds no
	        segments.

	    Raises:
	        gr.Error: If no file was submitted or ffmpeg cannot convert it to WAV.
	    """
	    if path is None:
	        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

	    # Per-request scratch directory, so concurrent requests never share (and
	    # overwrite) one fixed "audio.wav" in the working directory.
	    with tempfile.TemporaryDirectory() as workdir:
	        if os.path.splitext(path)[1].lower() != ".wav":
	            wav_path = os.path.join(workdir, "audio.wav")
	            try:
	                subprocess.run(['ffmpeg', '-y', '-i', path, wav_path], check=True)
	            except (OSError, subprocess.CalledProcessError) as err:
	                raise gr.Error("Could not convert the submitted audio to WAV with ffmpeg.") from err
	            path = wav_path

	        result = model.transcribe(path, task=task or "transcribe", fp16=False)
	        segments = result["segments"]
	        print(segments)
	        if not segments:
	            return ""

	        with wave.open(path, 'rb') as wav_file:
	            duration = wav_file.getnframes() / float(wav_file.getframerate())

	        # 192 is the embedding width this code has always assumed for the
	        # speaker model -- TODO confirm against embedding_model.
	        embeddings = np.zeros(shape=(len(segments), 192))
	        for i, segment in enumerate(segments):
	            embeddings[i] = segment_embedding(segment, duration=duration, path=path)
	        embeddings = np.nan_to_num(embeddings)

	    labels = _assign_speaker_labels(embeddings)
	    for segment, label in zip(segments, labels):
	        segment["speaker"] = 'SPEAKER ' + str(label + 1)

	    return _format_transcript(segments)



	def _return_yt_html_embed(yt_url):
	video_id = yt_url.split("?v=")[-1]
	return f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'

	def download_yt_audio(yt_url, filename):
	    """Download the audio of ``yt_url`` with yt-dlp, using ``filename`` as the output template.

	    The options request the best available audio stream and register an
	    FFmpegExtractAudio post-processor with "wav" as the preferred codec.
	    """
	    extract_wav = {
	        "key": "FFmpegExtractAudio",
	        "preferredcodec": "wav",
	        "preferredquality": "192",
	    }
	    options = dict(
	        format="bestaudio/best",
	        outtmpl=filename,
	        postprocessors=[extract_wav],
	    )

	    with youtube_dl.YoutubeDL(options) as downloader:
	        downloader.download([yt_url])

	@spaces.GPU
	def yt_transcribe(yt_url, task):
	    """Download a YouTube video's audio and transcribe it with Whisper.

	    Args:
	        yt_url: URL of the video.
	        task: "transcribe" or "translate"; forwarded to Whisper. A falsy value
	            falls back to "transcribe".

	    Returns:
	        A ``(html, text)`` tuple: the player embed snippet and the transcript.

	    Raises:
	        gr.Error: If no URL was supplied.
	    """
	    if not yt_url:
	        raise gr.Error("No YouTube URL submitted! Please paste a URL before submitting your request.")

	    html_embed_str = _return_yt_html_embed(yt_url)

	    with tempfile.TemporaryDirectory() as tmpdirname:
	        filepath = os.path.join(tmpdirname, "audio.wav")
	        download_yt_audio(yt_url, filepath)

	        # Transcribe the downloaded file (not the module-level pyannote `audio`
	        # helper) while the temporary directory still exists. Whisper's
	        # transcribe() takes no batch_size option, so none is passed.
	        result = model.transcribe(filepath, task=task or "transcribe", fp16=False)

	    return html_embed_str, result["text"]

	# Top-level Blocks container that hosts the tabbed interfaces.
	demo = gr.Blocks(theme=gr.themes.Ocean())

	# "Microphone" tab: records audio and passes its file path to transcribe().
	mf_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=[
	        gr.Audio(sources="microphone", type="filepath"),
	        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
	    ],
	    # transcribe() separates segments with "<br>"; a plain textbox would show
	    # those tags literally, so render the result as Markdown like the
	    # "Audio file" tab does.
	    outputs=gr.Markdown(label="Sortie Markdown", height=500),
	    title="VerbaLens Demo 1 : Prototype",
	    description="Transcribe long-form microphone or audio inputs using WhisperX.",
	    allow_flagging="never",
	)

	# "Audio file" tab: an uploaded file's path goes to transcribe() and the
	# returned string is rendered as Markdown.
	file_transcribe = gr.Interface(
	    title="VerbaLens Demo 1 : Prototype",
	    description="Transcribe uploaded audio files using WhisperX.",
	    fn=transcribe,
	    inputs=[
	        gr.Audio(label="Audio file", sources="upload", type="filepath"),
	        gr.Radio(["transcribe", "translate"], value="transcribe", label="Task"),
	    ],
	    outputs=gr.Markdown(height=500, label="Sortie Markdown"),
	    allow_flagging="never",
	)

	# "YouTube" tab: takes a URL and a task, returns the player embed (HTML) and
	# the transcript (text).
	# NOTE: this assignment rebinds the module-level name `yt_transcribe` from the
	# function to the Interface. `fn=yt_transcribe` below is evaluated before the
	# rebinding, so it still receives the function.
	yt_transcribe = gr.Interface(
	fn=yt_transcribe,
	inputs=[
	gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
	gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
	],
	outputs=["html", "text"],
	title="VerbaLens Demo 1 : Prototyping",
	description="Transcribe YouTube videos using WhisperX.",
	allow_flagging="never",
	)

	# Mount the three interfaces as tabs inside the Blocks container.
	with demo:
	    gr.TabbedInterface(
	        [mf_transcribe, file_transcribe, yt_transcribe],
	        ["Microphone", "Audio file", "YouTube"],
	    )

	# Enable the request queue, then start the server with SSR turned off.
	demo.queue()
	demo.launch(ssr_mode=False)