shravan / app.py
peb-peb's picture
add UI interface
4e8414c
raw
history blame
4.51 kB
# import whisper
import gradio as gr
import datetime
import subprocess
import wave
import contextlib
# import torch
# import pyannote.audio
# from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# from pyannote.audio import Audio
# from pyannote.core import Segment
# from sklearn.cluster import AgglomerativeClustering
# import numpy as np
# model = whisper.load_model("large-v2")
# embedding_model = PretrainedSpeakerEmbedding(
# "speechbrain/spkrec-ecapa-voxceleb",
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# )
# def transcribe(audio, num_speakers):
# path, error = convert_to_wav(audio)
# if error is not None:
# return error
# duration = get_duration(path)
# if duration > 4 * 60 * 60:
# return "Audio duration too long"
# result = model.transcribe(path)
# segments = result["segments"]
# num_speakers = min(max(round(num_speakers), 1), len(segments))
# if len(segments) == 1:
# segments[0]['speaker'] = 'SPEAKER 1'
# else:
# embeddings = make_embeddings(path, segments, duration)
# add_speaker_labels(segments, embeddings, num_speakers)
# output = get_output(segments)
# return output
# def convert_to_wav(path):
# if path[-3:] != 'wav':
# new_path = '.'.join(path.split('.')[:-1]) + '.wav'
# try:
# subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
# except:
# return path, 'Error: Could not convert file to .wav'
# path = new_path
# return path, None
# def get_duration(path):
# with contextlib.closing(wave.open(path,'r')) as f:
# frames = f.getnframes()
# rate = f.getframerate()
# return frames / float(rate)
# def make_embeddings(path, segments, duration):
# embeddings = np.zeros(shape=(len(segments), 192))
# for i, segment in enumerate(segments):
# embeddings[i] = segment_embedding(path, segment, duration)
# return np.nan_to_num(embeddings)
# audio = Audio()
# def segment_embedding(path, segment, duration):
# start = segment["start"]
# # Whisper overshoots the end timestamp in the last segment
# end = min(duration, segment["end"])
# clip = Segment(start, end)
# waveform, sample_rate = audio.crop(path, clip)
# return embedding_model(waveform[None])
# def add_speaker_labels(segments, embeddings, num_speakers):
# clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
# labels = clustering.labels_
# for i in range(len(segments)):
# segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
# def time(secs):
# return datetime.timedelta(seconds=round(secs))
# def get_output(segments):
# output = ''
# for (i, segment) in enumerate(segments):
# if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
# if i != 0:
# output += '\n\n'
# output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
# output += segment["text"][1:] + ' '
# return output
# Module-level string shared by the greeters: greet1() overwrites it and
# greet2() appends its current value to the greeting it returns.
s = ""


def greet1(name):
    """Build the "Hello" greeting for ``name``.

    Side effect: rebinds the module-level ``s`` to ``"modified"`` before the
    greeting is assembled.
    """
    global s
    s = "modified"
    return "".join(("Hello ", name, "!"))
def greet2(name):
    """Return the "Hi" greeting for ``name`` followed by a space and ``s``.

    ``s`` is the module-level string; it is read at call time.
    """
    pieces = ("Hi ", name, "!", " ", s)
    return "".join(pieces)
def greet3(name):
    """Return the "Hola" greeting for ``name``."""
    prefix, suffix = "Hola ", "!"
    return prefix + name + suffix
# Gradio UI: a two-column layout (upload controls on the left, result text
# boxes on the right), followed by an examples section and a footer note.
with gr.Blocks() as demo:
    with gr.Row():
        # Left column: inputs and action buttons.
        with gr.Column():
            # file_count must be one of "single" / "multiple" / "directory";
            # the previous integer value 1 is not a documented mode.
            audio_file = gr.File(
                label="Upload a Audio file (.wav)",
                file_count="single",
            )
            # name = gr.Textbox(label="Name", placeholder="Name") # TODO: remove
            number_of_speakers = gr.Number(label="Number of Speakers", value=2)
            with gr.Row():
                # NOTE(review): neither button has a click handler attached
                # in this file, so pressing them currently does nothing.
                btn_clear = gr.Button(value="Clear")
                btn_submit = gr.Button(value="Submit")
        # Right column: text boxes intended to display analysis results.
        with gr.Column():
            title = gr.Textbox(
                label="Title",
                placeholder="Title for Conversation",
            )
            short_summary = gr.Textbox(
                label="Short Summary",
                placeholder="Short Summary for Conversation",
            )
            sentiment_analysis = gr.Textbox(
                label="Sentiment Analysis",
                placeholder="Sentiment Analysis for Conversation",
            )
            quality = gr.Textbox(
                label="Quality of Conversation",
                placeholder="Quality of Conversation",
            )
            detailed_summary = gr.Textbox(
                label="Detailed Summary",
                placeholder="Detailed Summary for Conversation",
            )

    gr.Markdown("## Examples")
    # Each example feeds `title` into greet1 and shows the result in
    # `short_summary`; results are precomputed because cache_examples=True.
    # NOTE(review): every example row carries a second value (2) that has no
    # matching component in `inputs` -- confirm whether `number_of_speakers`
    # was meant to be listed as an input as well.
    gr.Examples(
        examples=[
            [
                "Harsh",
                2,
            ],
            [
                "Rahul",
                2,
            ],
        ],
        inputs=[title],
        outputs=[short_summary],
        fn=greet1,
        cache_examples=True,
    )
    # Footer note; the text is kept exactly as in the original string.
    gr.Markdown(
        "\n"
        "See [github.com/facebookresearch/audiocraft]"
        "(https://github.com/facebookresearch/audiocraft)\n"
        "for more details.\n"
    )

# Start the Gradio server when this file is executed.
demo.launch()