import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import torchaudio
import soundfile as sf
# Load the Whisper speech-to-text model and its processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")

# Load the Hugging Face emotion classifier (top_k=None returns scores for all labels)
emotion_classifier = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
# Process an audio file: transcribe it, then analyze emotions in the transcript
def transcribe_and_analyze(audio_path):
    # Load audio from the provided file
    audio, sample_rate = sf.read(audio_path)
    # Downmix multi-channel recordings to mono, since Whisper expects a single channel
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    # Resample to the 16 kHz rate Whisper was trained on, if necessary
    if sample_rate != 16000:
        audio_tensor = torchaudio.functional.resample(
            torch.tensor(audio, dtype=torch.float32), orig_freq=sample_rate, new_freq=16000
        )
        audio = audio_tensor.numpy()  # Convert back to a numpy array for the processor
    # Process audio with Whisper: build log-mel input features, then decode to text
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(input_features.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    # Analyze emotions in the transcription
    emotions = emotion_classifier(transcription)
    return transcription, emotions
# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_analyze,
    inputs=gr.Audio(type="filepath"),  # Accept audio input as a file path
    outputs=[
        gr.Textbox(label="Transcription"),  # Display the transcription
        gr.JSON(label="Emotion Analysis")   # Display per-label emotion scores
    ],
    title="Audio to Emotion Analysis"
)
# Launch the Gradio app
interface.launch()
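
# Optional standalone check: a minimal sketch assuming a local file named
# "sample.wav" exists (hypothetical path). Comment out interface.launch()
# above before running it this way, since launch() blocks the process.
# transcription, emotions = transcribe_and_analyze("sample.wav")
# print(transcription)
# print(emotions)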