import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import torchaudio
import soundfile as sf

# Load the Whisper model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")

# Load the Hugging Face emotion classifier (top_k=None returns scores for all labels)
emotion_classifier = pipeline(
    "text-classification",
    model="SamLowe/roberta-base-go_emotions",
    top_k=None,
)

# Define a function to transcribe audio and analyze emotions in the text
def transcribe_and_analyze(audio_path):
    # Load audio from the provided file
    audio, sample_rate = sf.read(audio_path)

    # Mix down to mono if the file has multiple channels
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample to the 16 kHz rate Whisper expects, if necessary
    if sample_rate != 16000:
        audio_tensor = torchaudio.functional.resample(
            torch.from_numpy(audio).float(), orig_freq=sample_rate, new_freq=16000
        )
        audio = audio_tensor.numpy()  # Convert back to a numpy array

    # Convert the waveform to Whisper input features and transcribe
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(input_features.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Analyze emotions in the transcription
    emotions = emotion_classifier(transcription)

    return transcription, emotions

# Create the Gradio interface
interface = gr.Interface(
    fn=transcribe_and_analyze,
    inputs=gr.Audio(type="filepath"),       # Accept an audio file as input
    outputs=[
        gr.Textbox(label="Transcription"),  # Display the transcription
        gr.JSON(label="Emotion Analysis"),  # Display the emotion scores
    ],
    title="Audio to Emotion Analysis",
)

# Launch the Gradio app
interface.launch()
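
# A minimal sketch of exercising the pipeline without the UI, assuming a local
# WAV file at "sample.wav" (hypothetical path). Run this in place of
# interface.launch(), which blocks until the server is shut down:
#
#   text, emotions = transcribe_and_analyze("sample.wav")
#   print(text)
#   print(emotions)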