import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import torchaudio
import soundfile as sf

# Load Whisper model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
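
# Note: whisper-large is a multi-gigabyte checkpoint; the first call to
# from_pretrained downloads it, and CPU-only inference will be slow.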

# Load the Hugging Face emotion classifier
emotion_classifier = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=None)
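
# top_k=None makes the pipeline return scores for all 28 GoEmotions labels
# rather than only the single highest-scoring one.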

# Define a function to process audio and analyze emotions
def transcribe_and_analyze(audio_path):
    # Load audio from the provided file
    audio, sample_rate = sf.read(audio_path)

    # Whisper expects mono audio, so average the channels if the file is stereo
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # Resample audio to 16000 Hz if necessary
    if sample_rate != 16000:
        audio_tensor = torchaudio.functional.resample(
            torch.tensor(audio, dtype=torch.float32),
            orig_freq=sample_rate,
            new_freq=16000,
        )
        audio = audio_tensor.numpy()  # Convert back to a numpy array

    # Extract log-mel features and generate token ids with Whisper
    input_features = processor(audio, sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(input_features.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Analyze emotions in the transcription
    emotions = emotion_classifier(transcription)
    return transcription, emotions
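
# Quick sanity check without launching the UI (a sketch: "sample.wav" is a
# hypothetical local recording; substitute any real audio file):
# text, emotions = transcribe_and_analyze("sample.wav")
# print(text, emotions)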

# Create Gradio interface
interface = gr.Interface(
    fn=transcribe_and_analyze,
    inputs=gr.Audio(type="filepath"),  # Accept audio input
    outputs=[
        gr.Textbox(label="Transcription"),  # Display transcription
        gr.JSON(label="Emotion Analysis")  # Display emotion analysis
    ],
    title="Audio to Emotion Analysis"
)

# Launch the Gradio app
interface.launch()
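
# Gradio can also expose a temporary public link for quick demos, e.g.:
# interface.launch(share=True)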