import os
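# Install Whisper from its GitHub repository at app startup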
os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper
from huggingface_hub import from_pretrained_keras
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.preprocessing import StandardScaler
import logging
import librosa
import numpy as np
import pickle
# Load tokenizer and NLP model for text sentiment classification
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
# Load Whisper model for audio/speech transcription
model = whisper.load_model("small")
# Load Keras model for audio emotion classification
reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')
# Load the fitted scaler and label encoder
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)
with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)
def inference_audio(audio):
    # Load the audio file and pad/trim it to fit Whisper's 30-second window
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)
    # Compute the log-Mel spectrogram and move it to the model's device
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # Detect the spoken language (the probabilities are not used further)
    _, probs = model.detect_language(mel)
    # Decode the audio to text
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text
def inference_text(audio):
    # Transcribe the audio, then run sentiment analysis on the transcript
    text = inference_audio(audio)
    sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)
    res = sentiment_task(text)[0]
    return text, res['label'], res['score']
def extract_features(data, sample_rate):
    # Build a single feature vector matching the one used to train the emotion model
    result = np.array([])
    # Zero Crossing Rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))  # stacking horizontally
    # Chroma STFT
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft))
    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))
    # Root Mean Square value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms))
    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel))
    return result
"""
def audio_emotions(audio):
sr,data = audio
features_audio = extract_features(data)
features_audio = np.array(features_audio)
scaled_features=scaler.transform(features_audio)
scaled_features = np.expand_dims(scaled_features, axis=2)
prediction=reloaded_model.predict(scaled_features)
y_pred = encoder.inverse_transform(prediction)
return y_pred
"""
def main(audio):
    r1, r2, r3 = inference_text(audio)
    # r3 = audio_emotions(audio)
    return r1, r2, r3
audio = gr.Audio(
    label="Input Audio",
    show_label=False,
    source="microphone",
    type="filepath"
)
app = gr.Interface(
    title="Sentiment Audio Analysis",
    fn=main,
    inputs=audio,
    outputs=["text", "text", "text"],
).launch(debug=True)