import os

# Whisper is installed at startup here; on Hugging Face Spaces this is more
# commonly pinned in requirements.txt, but a runtime install works as well.
os.system("pip install git+https://github.com/openai/whisper.git")

import pickle

import gradio as gr
import librosa
import numpy as np
import whisper
from huggingface_hub import from_pretrained_keras
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
# Load the tokenizer and RoBERTa model used for text sentiment classification
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# Build the sentiment pipeline once at startup instead of on every request
sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)

# Load the Whisper model for speech-to-text
model = whisper.load_model("small")

# Load the Keras model for audio emotion classification
reloaded_model = from_pretrained_keras("jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier")

# Load the fitted feature scaler and label encoder
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)
with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)
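# The two pickles above are assumed to come from the training run of the
# emotion classifier. A minimal sketch of how they were presumably produced
# (X_train and y_train are hypothetical, not part of this app):
#
#     from sklearn.preprocessing import StandardScaler, OneHotEncoder
#     scaler = StandardScaler().fit(X_train)                  # feature matrix
#     encoder = OneHotEncoder().fit(y_train.reshape(-1, 1))   # emotion labels
#     with open("scaler.pkl", "wb") as f:
#         pickle.dump(scaler, f)
#     with open("encoder.pkl", "wb") as f:
#         pickle.dump(encoder, f)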
def inference_audio(audio):
    # Load the audio file and pad/trim it to Whisper's 30-second window
    audio = whisper.load_audio(audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-Mel spectrogram on the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # Detect the spoken language (the probabilities are not used further)
    _, probs = model.detect_language(mel)

    # Decode without fp16, since CPU-only Spaces lack half-precision support
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text
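# Usage sketch, assuming a local recording (the filename is hypothetical):
#
#     transcript = inference_audio("sample.wav")
#     print(transcript)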
def inference_text(audio):
    # Transcribe the audio, then classify the sentiment of the transcript
    text = inference_audio(audio)
    res = sentiment_task(text)[0]
    return text, res["label"], res["score"]
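# The pipeline returns a list with one dict per input, e.g. (the score is
# illustrative): [{"label": "positive", "score": 0.98}]. This cardiffnlp
# model distinguishes negative, neutral, and positive.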
def extract_features(data, sample_rate):
    # Zero-crossing rate
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))

    # Chromagram from the short-time Fourier transform
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft))

    # Mel-frequency cepstral coefficients
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc))

    # Root mean square energy
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms))

    # Mel spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel))
    return result
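# With librosa's defaults this yields a 162-dimensional vector per clip
# (1 ZCR + 12 chroma + 20 MFCC + 1 RMS + 128 mel bands). Usage sketch,
# with a hypothetical filename:
#
#     data, sr = librosa.load("sample.wav")
#     features = extract_features(data, sr)   # shape (162,)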
""" | |
def audio_emotions(audio): | |
sr,data = audio | |
features_audio = extract_features(data) | |
features_audio = np.array(features_audio) | |
scaled_features=scaler.transform(features_audio) | |
scaled_features = np.expand_dims(scaled_features, axis=2) | |
prediction=reloaded_model.predict(scaled_features) | |
y_pred = encoder.inverse_transform(prediction) | |
return y_pred | |
""" | |
def main(audio):
    text, label, score = inference_text(audio)
    # emotion = audio_emotions(audio)  # re-enable once the function above is active
    return text, label, score
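# End to end, main() returns the transcript, the sentiment label, and its
# confidence, e.g. (values are illustrative, not real output):
#
#     main("sample.wav")  # -> ("i am glad to be here", "positive", 0.97)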
audio = gr.Audio(
    label="Input Audio",
    show_label=False,
    source="microphone",
    type="filepath",
)

app = gr.Interface(
    title="Sentiment Audio Analysis",
    fn=main,
    inputs=audio,
    outputs=["text", "text", "text"],
).launch(debug=True)