"""Gradio demo: sentiment and emotion analysis of a spoken recording.

For one audio recording the app reports:

* the sentiment label and score of the Whisper transcription, classified by
  the ``cardiffnlp/twitter-roberta-base-sentiment-latest`` model, and
* the output of the ``jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier`` Keras
  model applied to hand-crafted librosa features of the raw signal.
"""
import os

# Whisper is installed from GitHub at start-up; this must run before
# ``import whisper`` below.
os.system("pip install git+https://github.com/openai/whisper.git")

import gradio as gr
import whisper
from huggingface_hub import from_pretrained_keras
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.preprocessing import StandardScaler
import logging
import librosa
import numpy as np
import pickle

# Sample rate used both to load audio and to compute the librosa features.
SAMPLE_RATE = 22050

# Tokenizer and NLP model for text classification.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model_nlp = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

# Built once here instead of on every request.
sentiment_task = pipeline("sentiment-analysis", model=model_nlp, tokenizer=tokenizer)

# Whisper model for audio/speech processing.
model = whisper.load_model("small")

# Model for audio emotions.
reloaded_model = from_pretrained_keras('jmparejaz/RAVDESS-CREMAD_AudioEmotionClassifier')

# Scaler and label decoder.
# SECURITY NOTE: pickle.load executes arbitrary code from the file. These two
# files must be trusted artifacts shipped with the app, never user uploads.
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

with open("encoder.pkl", "rb") as f:
    encoder = pickle.load(f)


def inference_audio(audio):
    """Transcribe an audio file with Whisper.

    Args:
        audio: Path of the audio file, as accepted by ``whisper.load_audio``.

    Returns:
        The decoded text.
    """
    waveform = whisper.load_audio(audio)
    waveform = whisper.pad_or_trim(waveform)
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)
    # No explicit model.detect_language() call: its result was discarded, and
    # whisper.decode detects the language itself when none is specified.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    return result.text


def inference_text(audio):
    """Transcribe ``audio`` and classify the sentiment of the transcription.

    Args:
        audio: Path of the audio file to transcribe.

    Returns:
        A ``(label, score)`` tuple taken from the first pipeline result.
    """
    text = inference_audio(audio)
    res = sentiment_task(text)[0]
    return res['label'], res['score']


def extract_features(data, sample_rate=SAMPLE_RATE):
    """Build one flat feature vector from an audio signal.

    Each librosa feature is averaged over its frames, and the per-feature
    means are concatenated in this order: zero-crossing rate, chroma STFT,
    MFCC, root-mean-square energy, mel spectrogram.

    Args:
        data: 1-D audio signal.
        sample_rate: Sampling rate of ``data`` in Hz. The original code read
            an undefined global here, which raised ``NameError``.

    Returns:
        1-D ``float64`` array with all feature means stacked horizontally.
    """
    # ZCR
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)

    # Cast to float64 to match the dtype produced by the original version,
    # which accumulated the features into an initially empty float64 array.
    return np.hstack((zcr, chroma_stft, mfcc, rms, mel)).astype(np.float64)


def audio_emotions(audio):
    """Run the audio-emotion model on a recording.

    Args:
        audio: Either a path to an audio file (what the Gradio input passes,
            since it uses ``type="filepath"``) or an array of samples that is
            flattened to one dimension.

    Returns:
        The result of ``encoder.inverse_transform`` on the model prediction.
    """
    if isinstance(audio, (str, os.PathLike)):
        # A path has no .flatten(); load and resample the file instead.
        data, _ = librosa.load(audio, sr=SAMPLE_RATE)
    else:
        data = np.asarray(audio).flatten()

    features_audio = extract_features(data, SAMPLE_RATE)
    # The scaler needs 2-D input (n_samples, n_features): one sample here.
    features_audio = np.asarray(features_audio).reshape(1, -1)
    scaled_features = scaler.transform(features_audio)
    # Append a trailing axis -> (1, n_features, 1).
    # NOTE(review): presumably the layout the Keras model expects; confirm.
    scaled_features = np.expand_dims(scaled_features, axis=2)
    prediction = reloaded_model.predict(scaled_features)
    y_pred = encoder.inverse_transform(prediction)
    return y_pred


def main(audio):
    """Gradio callback: analyse one recording.

    Args:
        audio: Path of the recorded audio file, or ``None`` if nothing was
            recorded.

    Returns:
        ``(sentiment_label, sentiment_score, audio_emotion)``.

    Raises:
        ValueError: If no audio was provided.
    """
    if audio is None:
        raise ValueError("No audio was provided.")
    r1, r2 = inference_text(audio)
    r3 = audio_emotions(audio)
    return r1, r2, r3


audio = gr.Audio(
    label="Input Audio",
    show_label=False,
    source="microphone",
    type="filepath"
)

app = gr.Interface(title="Sentiment Audio Analysis", fn=main, inputs=[audio], outputs=["text", "text", "text"])
app.launch(share=True)