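"""Multimodal emotion recognition demo.

Pipeline, as implemented below:
  1. Extract the audio track from an uploaded video (moviepy), convert it to WAV
     (pydub) and transcribe it with Whisper.
  2. Classify the transcript's emotion with cardiffnlp/twitter-roberta-base-emotion.
  3. Sample video frames with OpenCV, caption them with BLIP and classify each
     caption's emotion with j-hartmann/emotion-english-distilroberta-base.
  4. Combine the text and frame emotion scores and report the results through a
     Gradio interface.
"""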
import os
import subprocess
import gradio as gr
import whisper
import yt_dlp
import torch
import numpy as np
from moviepy.editor import VideoFileClip
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BlipProcessor, BlipForConditionalGeneration
import cv2
emotion_labels = ['anger', 'joy', 'optimism', 'sad']
def extract_audio_from_video(video_path):
    video_clip = VideoFileClip(video_path)
    audio_output = os.path.join('./', 'audio.mp3')
    audio_clip = video_clip.audio
    audio_clip.write_audiofile(audio_output)
    return audio_output
def convert_mp3_to_wav(mp3_path):
    from pydub import AudioSegment
    audio = AudioSegment.from_mp3(mp3_path)
    wav_output = os.path.join('./', 'audio.wav')
    audio.export(wav_output, format="wav")
    return wav_output
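# Note: both moviepy's audio export and pydub's MP3 decoding rely on an ffmpeg
# binary being available on the system. Whisper also loads audio via ffmpeg, so it
# could transcribe the MP3 directly; the WAV conversion above is kept as in the
# original flow.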
def process_text(text):
    # Note: the tokenizer and model are re-loaded on every call; caching them at
    # module level (as done for the caption/emotion models below) would be faster.
    model_name = "cardiffnlp/twitter-roberta-base-emotion"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    emotion_probs = torch.softmax(logits, dim=-1).squeeze()
    predicted_emotion = emotion_labels[torch.argmax(emotion_probs)]
    emotion_dict = {emotion_labels[i]: emotion_probs[i].item() for i in range(len(emotion_labels))}
    return emotion_dict, predicted_emotion
def preprocess_frame(frame):
    # OpenCV delivers frames in BGR order; BLIP expects RGB.
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = cv2.resize(frame, (112, 112))
    pixel_values = caption_processor(images=frame, return_tensors="pt").pixel_values
    return pixel_values
def generate_caption(pixel_values):
    caption_ids = caption_model.generate(pixel_values)
    caption = caption_processor.batch_decode(caption_ids, skip_special_tokens=True)[0]
    return caption
def predict_emotions(caption):
    inputs = emotion_tokenizer(caption, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = emotion_model(**inputs)
    emotion_probs = torch.softmax(outputs.logits, dim=1)[0]
    # Use the label names from the model's own config: the j-hartmann checkpoint has
    # seven classes, so zipping against the four-label text list above would silently
    # drop scores and mislabel the rest.
    id2label = emotion_model.config.id2label
    predicted_emotions = {id2label[i]: prob.item() for i, prob in enumerate(emotion_probs)}
    return predicted_emotions
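# Per-frame usage (illustrative only; `frame_bgr` stands for a frame read via
# cv2.VideoCapture, as done in analyze_video below):
#   pixel_values = preprocess_frame(frame_bgr)
#   caption = generate_caption(pixel_values)   # e.g. "a man riding a bike"
#   emotions = predict_emotions(caption)       # {'anger': ..., 'joy': ..., ...}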
# Models for image captioning and emotion analysis
caption_model_name = "Salesforce/blip-image-captioning-base"
caption_processor = BlipProcessor.from_pretrained(caption_model_name)
caption_model = BlipForConditionalGeneration.from_pretrained(caption_model_name)
emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
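# Optional (not part of the original script): everything above runs on CPU. A GPU
# could be used instead, but each tensor built in the helper functions would then
# also need to be moved with .to(device) before the forward pass. Sketch:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   caption_model = caption_model.to(device)
#   emotion_model = emotion_model.to(device)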
def analyze_video(video=None, video_url=None):
    # Depending on the Gradio version, the File component passes either a filepath
    # string or a tempfile object with a .name attribute.
    video_path = None
    if video is not None:
        video_path = video if isinstance(video, str) else getattr(video, "name", None)
    # For a YouTube URL the video is only embedded/streamed in the UI, so video_path
    # stays None and no processing happens.

    if video_path:
        # Speech-to-text and text-based emotion analysis
        audio_path = extract_audio_from_video(video_path)
        audio_wav_path = convert_mp3_to_wav(audio_path)
        model_whisper = whisper.load_model("base")
        result_whisper = model_whisper.transcribe(audio_wav_path)
        transcript = result_whisper['text']
        emotion_dict_text, predicted_emotion_text = process_text(transcript)

        # Frame-wise emotion detection: caption every 120th frame and classify the caption
        n_frame_interval = 120
        emotion_vectors_video = []
        emotion_labels_video = None
        video_capture = cv2.VideoCapture(video_path)
        total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
        frame_count_video = 0
        while video_capture.isOpened():
            ret_video, frame_video = video_capture.read()
            if not ret_video or frame_count_video > total_frames_video:
                break
            if frame_count_video % n_frame_interval == 0:
                pixel_values_video = preprocess_frame(frame_video)
                caption_video = generate_caption(pixel_values_video)
                predicted_emotions_video = predict_emotions(caption_video)
                if emotion_labels_video is None:
                    emotion_labels_video = list(predicted_emotions_video.keys())
                emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
            frame_count_video += 1
        video_capture.release()

        # Combine the text scores with the averaged frame scores. The combined label
        # list must mirror the concatenated score vector; indexing only the four text
        # labels would fail (or mislabel) whenever a frame emotion wins the argmax.
        combined_labels = list(emotion_dict_text.keys())
        combined_scores = np.array(list(emotion_dict_text.values()))
        if emotion_vectors_video:
            average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
            combined_labels += emotion_labels_video
            combined_scores = np.concatenate((combined_scores, average_emotion_vector_video))
        final_most_predicted_index = int(np.argmax(combined_scores))
        final_most_predicted_emotion = combined_labels[final_most_predicted_index]

        return transcript, predicted_emotion_text, final_most_predicted_emotion
    else:
        # For streaming, return an empty analysis; the embedding is handled in the Gradio UI.
        return None, "Streaming video detected (no processing).", "N/A"
# Gradio Interface
with gr.Blocks() as iface:
    gr.Markdown("# 🎥 Multimodal Emotion Recognition\nUpload a video or input a YouTube video URL to analyze emotions from audio and video frames.")
    with gr.Tabs():
        with gr.TabItem("Upload Video"):
            video_file = gr.File(label="Upload Video File", file_types=["video"])
            submit_button_file = gr.Button("Analyze Uploaded Video")
        with gr.TabItem("YouTube URL"):
            video_url = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL")
            submit_button_url = gr.Button("Analyze YouTube Video")
    with gr.Row():
        transcript_output = gr.Textbox(label="Transcript", interactive=False)
        audio_emotion_output = gr.Textbox(label="Emotion from Audio and Text", interactive=False)
        visual_emotion_output = gr.Textbox(label="Emotion from Video", interactive=False)

    # For an uploaded video: the file goes to the `video` parameter.
    submit_button_file.click(analyze_video,
                             inputs=video_file,
                             outputs=[transcript_output, audio_emotion_output, visual_emotion_output])
    # For a YouTube URL (streaming, no downloading): route the textbox value to the
    # `video_url` parameter; passed positionally it would be misread as a file path.
    submit_button_url.click(lambda url: analyze_video(video=None, video_url=url),
                            inputs=video_url,
                            outputs=[transcript_output, audio_emotion_output, visual_emotion_output])

if __name__ == "__main__":
    iface.launch()