Spaces:

Pradheep1647
/

multi-modal-emotion-recognition

Sleeping

App Files Files Community

Pradheep1647 committed on Sep 19

Commit

1be32a3

•

1 Parent(s): 909f75a

Made some changes to app.py (OAuth2)

Browse files

Files changed (1) hide show

app.py +21 -2

app.py CHANGED Viewed

@@ -11,10 +11,15 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import cv2
 def download_youtube_video(video_url, output_path):
     ydl_opts = {
         'format': 'bestvideo+bestaudio',
         'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([video_url])
@@ -92,21 +97,32 @@ emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model
 def analyze_video(video_url):
     global output_path
     output_path = './'
     video_path = download_youtube_video(video_url, output_path)
     mp4_path = convert_to_mp4(video_path, output_path)
     audio_path = extract_audio_from_video(mp4_path)
     audio_wav_path = convert_mp3_to_wav(audio_path)
     model_whisper = whisper.load_model("base")
     result_whisper = model_whisper.transcribe(audio_wav_path)
     transcript = result_whisper['text']
     emotion_dict_text, predicted_emotion_text = process_text(transcript)
     n_frame_interval = 60
     emotion_vectors_video = []
     video_capture = cv2.VideoCapture(mp4_path)
     total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
     frame_count_video = 0
     while video_capture.isOpened():
@@ -118,7 +134,7 @@ def analyze_video(video_url):
         if frame_count_video % n_frame_interval == 0:
             pixel_values_video = preprocess_frame(frame_video)
             caption_video = generate_caption(pixel_values_video)
-            predicted_emotions_video, _ = predict_emotions(caption_video)
             emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
         frame_count_video += 1
@@ -126,8 +142,11 @@ def analyze_video(video_url):
     video_capture.release()
     average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
     combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
     final_most_predicted_index = np.argmax(combined_emotion_vector_final)
     final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
     return transcript, predicted_emotion_text, final_most_predicted_emotion
@@ -139,4 +158,4 @@ iface = gr.Interface(fn=analyze_video,
                      description="Enter a YouTube Video URL to analyze emotions from both audio and visual content.")
 if __name__ == "__main__":
-    iface.launch()

 from transformers import BlipProcessor, BlipForConditionalGeneration
 import cv2
+def authenticate_youtube():
+    os.system('yt-dlp --username oauth2 --password ""')
 def download_youtube_video(video_url, output_path):
     ydl_opts = {
         'format': 'bestvideo+bestaudio',
         'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
+        'username': 'oauth2',
+        'password': ''
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([video_url])
 def analyze_video(video_url):
     global output_path
     output_path = './'
+    authenticate_youtube()
     video_path = download_youtube_video(video_url, output_path)
     mp4_path = convert_to_mp4(video_path, output_path)
     audio_path = extract_audio_from_video(mp4_path)
     audio_wav_path = convert_mp3_to_wav(audio_path)
     model_whisper = whisper.load_model("base")
     result_whisper = model_whisper.transcribe(audio_wav_path)
     transcript = result_whisper['text']
     emotion_dict_text, predicted_emotion_text = process_text(transcript)
     n_frame_interval = 60
     emotion_vectors_video = []
     video_capture = cv2.VideoCapture(mp4_path)
     total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
     frame_count_video = 0
     while video_capture.isOpened():
         if frame_count_video % n_frame_interval == 0:
             pixel_values_video = preprocess_frame(frame_video)
             caption_video = generate_caption(pixel_values_video)
+            predicted_emotions_video = predict_emotions(caption_video)
             emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
         frame_count_video += 1
     video_capture.release()
     average_emotion_vector_video = np.mean(emotion_vectors_video, axis=0)
     combined_emotion_vector_final = np.concatenate((np.array(list(emotion_dict_text.values())), average_emotion_vector_video))
     final_most_predicted_index = np.argmax(combined_emotion_vector_final)
     final_most_predicted_emotion = list(emotion_dict_text.keys())[final_most_predicted_index]
     return transcript, predicted_emotion_text, final_most_predicted_emotion
                      description="Enter a YouTube Video URL to analyze emotions from both audio and visual content.")
 if __name__ == "__main__":
+     iface.launch()