Pradheep1647 committed
Commit c3c9064 · Parent: 1be32a3

access browser cookies instead of prompting the user

Files changed (2)
  1. app.py +43 -34
  2. requirements.txt +2 -1
app.py CHANGED
@@ -10,32 +10,42 @@ from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
 from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import cv2
-
-def authenticate_youtube():
-    os.system('yt-dlp --username oauth2 --password ""')
-
-def download_youtube_video(video_url, output_path):
+import browser_cookie3
+
+def get_youtube_cookies(browser):
+    if browser == 'Chrome':
+        return browser_cookie3.chrome()
+    elif browser == 'Firefox':
+        return browser_cookie3.firefox()
+    elif browser == 'Edge':
+        return browser_cookie3.edge()
+    elif browser == 'Brave':
+        return browser_cookie3.brave()
+    else:
+        raise ValueError("Unsupported browser")
+
+def download_youtube_video(video_url, browser):
+    cookies = get_youtube_cookies(browser)
     ydl_opts = {
+        'cookiefile': cookies,
         'format': 'bestvideo+bestaudio',
-        'outtmpl': os.path.join(output_path, '%(title)s.%(ext)s'),
-        'username': 'oauth2',
-        'password': ''
+        'outtmpl': os.path.join('./', '%(title)s.%(ext)s'),
     }
     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
         ydl.download([video_url])
         video_info = ydl.extract_info(video_url, download=False)
         video_title = video_info.get('title', 'video')
-    return os.path.join(output_path, f"{video_title}.webm")
+    return os.path.join('./', f"{video_title}.webm")
 
-def convert_to_mp4(input_path, output_path):
-    output_file = os.path.join(output_path, 'video.mp4')
+def convert_to_mp4(input_path):
+    output_file = os.path.join('./', 'video.mp4')
     command = ['ffmpeg', '-i', input_path, '-c', 'copy', output_file]
     subprocess.run(command, check=True)
     return output_file
 
 def extract_audio_from_video(video_path):
     video_clip = VideoFileClip(video_path)
-    audio_output = os.path.join(output_path, 'audio.mp3')
+    audio_output = os.path.join('./', 'audio.mp3')
     audio_clip = video_clip.audio
     audio_clip.write_audiofile(audio_output)
     return audio_output
@@ -43,7 +53,7 @@ def extract_audio_from_video(video_path):
 def convert_mp3_to_wav(mp3_path):
     from pydub import AudioSegment
     audio = AudioSegment.from_mp3(mp3_path)
-    wav_output = os.path.join(output_path, 'audio.wav')
+    wav_output = os.path.join('./', 'audio.wav')
     audio.export(wav_output, format="wav")
     return wav_output
 
@@ -94,35 +104,33 @@ emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
 emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
 emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)
 
-def analyze_video(video_url):
+def analyze_video(video_url, browser):
     global output_path
     output_path = './'
 
-    authenticate_youtube()
-
-    video_path = download_youtube_video(video_url, output_path)
+    video_path = download_youtube_video(video_url, browser)
 
-    mp4_path = convert_to_mp4(video_path, output_path)
+    mp4_path = convert_to_mp4(video_path)
 
     audio_path = extract_audio_from_video(mp4_path)
 
     audio_wav_path = convert_mp3_to_wav(audio_path)
 
     model_whisper = whisper.load_model("base")
 
     result_whisper = model_whisper.transcribe(audio_wav_path)
 
     transcript = result_whisper['text']
 
     emotion_dict_text, predicted_emotion_text = process_text(transcript)
 
     n_frame_interval = 60
     emotion_vectors_video = []
 
     video_capture = cv2.VideoCapture(mp4_path)
 
     total_frames_video = int(video_capture.get(cv2.CAP_PROP_FRAME_COUNT))
 
     frame_count_video = 0
 
     while video_capture.isOpened():
@@ -134,7 +142,7 @@ def analyze_video(video_url):
         if frame_count_video % n_frame_interval == 0:
             pixel_values_video = preprocess_frame(frame_video)
             caption_video = generate_caption(pixel_values_video)
-            predicted_emotions_video = predict_emotions(caption_video)
+            predicted_emotions_video, _ = predict_emotions(caption_video)
             emotion_vectors_video.append(np.array(list(predicted_emotions_video.values())))
 
         frame_count_video += 1
@@ -152,10 +160,11 @@ def analyze_video(video_url):
     return transcript, predicted_emotion_text, final_most_predicted_emotion
 
 iface = gr.Interface(fn=analyze_video,
-                     inputs=gr.Textbox(label="YouTube Video URL"),
-                     outputs=["text", "text", "text"],
-                     title="Multimodal Emotion Recognition",
-                     description="Enter a YouTube Video URL to analyze emotions from both audio and visual content.")
+                     inputs=[gr.Textbox(label="YouTube Video URL"),
+                             gr.Dropdown(label="Select Browser", choices=["Chrome", "Firefox", "Edge", "Brave"])],
+                     outputs=["text", "text", "text"],
+                     title="Multimodal Emotion Recognition",
+                     description="Enter a YouTube Video URL and select your browser to analyze emotions from both audio and visual content.")
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
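
One caveat with the new code path: browser_cookie3's loaders (chrome(), firefox(), and so on) return an http.cookiejar.CookieJar object, while yt-dlp's 'cookiefile' option expects the path of a Netscape-format cookies.txt file, so passing the jar straight through as 'cookiefile': cookies will most likely fail at download time. A minimal sketch of the missing conversion step, assuming the hypothetical helper name save_cookies_to_file and a writable working directory:

import http.cookiejar

def save_cookies_to_file(cookie_jar, path='./cookies.txt'):
    # Hypothetical helper, untested: MozillaCookieJar serializes to the
    # Netscape cookies.txt format that yt-dlp's 'cookiefile' option reads.
    mozilla_jar = http.cookiejar.MozillaCookieJar(path)
    for cookie in cookie_jar:
        mozilla_jar.set_cookie(cookie)
    # Keep session cookies as well; browsers often hold auth state in them.
    mozilla_jar.save(ignore_discard=True, ignore_expires=True)
    return path

download_youtube_video would then set 'cookiefile': save_cookies_to_file(cookies) instead of passing the jar itself.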
requirements.txt CHANGED
@@ -8,4 +8,5 @@ moviepy
 openai-whisper
 yt-dlp
 torch
-opencv-python
+opencv-python
+browser-cookie3
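
An alternative worth noting: yt-dlp can read browser cookies itself through the cookiesfrombrowser option (the Python-API counterpart of the --cookies-from-browser CLI flag), which would make both get_youtube_cookies and the browser-cookie3 dependency unnecessary. A sketch under the assumption that the installed yt-dlp is recent enough to support the option; download_with_browser_cookies is a hypothetical name, and the dropdown values need lowercasing to match yt-dlp's browser identifiers:

import yt_dlp

def download_with_browser_cookies(video_url, browser='Chrome'):
    ydl_opts = {
        # The tuple is (browser, profile, keyring, container); only the
        # browser name is required, and yt-dlp expects it in lowercase.
        'cookiesfrombrowser': (browser.lower(),),
        'format': 'bestvideo+bestaudio',
        'outtmpl': './%(title)s.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

Either way, reading cookies from a local browser profile only works where such a profile exists; a hosted Space's container has no user browser installed, so this approach is best suited to running the app locally.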