Kevin King committed
Commit ea6ec54 · 1 Parent(s): b18efa0

REFAC: Update model loading to use staged approach and enhance audio analysis in Streamlit app

Files changed (1)
  1. src/streamlit_app.py +273 -180
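
For context, the "staged approach" named in the commit message combines `@st.cache_resource` (heavy models are loaded once per server process and reused across reruns) with flags in `st.session_state` that gate each analysis stage behind its own button. The following is a minimal, self-contained sketch of that pattern, not the app's code; the dummy `load_heavy_model` loader, the button labels, and the `stage*_result` keys are illustrative only.

import time

import streamlit as st

@st.cache_resource  # created once per server process; later reruns reuse the object
def load_heavy_model():
    time.sleep(2)  # stand-in for an expensive model download/initialization
    return object()

# Flags stored in st.session_state survive reruns, so each stage runs at most once.
if "stage1_complete" not in st.session_state:
    st.session_state.stage1_complete = False
if "stage2_complete" not in st.session_state:
    st.session_state.stage2_complete = False

if not st.session_state.stage1_complete:
    if st.button("Step 1: run first analysis"):
        model = load_heavy_model()  # cached after the first call
        st.session_state.stage1_result = f"stage 1 output ({type(model).__name__})"
        st.session_state.stage1_complete = True
        st.rerun()  # rerun the script so the Step 2 button appears
elif not st.session_state.stage2_complete:
    if st.button("Step 2: run second analysis"):
        st.session_state.stage2_result = "stage 2 output"
        st.session_state.stage2_complete = True
        st.rerun()
else:
    st.write(st.session_state.stage1_result, st.session_state.stage2_result)

The diff below applies the same idea: `load_audio_models()` sits behind `@st.cache_resource`, and `stage1_complete` / `stage2_complete` flags in `st.session_state` gate the audio/text and facial analysis stages.
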
src/streamlit_app.py CHANGED
@@ -38,10 +38,10 @@ SER_TO_UNIFIED = {'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'
 FACIAL_TO_UNIFIED = {'neutral': 'neutral', 'happy': 'happy', 'sad': 'sad', 'angry': 'angry', 'fear':None, 'surprise':None, 'disgust':None}
 AUDIO_SAMPLE_RATE = 16000

-# --- Model Loading ---
+# --- Model Loading (Staged) ---
 @st.cache_resource
-def load_models():
-    with st.spinner("Loading AI models, this may take a moment..."):
+def load_audio_models():
+    with st.spinner("Loading audio analysis models..."):
         whisper_model = whisper.load_model("tiny.en", download_root=os.path.join(CACHE_DIR, "whisper"))
         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
         ser_model_name = "superb/hubert-large-superb-er"
@@ -49,7 +49,7 @@ def load_models():
         ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
     return whisper_model, text_classifier, ser_model, ser_feature_extractor

-whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
+# Models will be loaded on demand

 # --- Helper Functions for Analysis ---
 def create_unified_vector(scores_dict, mapping_dict):
@@ -72,203 +72,296 @@ def get_consistency_level(cosine_sim):
     if cosine_sim >= 0.3: return "Low"
     return "Very Low"

-# --- UI and Processing Logic ---
-uploaded_file = st.file_uploader("Choose a video file...", type=["mp4", "mov", "avi", "mkv"])
-
-if uploaded_file is not None:
-    temp_video_path = None
-    try:
-        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
-            tfile.write(uploaded_file.read())
-            temp_video_path = tfile.name
-
-        st.video(temp_video_path)
-
-        if st.button("Analyze Video"):
-            fer_timeline, ser_timeline, ter_timeline = {}, {}, {}
-            full_transcription = "No speech detected."
-
-            video_clip_for_duration = VideoFileClip(temp_video_path)
-            duration = video_clip_for_duration.duration
-
-            with st.spinner("Analyzing facial expressions..."):
-                cap = cv2.VideoCapture(temp_video_path)
-                fps = cap.get(cv2.CAP_PROP_FPS) or 30
-                frame_count = 0
-                while cap.isOpened():
-                    ret, frame = cap.read()
-                    if not ret: break
-                    timestamp = frame_count / fps
-                    if frame_count % int(fps) == 0:
-                        analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
-                        if isinstance(analysis, list) and len(analysis) > 0:
-                            fer_timeline[timestamp] = {k: v / 100.0 for k, v in analysis[0]['emotion'].items()}
-                    frame_count += 1
-                cap.release()
-
-            with st.spinner("Analyzing audio and text..."):
-                if video_clip_for_duration.audio:
-                    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
-                        video_clip_for_duration.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
-                        temp_audio_path = taudio.name
-
-                    whisper_result = whisper_model.transcribe(
-                        temp_audio_path,
-                        word_timestamps=True,
-                        fp16=False,
-                        condition_on_previous_text=False
-                    )
-                    full_transcription = whisper_result['text'].strip()
-
-                    audio_array, _ = sf.read(temp_audio_path, dtype='float32')
-                    if audio_array.ndim == 2: audio_array = audio_array.mean(axis=1)
-
-                    for i in range(int(duration)):
-                        start_sample, end_sample = i * AUDIO_SAMPLE_RATE, (i + 1) * AUDIO_SAMPLE_RATE
-                        chunk = audio_array[start_sample:end_sample]
-
-                        if len(chunk) > 400:
-                            inputs = ser_feature_extractor(chunk, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
-                            with torch.no_grad():
-                                logits = ser_model(**inputs).logits
-                            scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
-                            ser_timeline[i] = {ser_model.config.id2label[k]: score.item() for k, score in enumerate(scores)}
-
-                        words_in_segment = [seg['word'] for seg in whisper_result.get('segments', []) if seg['start'] >= i and seg['start'] < i+1 for seg in seg.get('words', [])]
-                        segment_text = " ".join(words_in_segment).strip()
-                        if segment_text:
-                            text_emotions = text_classifier(segment_text)[0]
-                            ter_timeline[i] = {emo['label']: emo['score'] for emo in text_emotions}
-
-            st.header("Analysis Results")
-
-            def process_timeline_to_df(timeline, mapping):
-                if not timeline: return pd.DataFrame(columns=UNIFIED_EMOTIONS)
-                df = pd.DataFrame.from_dict(timeline, orient='index')
-                df_unified = pd.DataFrame(index=df.index, columns=UNIFIED_EMOTIONS).fillna(0.0)
-                for raw_col in df.columns:
-                    unified_col = mapping.get(raw_col)
-                    if unified_col:
-                        df_unified[unified_col] += df[raw_col]
-                return df_unified
-
-            fer_df = process_timeline_to_df(fer_timeline, FACIAL_TO_UNIFIED)
-            ser_df = process_timeline_to_df(ser_timeline, SER_TO_UNIFIED)
-            ter_df = process_timeline_to_df(ter_timeline, TEXT_TO_UNIFIED)
-
-            def get_dominant_emotion_from_df(df):
-                if df.empty or df.sum().sum() == 0: return "N/A"
-                return df.sum().idxmax().capitalize()
-
-            dominant_fer = get_dominant_emotion_from_df(fer_df)
-            dominant_ser = get_dominant_emotion_from_df(ser_df)
-            dominant_text = get_dominant_emotion_from_df(ter_df)
-
-            def get_avg_unified_scores(df):
-                return df.mean().to_dict() if not df.empty else {}
-
-            fer_avg_scores = get_avg_unified_scores(fer_df)
-            ser_avg_scores = get_avg_unified_scores(ser_df)
-            ter_avg_scores = get_avg_unified_scores(ter_df)
-
-            # Use the raw dictionaries for vector creation, not the mapped ones
-            fer_vector = create_unified_vector(fer_avg_scores, {e:e for e in UNIFIED_EMOTIONS})
-            ser_vector = create_unified_vector(ser_avg_scores, {e:e for e in UNIFIED_EMOTIONS})
-            text_vector = create_unified_vector(ter_avg_scores, {e:e for e in UNIFIED_EMOTIONS})
-
-            similarities = [cosine_similarity([fer_vector], [text_vector])[0][0], cosine_similarity([fer_vector], [ser_vector])[0][0], cosine_similarity([ser_vector], [text_vector])[0][0]]
-            avg_similarity = np.nanmean([s for s in similarities if not np.isnan(s)])
-
-            # --- NEW LAYOUT ---
-            # Display the full-width transcription first
-            st.subheader("Transcription")
-            st.markdown(f"> *{full_transcription}*")
-            st.divider()
-
-            # Now create two columns for the summary and the plot
-            col1, col2 = st.columns([1, 2])
-            with col1:
-                st.subheader("Multimodal Summary")
-                st.metric("Dominant Facial Emotion", dominant_fer)
-                st.metric("Dominant Text Emotion", dominant_text)
-                st.metric("Dominant Speech Emotion", dominant_ser)
-                st.metric("Emotion Consistency", get_consistency_level(avg_similarity), f"{avg_similarity:.2f} Avg. Cosine Similarity")
-
-            with col2:
-                st.subheader("Unified Emotion Timeline")
-
-                full_index = np.arange(0, duration, 0.5)
-                combined_df = pd.DataFrame(index=full_index)
-
-                # --- NEW: ECI Timeline Calculation ---
-                eci_timeline = {}
-                for t_stamp in full_index:
-                    vectors = []
-
-                    # Interpolate to get a value for any timestamp
-                    fer_scores = fer_df.reindex(fer_df.index.union([t_stamp])).interpolate(method='linear').loc[t_stamp]
-                    if not fer_scores.isnull().all():
-                        vectors.append(create_unified_vector(fer_scores.to_dict(), {e:e for e in UNIFIED_EMOTIONS}))
-
-                    if int(t_stamp) in ser_df.index:
-                        vectors.append(create_unified_vector(ser_df.loc[int(t_stamp)].to_dict(), {e:e for e in UNIFIED_EMOTIONS}))
-
-                    if int(t_stamp) in ter_df.index:
-                        vectors.append(create_unified_vector(ter_df.loc[int(t_stamp)].to_dict(), {e:e for e in UNIFIED_EMOTIONS}))
-
-                    if len(vectors) >= 2:
-                        sims = [cosine_similarity([v1], [v2])[0][0] for i, v1 in enumerate(vectors) for v2 in vectors[i+1:]]
-                        eci_timeline[t_stamp] = np.mean(sims)
-
-                if not fer_df.empty:
-                    fer_df_resampled = fer_df.reindex(fer_df.index.union(full_index)).interpolate(method='linear').reindex(full_index)
-                    for e in UNIFIED_EMOTIONS: combined_df[f'Facial_{e}'] = fer_df_resampled.get(e, 0.0)
-
-                if not ser_df.empty:
-                    ser_df_resampled = ser_df.reindex(ser_df.index.union(full_index)).interpolate(method='linear').reindex(full_index)
-                    for e in UNIFIED_EMOTIONS: combined_df[f'Speech_{e}'] = ser_df_resampled.get(e, 0.0)
-
-                if not ter_df.empty:
-                    ter_df_resampled = ter_df.reindex(ter_df.index.union(full_index)).interpolate(method='linear').reindex(full_index)
-                    for e in UNIFIED_EMOTIONS: combined_df[f'Text_{e}'] = ter_df_resampled.get(e, 0.0)
-
-                if eci_timeline:
-                    eci_series = pd.Series(eci_timeline).reindex(full_index).interpolate(method='linear')
-                    combined_df['ECI'] = eci_series
-
-                combined_df.fillna(0, inplace=True)
-
-                if not combined_df.empty:
-                    fig, ax = plt.subplots(figsize=(10, 5))
-                    colors = {'happy': 'green', 'sad': 'blue', 'angry': 'red', 'neutral': 'gray'}
-                    styles = {'Facial': '-', 'Speech': '--', 'Text': ':'}
-
-                    for col in combined_df.columns:
-                        if col == 'ECI': continue
-                        modality, emotion = col.split('_')
-                        if emotion in colors:
-                            ax.plot(combined_df.index, combined_df[col], label=f'{modality} {emotion.capitalize()}', color=colors[emotion], linestyle=styles[modality], alpha=0.7)
-
-                    if 'ECI' in combined_df.columns:
-                        ax.plot(combined_df.index, combined_df['ECI'], label='Emotion Consistency', color='black', linewidth=2.5, alpha=0.9)
-
-                    ax.set_title("Emotion Confidence Over Time (Normalized)")
-                    ax.set_xlabel("Time (seconds)")
-                    ax.set_ylabel("Confidence Score (0-1)")
-                    ax.set_ylim(0, 1)
-                    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
-                    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
-                    plt.tight_layout()
-                    st.pyplot(fig)
-                else:
-                    st.write("No emotion data available to plot.")
-
-    finally:
-        if 'video_clip_for_duration' in locals() and video_clip_for_duration: video_clip_for_duration.close()
-        if 'temp_audio_path' in locals() and temp_audio_path and os.path.exists(temp_audio_path): os.unlink(temp_audio_path)
-        if temp_video_path and os.path.exists(temp_video_path):
-            time.sleep(1)
-            try:
-                os.unlink(temp_video_path)
-            except Exception:
-                pass
+# --- Helper Functions for Results Display ---
+def process_timeline_to_df(timeline, mapping):
+    if not timeline: return pd.DataFrame(columns=UNIFIED_EMOTIONS)
+    df = pd.DataFrame.from_dict(timeline, orient='index')
+    df_unified = pd.DataFrame(index=df.index, columns=UNIFIED_EMOTIONS).fillna(0.0)
+    for raw_col in df.columns:
+        unified_col = mapping.get(raw_col)
+        if unified_col:
+            df_unified[unified_col] += df[raw_col]
+    return df_unified
+
+def get_dominant_emotion_from_df(df):
+    if df.empty or df.sum().sum() == 0: return "N/A"
+    return df.sum().idxmax().capitalize()
+
+def get_avg_unified_scores(df):
+    return df.mean().to_dict() if not df.empty else {}
+
+def display_results():
+    """Display the final analysis results using data from session state"""
+    st.header("Analysis Results")
+
+    # Get data from session state
+    full_transcription = st.session_state.get('full_transcription', 'No speech detected.')
+    ser_timeline = st.session_state.get('ser_timeline', {})
+    ter_timeline = st.session_state.get('ter_timeline', {})
+    fer_timeline = st.session_state.get('fer_timeline', {})
+    duration = st.session_state.get('duration', 0)
+
+    # Process timelines
+    fer_df = process_timeline_to_df(fer_timeline, FACIAL_TO_UNIFIED)
+    ser_df = process_timeline_to_df(ser_timeline, SER_TO_UNIFIED)
+    ter_df = process_timeline_to_df(ter_timeline, TEXT_TO_UNIFIED)
+
+    # Get dominant emotions
+    dominant_fer = get_dominant_emotion_from_df(fer_df)
+    dominant_ser = get_dominant_emotion_from_df(ser_df)
+    dominant_text = get_dominant_emotion_from_df(ter_df)
+
+    # Get average scores
+    fer_avg_scores = get_avg_unified_scores(fer_df)
+    ser_avg_scores = get_avg_unified_scores(ser_df)
+    ter_avg_scores = get_avg_unified_scores(ter_df)
+
+    # Calculate vectors and similarity
+    fer_vector = create_unified_vector(fer_avg_scores, {e:e for e in UNIFIED_EMOTIONS})
+    ser_vector = create_unified_vector(ser_avg_scores, {e:e for e in UNIFIED_EMOTIONS})
+    text_vector = create_unified_vector(ter_avg_scores, {e:e for e in UNIFIED_EMOTIONS})
+
+    similarities = [cosine_similarity([fer_vector], [text_vector])[0][0], cosine_similarity([fer_vector], [ser_vector])[0][0], cosine_similarity([ser_vector], [text_vector])[0][0]]
+    avg_similarity = np.nanmean([s for s in similarities if not np.isnan(s)])
+
+    # Display transcription
+    st.subheader("Transcription")
+    st.markdown(f"> *{full_transcription}*")
+    st.divider()
+
+    # Display summary and timeline
+    col1, col2 = st.columns([1, 2])
+    with col1:
+        st.subheader("Multimodal Summary")
+        st.metric("Dominant Facial Emotion", dominant_fer)
+        st.metric("Dominant Text Emotion", dominant_text)
+        st.metric("Dominant Speech Emotion", dominant_ser)
+        st.metric("Emotion Consistency", get_consistency_level(avg_similarity), f"{avg_similarity:.2f} Avg. Cosine Similarity")
+
+    with col2:
+        st.subheader("Unified Emotion Timeline")
+
+        if duration > 0:
+            full_index = np.arange(0, duration, 0.5)
+            combined_df = pd.DataFrame(index=full_index)
+
+            # ECI Timeline Calculation
+            eci_timeline = {}
+            for t_stamp in full_index:
+                vectors = []
+
+                # Interpolate to get a value for any timestamp
+                fer_scores = fer_df.reindex(fer_df.index.union([t_stamp])).interpolate(method='linear').loc[t_stamp]
+                if not fer_scores.isnull().all():
+                    vectors.append(create_unified_vector(fer_scores.to_dict(), {e:e for e in UNIFIED_EMOTIONS}))
+
+                if int(t_stamp) in ser_df.index:
+                    vectors.append(create_unified_vector(ser_df.loc[int(t_stamp)].to_dict(), {e:e for e in UNIFIED_EMOTIONS}))
+
+                if int(t_stamp) in ter_df.index:
+                    vectors.append(create_unified_vector(ter_df.loc[int(t_stamp)].to_dict(), {e:e for e in UNIFIED_EMOTIONS}))
+
+                if len(vectors) >= 2:
+                    sims = [cosine_similarity([v1], [v2])[0][0] for i, v1 in enumerate(vectors) for v2 in vectors[i+1:]]
+                    eci_timeline[t_stamp] = np.mean(sims)
+
+            if not fer_df.empty:
+                fer_df_resampled = fer_df.reindex(fer_df.index.union(full_index)).interpolate(method='linear').reindex(full_index)
+                for e in UNIFIED_EMOTIONS: combined_df[f'Facial_{e}'] = fer_df_resampled.get(e, 0.0)
+
+            if not ser_df.empty:
+                ser_df_resampled = ser_df.reindex(ser_df.index.union(full_index)).interpolate(method='linear').reindex(full_index)
+                for e in UNIFIED_EMOTIONS: combined_df[f'Speech_{e}'] = ser_df_resampled.get(e, 0.0)
+
+            if not ter_df.empty:
+                ter_df_resampled = ter_df.reindex(ter_df.index.union(full_index)).interpolate(method='linear').reindex(full_index)
+                for e in UNIFIED_EMOTIONS: combined_df[f'Text_{e}'] = ter_df_resampled.get(e, 0.0)
+
+            if eci_timeline:
+                eci_series = pd.Series(eci_timeline).reindex(full_index).interpolate(method='linear')
+                combined_df['ECI'] = eci_series
+
+            combined_df.fillna(0, inplace=True)
+
+            if not combined_df.empty:
+                fig, ax = plt.subplots(figsize=(10, 5))
+                colors = {'happy': 'green', 'sad': 'blue', 'angry': 'red', 'neutral': 'gray'}
+                styles = {'Facial': '-', 'Speech': '--', 'Text': ':'}
+
+                for col in combined_df.columns:
+                    if col == 'ECI': continue
+                    modality, emotion = col.split('_')
+                    if emotion in colors:
+                        ax.plot(combined_df.index, combined_df[col], label=f'{modality} {emotion.capitalize()}', color=colors[emotion], linestyle=styles[modality], alpha=0.7)
+
+                if 'ECI' in combined_df.columns:
+                    ax.plot(combined_df.index, combined_df['ECI'], label='Emotion Consistency', color='black', linewidth=2.5, alpha=0.9)
+
+                ax.set_title("Emotion Confidence Over Time (Normalized)")
+                ax.set_xlabel("Time (seconds)")
+                ax.set_ylabel("Confidence Score (0-1)")
+                ax.set_ylim(0, 1)
+                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+                ax.grid(True, which='both', linestyle='--', linewidth=0.5)
+                plt.tight_layout()
+                st.pyplot(fig)
+            else:
+                st.write("No emotion data available to plot.")
+        else:
+            st.write("No timeline data available.")
+
+# --- Two-Stage UI and Processing Logic ---
+uploaded_file = st.file_uploader("Choose a video file...", type=["mp4", "mov", "avi", "mkv"])
+
+# Initialize session state variables
+if 'temp_video_path' not in st.session_state:
+    st.session_state.temp_video_path = None
+if 'uploaded_file_id' not in st.session_state:
+    st.session_state.uploaded_file_id = None
+
+# Clear previous results when a new file is uploaded
+if uploaded_file is not None:
+    file_id = uploaded_file.file_id if hasattr(uploaded_file, 'file_id') else str(hash(uploaded_file.name + str(uploaded_file.size)))
+
+    if st.session_state.uploaded_file_id != file_id:
+        # New file uploaded, clear previous results
+        st.session_state.uploaded_file_id = file_id
+        for key in ['stage1_complete', 'stage2_complete', 'full_transcription', 'ser_timeline', 'ter_timeline', 'fer_timeline', 'duration']:
+            if key in st.session_state:
+                del st.session_state[key]
+
+        # Save the video file
+        if st.session_state.temp_video_path and os.path.exists(st.session_state.temp_video_path):
+            try:
+                os.unlink(st.session_state.temp_video_path)
+            except Exception:
+                pass
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
+            tfile.write(uploaded_file.read())
+            st.session_state.temp_video_path = tfile.name
+
+if uploaded_file is not None and st.session_state.temp_video_path:
+    st.video(st.session_state.temp_video_path)
+
+    # Stage 1: Audio & Text Analysis
+    if not st.session_state.get('stage1_complete', False):
+        if st.button("🎡 Step 1: Analyze Audio & Text", type="primary"):
+            try:
+                # Load audio models
+                whisper_model, text_classifier, ser_model, ser_feature_extractor = load_audio_models()
+
+                ser_timeline, ter_timeline = {}, {}
+                full_transcription = "No speech detected."
+
+                video_clip = VideoFileClip(st.session_state.temp_video_path)
+                duration = video_clip.duration
+                st.session_state.duration = duration
+
+                with st.spinner("Analyzing audio and text..."):
+                    if video_clip.audio:
+                        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
+                            video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
+                            temp_audio_path = taudio.name
+
+                        # Transcription
+                        whisper_result = whisper_model.transcribe(
+                            temp_audio_path,
+                            word_timestamps=True,
+                            fp16=False,
+                            condition_on_previous_text=False
+                        )
+                        full_transcription = whisper_result['text'].strip()
+
+                        # Speech emotion recognition
+                        audio_array, _ = sf.read(temp_audio_path, dtype='float32')
+                        if audio_array.ndim == 2:
+                            audio_array = audio_array.mean(axis=1)
+
+                        for i in range(int(duration)):
+                            start_sample, end_sample = i * AUDIO_SAMPLE_RATE, (i + 1) * AUDIO_SAMPLE_RATE
+                            chunk = audio_array[start_sample:end_sample]
+
+                            if len(chunk) > 400:
+                                inputs = ser_feature_extractor(chunk, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
+                                with torch.no_grad():
+                                    logits = ser_model(**inputs).logits
+                                scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
+                                ser_timeline[i] = {ser_model.config.id2label[k]: score.item() for k, score in enumerate(scores)}
+
+                            # Text emotion recognition
+                            words_in_segment = [seg['word'] for seg in whisper_result.get('segments', []) if seg['start'] >= i and seg['start'] < i+1 for seg in seg.get('words', [])]
+                            segment_text = " ".join(words_in_segment).strip()
+                            if segment_text:
+                                text_emotions = text_classifier(segment_text)[0]
+                                ter_timeline[i] = {emo['label']: emo['score'] for emo in text_emotions}
+
+                        # Clean up audio file
+                        if os.path.exists(temp_audio_path):
+                            os.unlink(temp_audio_path)
+
+                video_clip.close()
+
+                # Store results in session state
+                st.session_state.full_transcription = full_transcription
+                st.session_state.ser_timeline = ser_timeline
+                st.session_state.ter_timeline = ter_timeline
+                st.session_state.stage1_complete = True
+
+                st.success("✅ Audio analysis complete! Speech and text emotions have been analyzed.")
+                st.rerun()
+
+            except Exception as e:
+                st.error(f"Error during audio analysis: {str(e)}")
+
+    else:
+        st.success("✅ Stage 1 (Audio & Text Analysis) - Complete!")
+
+    # Stage 2: Facial Analysis
+    if st.session_state.get('stage1_complete', False) and not st.session_state.get('stage2_complete', False):
+        if st.button("😊 Step 2: Analyze Facial Expressions", type="primary"):
+            try:
+                fer_timeline = {}
+
+                with st.spinner("Analyzing facial expressions..."):
+                    cap = cv2.VideoCapture(st.session_state.temp_video_path)
+                    fps = cap.get(cv2.CAP_PROP_FPS) or 30
+                    frame_count = 0
+
+                    while cap.isOpened():
+                        ret, frame = cap.read()
+                        if not ret:
+                            break
+                        timestamp = frame_count / fps
+                        if frame_count % int(fps) == 0:
+                            analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
+                            if isinstance(analysis, list) and len(analysis) > 0:
+                                fer_timeline[timestamp] = {k: v / 100.0 for k, v in analysis[0]['emotion'].items()}
+                        frame_count += 1
+                    cap.release()
+
+                # Store results in session state
+                st.session_state.fer_timeline = fer_timeline
+                st.session_state.stage2_complete = True
+
+                st.success("✅ Facial analysis complete! All analyses are now finished.")
+                st.rerun()
+
+            except Exception as e:
+                st.error(f"Error during facial analysis: {str(e)}")
+
+    elif st.session_state.get('stage2_complete', False):
+        st.success("✅ Stage 2 (Facial Expression Analysis) - Complete!")
+
+    # Display results if both stages are complete
+    if st.session_state.get('stage1_complete', False) and st.session_state.get('stage2_complete', False):
+        display_results()
+
+# Cleanup on app restart or when session ends
+if st.session_state.temp_video_path and not uploaded_file:
+    try:
+        if os.path.exists(st.session_state.temp_video_path):
+            os.unlink(st.session_state.temp_video_path)
+        st.session_state.temp_video_path = None
+    except Exception:
+        pass
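
For reference, the consistency figure surfaced in the UI (avg_similarity, and the per-timestamp ECI values) is an average of pairwise cosine similarities between per-modality emotion vectors. Below is a standalone sketch of that computation under stated assumptions: the four-label UNIFIED_EMOTIONS ordering and the example score dictionaries are made up here, since the real constant is defined outside this hunk.

from itertools import combinations

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

UNIFIED_EMOTIONS = ["neutral", "happy", "sad", "angry"]  # assumed label order

def to_vector(scores):
    # Fix the emotion order so vectors from different modalities are comparable.
    return np.array([scores.get(e, 0.0) for e in UNIFIED_EMOTIONS])

# Hypothetical average scores for the facial, speech, and text modalities.
facial = {"happy": 0.70, "neutral": 0.20, "sad": 0.05, "angry": 0.05}
speech = {"happy": 0.50, "neutral": 0.40, "sad": 0.05, "angry": 0.05}
text = {"happy": 0.60, "neutral": 0.30, "sad": 0.05, "angry": 0.05}

vectors = [to_vector(s) for s in (facial, speech, text)]
# Average of the three pairwise similarities, mirroring avg_similarity in the diff.
sims = [cosine_similarity([a], [b])[0][0] for a, b in combinations(vectors, 2)]
print(f"Average cosine similarity: {np.nanmean(sims):.2f}")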