Amr-h committed on
Commit
757bef4
·
1 Parent(s): 196fb95

add more URL support

Browse files
Files changed (3) hide show
  1. app.py +278 -408
  2. audio_extractor.py +426 -35
  3. requirements.txt +2 -3
app.py CHANGED
@@ -4,516 +4,386 @@ import plotly.express as px
4
  import plotly.graph_objects as go
5
  from plotly.subplots import make_subplots
6
  import time
7
- import re
8
- from datetime import datetime
9
- import numpy as np
10
- from dialect_predector import analyze_video_accent
11
 
12
- # Import your accent analysis function
13
- # from your_accent_module import analyze_video_accent
 
 
 
 
 
14
 
15
  # Page configuration
16
  st.set_page_config(
17
- page_title="🎤 AI Accent Analyzer",
18
  page_icon="🎤",
19
  layout="wide",
20
  initial_sidebar_state="expanded"
21
  )
22
 
23
- # Custom CSS for beautiful styling
24
  st.markdown("""
25
  <style>
26
  .main-header {
27
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
28
- padding: 2rem;
29
- border-radius: 10px;
30
- color: white;
31
  text-align: center;
 
 
 
32
  margin-bottom: 2rem;
33
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
34
  }
35
-
36
- .metric-card {
37
- background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
38
- padding: 1.5rem;
39
- border-radius: 10px;
40
- box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
41
  margin: 0.5rem 0;
42
- border-left: 4px solid #667eea;
43
  }
44
-
45
- .analysis-section {
46
- background: white;
47
- padding: 1.5rem;
48
- border-radius: 10px;
49
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
50
  margin: 1rem 0;
51
- border: 1px solid #e0e6ed;
52
- }
53
-
54
- .accent-tag {
55
- display: inline-block;
56
- padding: 0.3rem 0.8rem;
57
- margin: 0.2rem;
58
- border-radius: 20px;
59
- font-weight: bold;
60
- font-size: 0.9rem;
61
- }
62
-
63
- .accent-primary {
64
- background: linear-gradient(45deg, #667eea, #764ba2);
65
- color: white;
66
- }
67
-
68
- .accent-secondary {
69
- background: linear-gradient(45deg, #ffecd2, #fcb69f);
70
- color: #333;
71
- }
72
-
73
- .processing-animation {
74
- display: flex;
75
- justify-content: center;
76
- align-items: center;
77
- padding: 2rem;
78
  }
79
-
80
- .confidence-bar {
81
- background: linear-gradient(90deg, #ff6b6b, #feca57, #48cae4, #06ffa5);
82
- height: 20px;
83
- border-radius: 10px;
84
- margin: 0.5rem 0;
85
- }
86
-
87
- .chunk-result {
88
- background: #f8f9fa;
89
- border-left: 4px solid #28a745;
90
- padding: 0.8rem;
91
- margin: 0.3rem 0;
92
- border-radius: 5px;
93
- }
94
-
95
- .chunk-result.low-confidence {
96
- border-left-color: #ffc107;
97
  }
98
-
99
- .sidebar-info {
100
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
101
- color: white;
102
  padding: 1rem;
103
- border-radius: 10px;
104
- margin-bottom: 1rem;
105
  }
106
  </style>
107
  """, unsafe_allow_html=True)
108
 
109
- def validate_url(url):
110
- """Validate if the URL is a valid YouTube URL"""
111
- youtube_patterns = [
112
- r'(https?://)?(www\.)?(youtube|youtu|youtube-nocookie)\.(com|be)/',
113
- r'(https?://)?(www\.)?youtube\.com/shorts/',
114
- r'(https?://)?(www\.)?youtu\.be/'
115
- ]
116
-
117
- for pattern in youtube_patterns:
118
- if re.match(pattern, url):
119
- return True
120
- return False
121
 
122
- def create_confidence_gauge(confidence):
123
- """Create a beautiful confidence gauge"""
124
- fig = go.Figure(go.Indicator(
125
- mode = "gauge+number+delta",
126
- value = confidence * 100,
127
- domain = {'x': [0, 1], 'y': [0, 1]},
128
- title = {'text': "Confidence Score"},
129
- delta = {'reference': 70},
130
- gauge = {
131
- 'axis': {'range': [None, 100]},
132
- 'bar': {'color': "darkblue"},
133
- 'steps': [
134
- {'range': [0, 50], 'color': "lightgray"},
135
- {'range': [50, 80], 'color': "yellow"},
136
- {'range': [80, 100], 'color': "green"}
137
- ],
138
- 'threshold': {
139
- 'line': {'color': "red", 'width': 4},
140
- 'thickness': 0.75,
141
- 'value': 90
142
- }
143
- }
144
- ))
145
-
146
- fig.update_layout(height=300, margin=dict(l=20, r=20, t=40, b=20))
147
- return fig
148
 
149
- def create_accent_distribution_chart(accent_counts, title="Accent Distribution"):
150
- """Create a beautiful pie chart for accent distribution"""
151
- if not accent_counts:
152
  return None
153
-
154
- accents = list(accent_counts.keys())
155
- counts = list(accent_counts.values())
156
-
157
- fig = px.pie(
158
- values=counts,
159
- names=accents,
160
- title=title,
161
- color_discrete_sequence=px.colors.qualitative.Set3
162
- )
163
 
164
- fig.update_traces(
165
- textposition='inside',
166
- textinfo='percent+label',
167
- hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percent}<extra></extra>'
168
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  fig.update_layout(
171
- height=400,
172
- margin=dict(l=20, r=20, t=40, b=20),
173
- font=dict(size=12)
 
174
  )
175
 
176
  return fig
177
 
178
- def create_chunk_confidence_chart(chunk_results):
179
- """Create a chart showing confidence over chunks"""
180
- if not chunk_results:
181
  return None
182
-
183
- df = pd.DataFrame(chunk_results)
184
-
185
- fig = px.line(
186
- df,
187
- x='chunk',
188
- y='confidence',
189
- title='Confidence Score Across Audio Chunks',
190
- markers=True,
191
- color='accent',
192
- hover_data=['accent', 'is_confident']
193
- )
194
 
195
- fig.add_hline(y=0.6, line_dash="dash", line_color="red",
196
- annotation_text="Confidence Threshold (60%)")
197
 
198
- fig.update_layout(
199
- height=400,
200
- xaxis_title="Chunk Number",
201
- yaxis_title="Confidence Score",
202
- margin=dict(l=20, r=20, t=40, b=20)
203
- )
 
204
 
205
  return fig
206
 
207
- def create_detailed_analysis(result):
208
- """Create detailed analysis section"""
209
- if not result or not result.get("success"):
 
 
210
  return
211
 
212
- st.markdown('<div class="analysis-section">', unsafe_allow_html=True)
213
- st.markdown("## 📊 Detailed Analysis")
 
214
 
215
  # Key metrics
216
  col1, col2, col3, col4 = st.columns(4)
217
 
218
  with col1:
219
- st.markdown('<div class="metric-card">', unsafe_allow_html=True)
220
  st.metric(
221
- "🎯 Final Accent",
222
- result['predicted_accent'],
223
- f"{result['confidence_percentage']}"
224
  )
225
- st.markdown('</div>', unsafe_allow_html=True)
226
 
227
  with col2:
228
- st.markdown('<div class="metric-card">', unsafe_allow_html=True)
229
  st.metric(
230
- "📦 Chunks Processed",
231
- f"{result['processed_chunks_count']}/{result['available_chunks_count']}",
232
- f"Confident: {result.get('confident_chunks_count', 0)}"
233
  )
234
- st.markdown('</div>', unsafe_allow_html=True)
235
 
236
  with col3:
237
- st.markdown('<div class="metric-card">', unsafe_allow_html=True)
238
  st.metric(
239
- "⏱️ Processing Time",
240
- f"{result['processing_time']:.1f}s",
241
- f"Audio: {result.get('duration_minutes', 0):.1f}min" if result.get('duration_minutes') else ""
242
  )
243
- st.markdown('</div>', unsafe_allow_html=True)
244
 
245
  with col4:
246
- st.markdown('<div class="metric-card">', unsafe_allow_html=True)
247
- early_stopped_text = "Yes ⚡" if result.get('early_stopped') else "No 🔄"
248
  st.metric(
249
- "🛑 Early Stopped",
250
- early_stopped_text,
251
- f"Threshold: {result.get('confidence_threshold', 0.6)*100:.0f}%"
252
  )
253
- st.markdown('</div>', unsafe_allow_html=True)
254
 
255
- st.markdown('</div>', unsafe_allow_html=True)
 
256
 
257
- # Charts section
258
- col1, col2 = st.columns(2)
259
 
260
- with col1:
261
- # Confidence gauge
262
- gauge_fig = create_confidence_gauge(result['confidence_score'])
263
- st.plotly_chart(gauge_fig, use_container_width=True)
264
-
265
- # Accent distribution (confident predictions)
266
- if result.get('confident_accent_counts'):
267
- pie_fig = create_accent_distribution_chart(
268
- result['confident_accent_counts'],
269
- "Confident Predictions Distribution"
270
- )
271
- if pie_fig:
272
- st.plotly_chart(pie_fig, use_container_width=True)
273
 
274
- with col2:
275
- # Chunk confidence over time
276
- if result.get('chunk_results'):
277
- confidence_fig = create_chunk_confidence_chart(result['chunk_results'])
278
- if confidence_fig:
279
- st.plotly_chart(confidence_fig, use_container_width=True)
280
-
281
- # All predictions distribution
282
- if result.get('all_accent_counts') and len(result['all_accent_counts']) > 1:
283
- all_pie_fig = create_accent_distribution_chart(
284
- result['all_accent_counts'],
285
- "All Predictions Distribution"
286
- )
287
- if all_pie_fig:
288
- st.plotly_chart(all_pie_fig, use_container_width=True)
289
-
290
- def display_chunk_details(chunk_results, confidence_threshold=0.6):
291
- """Display detailed chunk-by-chunk results"""
292
- if not chunk_results:
293
- return
294
 
295
- st.markdown("### 🔍 Chunk-by-Chunk Analysis")
 
 
 
296
 
297
  # Summary statistics
298
- confident_chunks = [r for r in chunk_results if r.get('is_confident', r['confidence'] > confidence_threshold)]
299
-
300
- col1, col2, col3 = st.columns(3)
301
- with col1:
302
- st.info(f"**Total Chunks:** {len(chunk_results)}")
303
- with col2:
304
- st.success(f"**Confident Chunks:** {len(confident_chunks)}")
305
- with col3:
306
- confidence_rate = len(confident_chunks) / len(chunk_results) * 100 if chunk_results else 0
307
- st.warning(f"**Confidence Rate:** {confidence_rate:.1f}%")
308
-
309
- # Detailed results
310
- with st.expander("📋 View Detailed Chunk Results", expanded=False):
311
- for i, result in enumerate(chunk_results):
312
- confidence = result['confidence']
313
- is_confident = result.get('is_confident', confidence > confidence_threshold)
314
-
315
- confidence_emoji = "✅" if is_confident else "⚠️"
316
- confidence_class = "" if is_confident else "low-confidence"
317
-
318
- st.markdown(f"""
319
- <div class="chunk-result {confidence_class}">
320
- <strong>Chunk {result['chunk']}</strong> {confidence_emoji}<br>
321
- <strong>Accent:</strong> {result['accent']}<br>
322
- <strong>Confidence:</strong> {confidence:.3f} ({confidence*100:.1f}%)<br>
323
- <strong>Status:</strong> {'Confident' if is_confident else 'Low Confidence'}
324
- </div>
325
- """, unsafe_allow_html=True)
326
 
327
  def main():
 
 
 
328
  # Header
329
- st.markdown("""
330
- <div class="main-header">
331
- <h1>🎤 AI Accent Analyzer</h1>
332
- <p>Analyze accents from YouTube videos using advanced AI models</p>
333
- </div>
334
- """, unsafe_allow_html=True)
335
-
336
- # Sidebar
337
- with st.sidebar:
338
- st.markdown("""
339
- <div class="sidebar-info">
340
- <h3>🔧 Configuration</h3>
341
- <p>Adjust analysis parameters</p>
342
- </div>
343
- """, unsafe_allow_html=True)
344
-
345
- confidence_threshold = st.slider(
346
- "🎯 Confidence Threshold",
347
- min_value=0.1,
348
- max_value=0.9,
349
- value=0.6,
350
- step=0.05,
351
- help="Only predictions above this confidence level are considered reliable"
352
- )
353
-
354
- early_stopping_threshold = st.slider(
355
- "⚡ Early Stopping Threshold",
356
- min_value=2,
357
- max_value=10,
358
- value=3,
359
- help="Stop processing after this many consecutive confident predictions"
360
- )
361
-
362
- st.markdown("---")
363
-
364
- st.markdown("""
365
- ### 📋 Supported Formats
366
- - YouTube videos
367
- - YouTube Shorts
368
- - YouTube Music
369
- - Youtu.be links
370
-
371
- ### ⚙️ How it works
372
- 1. **Audio Extraction**: Extracts audio from video
373
- 2. **Chunking**: Splits audio into manageable segments
374
- 3. **AI Analysis**: Uses SpeechBrain model for accent detection
375
- 4. **Confidence Filtering**: Only considers high-confidence predictions
376
- 5. **Results**: Provides detailed analysis and visualization
377
- """)
378
 
379
- # Main interface
380
- st.markdown("## 🔗 Enter Video URL")
 
 
 
381
 
382
- # URL input with examples
383
- col1, col2 = st.columns([3, 1])
384
 
385
- with col1:
386
- video_url = st.text_input(
387
- "YouTube Video URL",
388
- placeholder="https://www.youtube.com/watch?v=example or https://youtu.be/example",
389
- help="Paste any YouTube video URL here"
390
- )
391
 
392
- with col2:
393
- st.markdown("**Quick Examples:**")
394
- example_urls = [
395
- "https://www.youtube.com/shorts/mxMzNp3RfpA",
396
- "https://youtu.be/dQw4w9WgXcQ",
397
- "https://www.youtube.com/watch?v=example"
398
- ]
 
399
 
400
- for i, url in enumerate(example_urls):
401
- if st.button(f"Example {i+1}", key=f"example_{i}"):
402
- st.session_state.example_url = url
403
- st.rerun()
404
-
405
- # Use example URL if selected
406
- if hasattr(st.session_state, 'example_url'):
407
- video_url = st.session_state.example_url
408
- delattr(st.session_state, 'example_url')
409
-
410
- # URL validation
411
- if video_url:
412
- if validate_url(video_url):
413
- st.success("✅ Valid YouTube URL detected!")
414
- else:
415
- st.error("❌ Please enter a valid YouTube URL")
416
- st.stop()
 
 
 
 
 
 
 
417
 
418
  # Analysis button
419
- if st.button("🚀 Analyze Accent", type="primary", disabled=not video_url):
420
- if not video_url:
421
- st.warning("Please enter a video URL first!")
422
- return
 
 
 
 
 
 
423
 
424
  # Progress tracking
425
  progress_bar = st.progress(0)
426
  status_text = st.empty()
427
 
428
  try:
429
- # Simulate the analysis process with progress updates
430
- status_text.text("🔄 Initializing analysis...")
431
- progress_bar.progress(10)
432
- time.sleep(1)
433
-
434
- status_text.text("🎵 Extracting audio from video...")
435
- progress_bar.progress(30)
436
- time.sleep(1)
437
 
438
  status_text.text("🧠 Loading AI model...")
439
- progress_bar.progress(50)
440
- time.sleep(1)
441
-
442
- status_text.text("🔍 Analyzing accent patterns...")
443
- progress_bar.progress(80)
444
 
445
- # Here you would call your actual analysis function
446
- # result = analyze_video_accent(video_url, confidence_threshold)
447
 
448
- # For demo purposes, creating mock result
449
- result = analyze_video_accent(video_url, confidence_threshold)
450
 
451
  progress_bar.progress(100)
452
  status_text.text("✅ Analysis complete!")
453
- time.sleep(0.5)
454
 
455
- # Clear progress indicators
 
 
 
 
456
  progress_bar.empty()
457
  status_text.empty()
458
 
459
- # Display results
460
- if result["success"]:
461
- st.success("🎉 Analysis completed successfully!")
462
-
463
- # Main result highlight
464
- st.markdown(f"""
465
- <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
466
- color: white; padding: 2rem; border-radius: 15px; text-align: center; margin: 2rem 0;">
467
- <h2>🎤 Detected Accent: {result['predicted_accent']}</h2>
468
- <h3>📊 Confidence: {result['confidence_percentage']}</h3>
469
- </div>
470
- """, unsafe_allow_html=True)
471
-
472
- # Detailed analysis
473
- create_detailed_analysis(result)
474
-
475
- # Chunk details
476
- if result.get('chunk_results'):
477
- display_chunk_details(result['chunk_results'], confidence_threshold)
478
-
479
- # Raw data download
480
- with st.expander("📥 Download Results", expanded=False):
481
- # Convert results to DataFrame for download
482
- if result.get('chunk_results'):
483
- df = pd.DataFrame(result['chunk_results'])
484
- csv = df.to_csv(index=False)
485
- st.download_button(
486
- label="📊 Download Chunk Results (CSV)",
487
- data=csv,
488
- file_name=f"accent_analysis_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
489
- mime="text/csv"
490
- )
491
-
492
- # JSON download
493
- import json
494
- json_str = json.dumps(result, indent=2, default=str)
495
- st.download_button(
496
- label="📋 Download Full Results (JSON)",
497
- data=json_str,
498
- file_name=f"accent_analysis_full_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
499
- mime="application/json"
500
- )
501
- else:
502
- st.error(f"❌ Analysis failed: {result.get('error', 'Unknown error')}")
503
-
504
  except Exception as e:
 
505
  progress_bar.empty()
506
  status_text.empty()
507
- st.error(f"❌ An error occurred during analysis: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
508
 
509
  # Footer
510
  st.markdown("---")
511
- st.markdown("""
512
- <div style="text-align: center; color: #666; margin-top: 2rem;">
513
- <p>🎤 AI Accent Analyzer | Built with Streamlit & SpeechBrain</p>
514
- <p>Analyze accents from YouTube videos with confidence-based filtering</p>
515
- </div>
516
- """, unsafe_allow_html=True)
517
 
518
  if __name__ == "__main__":
519
  main()
 
4
  import plotly.graph_objects as go
5
  from plotly.subplots import make_subplots
6
  import time
7
+ import os
8
+ from pathlib import Path
9
+ import tempfile
10
+ import shutil
11
 
12
+ # Import your existing modules
13
+ try:
14
+ from audio_extractor import prepare_audio
15
+ from dialect_predector import analyze_video_accent
16
+ except ImportError as e:
17
+ st.error(f"Error importing modules: {e}")
18
+ st.stop()
19
 
20
  # Page configuration
21
  st.set_page_config(
22
+ page_title="🎤 Accent Analyzer",
23
  page_icon="🎤",
24
  layout="wide",
25
  initial_sidebar_state="expanded"
26
  )
27
 
28
+ # Custom CSS for better styling
29
  st.markdown("""
30
  <style>
31
  .main-header {
 
 
 
 
32
  text-align: center;
33
+ color: #1f77b4;
34
+ font-size: 3rem;
35
+ font-weight: bold;
36
  margin-bottom: 2rem;
 
37
  }
38
+ .metric-container {
39
+ background-color: #f0f2f6;
40
+ padding: 1rem;
41
+ border-radius: 0.5rem;
 
 
42
  margin: 0.5rem 0;
 
43
  }
44
+ .success-box {
45
+ background-color: #d4edda;
46
+ border: 1px solid #c3e6cb;
47
+ color: #155724;
48
+ padding: 1rem;
49
+ border-radius: 0.5rem;
50
  margin: 1rem 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  }
52
+ .error-box {
53
+ background-color: #f8d7da;
54
+ border: 1px solid #f5c6cb;
55
+ color: #721c24;
56
+ padding: 1rem;
57
+ border-radius: 0.5rem;
58
+ margin: 1rem 0;
 
 
 
 
 
 
 
 
 
 
 
59
  }
60
+ .info-box {
61
+ background-color: #d1ecf1;
62
+ border: 1px solid #bee5eb;
63
+ color: #0c5460;
64
  padding: 1rem;
65
+ border-radius: 0.5rem;
66
+ margin: 1rem 0;
67
  }
68
  </style>
69
  """, unsafe_allow_html=True)
70
 
71
def initialize_session_state():
    """Seed every session-state key this app relies on with its default value."""
    defaults = {
        'analysis_results': None,
        'processing': False,
        'uploaded_file_path': None,
    }
    # Only fill in keys that are missing so reruns never clobber live state.
    for key, default in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = default
 
 
 
 
79
 
80
def save_uploaded_file(uploaded_file):
    """Persist a Streamlit upload to a fresh temporary directory.

    Args:
        uploaded_file: Streamlit ``UploadedFile`` object (has ``.name`` and
            ``.getbuffer()``).

    Returns:
        str | None: Path of the saved copy, or None on failure (an error
        message is shown in the UI in that case).
    """
    try:
        temp_dir = tempfile.mkdtemp()
        # The filename comes from the client. Strip any directory components
        # so a name like "../../x" cannot escape the temp dir, and a name
        # with subfolders cannot fail on a missing directory.
        safe_name = os.path.basename(uploaded_file.name)
        file_path = os.path.join(temp_dir, safe_name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        return file_path
    except Exception as e:
        st.error(f"Error saving uploaded file: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
def create_confidence_chart(chunk_results):
    """Build a per-chunk confidence bar chart; returns None when there is no data."""
    if not chunk_results:
        return None

    # One row per analyzed chunk, tagged by whether it cleared the threshold.
    rows = [
        {
            'Chunk': entry['chunk'],
            'Confidence': entry['confidence'],
            'Accent': entry['accent'],
            'Is Confident': '✓ Confident' if entry['is_confident'] else '✗ Low Confidence',
        }
        for entry in chunk_results
    ]
    frame = pd.DataFrame(rows)

    palette = {'✓ Confident': '#28a745', '✗ Low Confidence': '#dc3545'}
    chart = px.bar(
        frame,
        x='Chunk',
        y='Confidence',
        color='Is Confident',
        hover_data=['Accent'],
        title='Confidence Scores by Chunk',
        color_discrete_map=palette,
    )
    chart.update_layout(
        xaxis_title="Chunk Number",
        yaxis_title="Confidence Score",
        showlegend=True,
        height=400,
    )
    return chart
124
 
125
def create_accent_distribution_chart(accent_counts, title="Accent Distribution"):
    """Render an accent-frequency pie chart; returns None for an empty mapping."""
    if not accent_counts:
        return None

    labels = list(accent_counts.keys())
    sizes = list(accent_counts.values())

    chart = px.pie(
        values=sizes,
        names=labels,
        title=title,
        color_discrete_sequence=px.colors.qualitative.Set3,
    )
    chart.update_traces(textposition='inside', textinfo='percent+label')
    chart.update_layout(height=400)
    return chart
142
 
143
def display_results(results):
    """Render the analysis results UI: headline accent, metric cards, charts,
    and expandable detail tables.

    Args:
        results (dict): Output of ``analyze_video_accent``. Must contain a
            ``success`` flag; on success the prediction, count, and timing
            keys are read directly.

    Shows an error box and returns early when ``results['success']`` is falsy.
    """
    if not results['success']:
        st.markdown(f'<div class="error-box">❌ <strong>Error:</strong> {results["error"]}</div>',
                    unsafe_allow_html=True)
        return

    # Main result
    st.markdown(f'<div class="success-box">🎤 <strong>Detected Accent:</strong> {results["predicted_accent"]}</div>',
                unsafe_allow_html=True)

    # Guard the percentage denominators: a degenerate run can report zero
    # processed/confident chunks, which previously raised ZeroDivisionError.
    processed_total = max(results['processed_chunks_count'], 1)
    confident_total = max(results['confident_chunks_count'], 1)

    # Key metrics
    col1, col2, col3, col4 = st.columns(4)

    with col1:
        st.metric(
            label="🎯 Confidence Score",
            value=f"{results['confidence_score']:.3f}",
            delta=f"{results['confidence_percentage']}"
        )

    with col2:
        st.metric(
            label="📊 Chunks Processed",
            value=f"{results['processed_chunks_count']}/{results['available_chunks_count']}",
            delta="Early stopped" if results.get('early_stopped', False) else "Complete"
        )

    with col3:
        st.metric(
            label=" Confident Predictions",
            value=results['confident_chunks_count'],
            delta=f"{(results['confident_chunks_count'] / processed_total * 100):.1f}%"
        )

    with col4:
        st.metric(
            label="⏱️ Processing Time",
            value=f"{results['processing_time']:.1f}s",
            delta=f"{results.get('duration_minutes', 0):.1f}min video"
        )

    # Detailed Analysis
    st.subheader("📈 Detailed Analysis")

    # Two columns: confidence bars on the left, confident-only pie on the right.
    chart_col1, chart_col2 = st.columns(2)

    with chart_col1:
        confidence_chart = create_confidence_chart(results['chunk_results'])
        if confidence_chart:
            st.plotly_chart(confidence_chart, use_container_width=True)

    with chart_col2:
        confident_chart = create_accent_distribution_chart(
            results['confident_accent_counts'],
            "Confident Predictions Distribution"
        )
        if confident_chart:
            st.plotly_chart(confident_chart, use_container_width=True)

    # Only show the all-predictions breakdown when it differs from the
    # confident-only one (otherwise it would be a duplicate chart).
    if results['all_accent_counts'] != results['confident_accent_counts']:
        st.subheader("📊 All Predictions (Including Low Confidence)")
        all_chart = create_accent_distribution_chart(
            results['all_accent_counts'],
            "All Predictions Distribution"
        )
        if all_chart:
            st.plotly_chart(all_chart, use_container_width=True)

    # Detailed chunk results table
    with st.expander("🔍 View Detailed Chunk Results"):
        chunk_df = pd.DataFrame(results['chunk_results'])
        st.dataframe(chunk_df, use_container_width=True)

    # Summary statistics
    with st.expander("📋 Summary Statistics"):
        col1, col2 = st.columns(2)

        with col1:
            st.write("**Confident Predictions:**")
            for accent, count in results['confident_accent_counts'].items():
                percentage = (count / confident_total) * 100
                st.write(f"• {accent}: {count} chunks ({percentage:.1f}%)")

        with col2:
            st.write("**All Predictions:**")
            for accent, count in results['all_accent_counts'].items():
                percentage = (count / processed_total) * 100
                st.write(f"• {accent}: {count} chunks ({percentage:.1f}%)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
def main():
    """Top-level Streamlit page: sidebar config, input selection, analysis run,
    and results rendering."""
    initialize_session_state()

    # --- Header -----------------------------------------------------------
    st.markdown('<h1 class="main-header">🎤 Accent Analyzer</h1>', unsafe_allow_html=True)
    st.markdown("Analyze accents from video files, URLs, or audio sources using advanced AI models.")

    # --- Sidebar configuration -------------------------------------------
    st.sidebar.header("⚙️ Configuration")

    confidence_threshold = st.sidebar.slider(
        "Confidence Threshold",
        min_value=0.1,
        max_value=0.9,
        value=0.6,
        step=0.05,
        help="Only predictions above this threshold are considered confident"
    )

    # NOTE(review): this toggle is collected but never forwarded to
    # analyze_video_accent — confirm whether the backend should receive it.
    early_stopping = st.sidebar.checkbox(
        "Enable Early Stopping",
        value=True,
        help="Stop processing when 3 consecutive confident predictions agree"
    )

    # --- Input selection --------------------------------------------------
    st.header("📥 Input Source")

    input_method = st.radio(
        "Choose input method:",
        ["URL (YouTube, Loom, etc.)", "Upload File"],
        horizontal=True
    )

    source = None

    if input_method == "URL (YouTube, Loom, etc.)":
        source = st.text_input(
            "Enter video URL:",
            placeholder="https://www.youtube.com/watch?v=...",
            help="Supports YouTube, Loom, and direct media URLs"
        )

        with st.expander("🔗 Supported URL Examples"):
            st.write("• YouTube: `https://www.youtube.com/watch?v=VIDEO_ID`")
            st.write("• YouTube Shorts: `https://www.youtube.com/shorts/VIDEO_ID`")
            st.write("• Loom: `https://www.loom.com/share/VIDEO_ID`")
            st.write("• Direct media files: `https://example.com/video.mp4`")
    else:
        # File-upload path: persist the upload to disk so the extractor can
        # treat it like any other local media file.
        upload = st.file_uploader(
            "Choose a video or audio file",
            type=['mp4', 'webm', 'avi', 'mov', 'mkv', 'm4v', '3gp', 'mp3', 'wav', 'm4a', 'aac', 'ogg', 'flac'],
            help="Upload video or audio files for accent analysis"
        )

        if upload is not None:
            with st.spinner("Saving uploaded file..."):
                source = save_uploaded_file(upload)
                st.session_state.uploaded_file_path = source

            if source:
                st.success(f"✅ File uploaded: {upload.name}")
            else:
                st.error("❌ Failed to save uploaded file")

    # --- Run analysis -----------------------------------------------------
    run_requested = st.button(
        "🚀 Start Analysis",
        type="primary",
        disabled=not source or st.session_state.processing,
        use_container_width=True
    )

    if run_requested and source:
        st.session_state.processing = True

        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            status_text.text("🎵 Extracting audio...")
            progress_bar.progress(20)

            status_text.text("🧠 Loading AI model...")
            progress_bar.progress(40)

            status_text.text("🔍 Analyzing accent...")
            progress_bar.progress(60)

            results = analyze_video_accent(source, confidence_threshold=confidence_threshold)

            progress_bar.progress(100)
            status_text.text("✅ Analysis complete!")

            # Keep results across reruns so they survive widget interactions.
            st.session_state.analysis_results = results

            # Let the "complete" message be visible briefly, then clear.
            time.sleep(1)
            progress_bar.empty()
            status_text.empty()
        except Exception as e:
            st.error(f"❌ Analysis failed: {str(e)}")
            progress_bar.empty()
            status_text.empty()
        finally:
            st.session_state.processing = False

    # --- Results ----------------------------------------------------------
    if st.session_state.analysis_results:
        st.header("📊 Results")
        display_results(st.session_state.analysis_results)

    # --- About ------------------------------------------------------------
    with st.expander("ℹ️ About This Tool"):
        st.markdown("""
        **Accent Analyzer** uses advanced machine learning models to identify accents from speech in videos and audio files.

        **Features:**
        - Supports multiple input sources (URLs, file uploads)
        - Smart chunking for efficient processing
        - Confidence-based predictions
        - Early stopping for faster results
        - Detailed analysis with visualizations

        **Supported Formats:**
        - **Video:** MP4, WebM, AVI, MOV, MKV, M4V, 3GP
        - **Audio:** MP3, WAV, M4A, AAC, OGG, FLAC
        - **URLs:** YouTube, Loom, direct media links

        **How it works:**
        1. Audio is extracted from the source
        2. Audio is chunked into smaller segments
        3. Each chunk is analyzed for accent features
        4. Results are aggregated with confidence scoring
        5. Final prediction is made based on confident predictions
        """)

    # --- Footer -----------------------------------------------------------
    st.markdown("---")
    st.markdown("Made with ❤️ using Streamlit and SpeechBrain")
 
 
 
 
 
387
 
388
  if __name__ == "__main__":
389
  main()
audio_extractor.py CHANGED
@@ -5,6 +5,9 @@ import warnings
5
  import time
6
  import shutil
7
  import random
 
 
 
8
 
9
  import torch
10
  import torchaudio
@@ -27,35 +30,430 @@ def suppress_stdout_stderr():
27
  sys.stdout = old_stdout
28
  sys.stderr = old_stderr
29
 
30
- def extract_audio_from_video_url(video_url):
31
- start_time = time.time()
32
- temp_dir = tempfile.mkdtemp()
33
- ydl_opts = {
34
- 'format': 'bestaudio[abr<=64]',
35
- 'postprocessors': [{
36
- 'key': 'FFmpegExtractAudio',
37
- 'preferredcodec': 'wav',
38
- 'preferredquality': '192',
39
- }],
40
- 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
41
- 'quiet': True,
42
- 'no_warnings': True,
43
- 'noplaylist': True,
44
- }
45
-
46
- with suppress_stdout_stderr():
47
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
48
- ydl.download([video_url])
49
-
50
- for file in os.listdir(temp_dir):
51
- if file.endswith('.wav'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  end_time = time.time()
53
- print(f"[⏱️] Audio extraction took {end_time - start_time:.2f} seconds.")
54
- return os.path.join(temp_dir, file)
55
- raise Exception("Failed to extract audio in WAV format")
 
 
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
59
  def smart_chunk_audio(waveform, sample_rate, duration_minutes):
60
  """Smart chunking based on video duration"""
61
  total_duration = waveform.size(1) / sample_rate
@@ -136,11 +534,11 @@ def chunk_audio_strategic(waveform, sample_rate, chunk_length_sec=25):
136
  print(f"📦 Strategic sampling: {len(chunks)} chunks from long video")
137
  return chunks
138
 
139
- def prepare_audio(video_url):
140
  """Main function to extract and prepare audio chunks"""
141
  try:
142
- print(f"🎵 Extracting audio from video...")
143
- audio_path = extract_audio_from_video_url(video_url)
144
  print(f"✅ Audio extracted to: {audio_path}")
145
 
146
  print(f"🎯 Loading and preparing audio...")
@@ -159,13 +557,6 @@ def prepare_audio(video_url):
159
  end = time.time()
160
  print(f"[⏱️] Audio preparation took {end - start:.2f} seconds.")
161
 
162
- # # Apply simple VAD
163
- # print(f"🎤 Applying Voice Activity Detection...")
164
- # start = time.time()
165
- # waveform = simple_vad(waveform, sample_rate)
166
- # end = time.time()
167
- # print(f"[⏱️] VAD took {end - start:.2f} seconds.")
168
-
169
  # Calculate duration and apply smart chunking
170
  duration_minutes = waveform.size(1) / sample_rate / 60
171
 
 
5
  import time
6
  import shutil
7
  import random
8
+ import requests
9
+ from urllib.parse import urlparse, unquote
10
+ from pathlib import Path
11
 
12
  import torch
13
  import torchaudio
 
30
  sys.stdout = old_stdout
31
  sys.stderr = old_stderr
32
 
33
+ class RobustAudioExtractor:
34
+ def __init__(self):
35
+ self.supported_video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv', '.m4v', '.3gp']
36
+ self.supported_audio_formats = ['.mp3', '.wav', '.m4a', '.aac', '.ogg', '.flac']
37
+ self.user_agents = [
38
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
39
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
40
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
41
+ ]
42
+
43
+ def extract_audio_from_source(self, source):
44
+ """
45
+ Extract audio from various sources:
46
+ - File path (uploaded file)
47
+ - Direct media URL (MP4, etc.)
48
+ - Loom URL
49
+ - Other video hosting URLs
50
+ """
51
+ start_time = time.time()
52
+
53
+ # Check if source is a file path
54
+ if self._is_file_path(source):
55
+ print(f"📁 Processing uploaded file: {source}")
56
+ return self._process_local_file(source, start_time)
57
+
58
+ # Check if source is a direct media URL
59
+ if self._is_direct_media_url(source):
60
+ print(f"🔗 Processing direct media URL: {source}")
61
+ return self._download_direct_media(source, start_time)
62
+
63
+ # Check if source is a Loom URL
64
+ if self._is_loom_url(source):
65
+ print(f"🎥 Processing Loom URL: {source}")
66
+ return self._extract_from_loom(source, start_time)
67
+
68
+ # Try with yt-dlp for other platforms (with robust error handling)
69
+ print(f"🌐 Processing URL with yt-dlp: {source}")
70
+ return self._extract_with_ytdlp_robust(source, start_time)
71
+
72
+ def _is_file_path(self, source):
73
+ """Check if source is a local file path"""
74
+ try:
75
+ path = Path(source)
76
+ return path.exists() and path.is_file()
77
+ except:
78
+ return False
79
+
80
+ def _is_direct_media_url(self, url):
81
+ """Check if URL points directly to a media file"""
82
+ try:
83
+ parsed = urlparse(url.lower())
84
+ path = unquote(parsed.path)
85
+ return any(path.endswith(ext) for ext in self.supported_video_formats + self.supported_audio_formats)
86
+ except:
87
+ return False
88
+
89
+ def _is_loom_url(self, url):
90
+ """Check if URL is a Loom video"""
91
+ return 'loom.com' in url.lower()
92
+
93
+ def _process_local_file(self, file_path, start_time):
94
+ """Process a local file (uploaded file)"""
95
+ try:
96
+ file_ext = Path(file_path).suffix.lower()
97
+
98
+ # If it's already an audio file, just return it
99
+ if file_ext in self.supported_audio_formats:
100
+ if file_ext == '.wav':
101
+ end_time = time.time()
102
+ print(f"[⏱️] Audio file processing took {end_time - start_time:.2f} seconds.")
103
+ return file_path
104
+ else:
105
+ # Convert to WAV
106
+ return self._convert_to_wav(file_path, start_time)
107
+
108
+ # If it's a video file, extract audio
109
+ elif file_ext in self.supported_video_formats:
110
+ return self._extract_audio_from_video_file(file_path, start_time)
111
+
112
+ else:
113
+ raise Exception(f"Unsupported file format: {file_ext}")
114
+
115
+ except Exception as e:
116
+ raise Exception(f"Failed to process local file: {str(e)}")
117
+
118
+ def _download_direct_media(self, url, start_time):
119
+ """Download direct media URL"""
120
+ temp_dir = tempfile.mkdtemp()
121
+
122
+ try:
123
+ headers = {
124
+ 'User-Agent': random.choice(self.user_agents),
125
+ 'Accept': '*/*',
126
+ 'Accept-Language': 'en-US,en;q=0.9',
127
+ 'Accept-Encoding': 'gzip, deflate, br',
128
+ 'Connection': 'keep-alive',
129
+ 'Upgrade-Insecure-Requests': '1',
130
+ }
131
+
132
+ response = requests.get(url, headers=headers, stream=True, timeout=30)
133
+ response.raise_for_status()
134
+
135
+ # Determine file extension
136
+ content_type = response.headers.get('content-type', '').lower()
137
+ if 'video' in content_type:
138
+ if 'mp4' in content_type:
139
+ ext = '.mp4'
140
+ elif 'webm' in content_type:
141
+ ext = '.webm'
142
+ else:
143
+ ext = '.mp4' # default
144
+ elif 'audio' in content_type:
145
+ if 'mpeg' in content_type or 'mp3' in content_type:
146
+ ext = '.mp3'
147
+ elif 'wav' in content_type:
148
+ ext = '.wav'
149
+ else:
150
+ ext = '.mp3' # default
151
+ else:
152
+ # Try to get from URL
153
+ parsed_url = urlparse(url)
154
+ url_ext = Path(parsed_url.path).suffix.lower()
155
+ ext = url_ext if url_ext in self.supported_video_formats + self.supported_audio_formats else '.mp4'
156
+
157
+ downloaded_file = os.path.join(temp_dir, f'downloaded{ext}')
158
+
159
+ with open(downloaded_file, 'wb') as f:
160
+ for chunk in response.iter_content(chunk_size=8192):
161
+ if chunk:
162
+ f.write(chunk)
163
+
164
+ print(f"✅ Downloaded {os.path.getsize(downloaded_file) / 1024 / 1024:.1f}MB")
165
+
166
+ # Process the downloaded file
167
+ if ext in self.supported_audio_formats:
168
+ if ext == '.wav':
169
+ end_time = time.time()
170
+ print(f"[⏱️] Direct audio download took {end_time - start_time:.2f} seconds.")
171
+ return downloaded_file
172
+ else:
173
+ return self._convert_to_wav(downloaded_file, start_time)
174
+ else:
175
+ return self._extract_audio_from_video_file(downloaded_file, start_time)
176
+
177
+ except Exception as e:
178
+ if os.path.exists(temp_dir):
179
+ shutil.rmtree(temp_dir, ignore_errors=True)
180
+ raise Exception(f"Failed to download direct media: {str(e)}")
181
+
182
+ def _extract_from_loom(self, url, start_time):
183
+ """Extract audio from Loom with multiple strategies"""
184
+ strategies = [
185
+ self._loom_strategy_basic,
186
+ self._loom_strategy_embed,
187
+ self._loom_strategy_api,
188
+ ]
189
+
190
+ for i, strategy in enumerate(strategies):
191
+ try:
192
+ print(f"Trying Loom strategy {i+1}...")
193
+ result = strategy(url, start_time)
194
+ if result:
195
+ return result
196
+ time.sleep(1) # Brief delay between strategies
197
+ except Exception as e:
198
+ print(f"Loom strategy {i+1} failed: {str(e)}")
199
+ continue
200
+
201
+ raise Exception("Failed to extract audio from Loom URL with all strategies")
202
+
203
+ def _loom_strategy_basic(self, url, start_time):
204
+ """Basic Loom extraction using yt-dlp"""
205
+ temp_dir = tempfile.mkdtemp()
206
+ ydl_opts = {
207
+ 'format': 'bestaudio[abr<=128]/best[height<=720]',
208
+ 'postprocessors': [{
209
+ 'key': 'FFmpegExtractAudio',
210
+ 'preferredcodec': 'wav',
211
+ 'preferredquality': '192',
212
+ }],
213
+ 'outtmpl': os.path.join(temp_dir, 'loom_audio.%(ext)s'),
214
+ 'quiet': True,
215
+ 'no_warnings': True,
216
+ 'noplaylist': True,
217
+ 'http_headers': {
218
+ 'User-Agent': random.choice(self.user_agents)
219
+ }
220
+ }
221
+
222
+ with suppress_stdout_stderr():
223
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
224
+ ydl.download([url])
225
+
226
+ return self._find_audio_file(temp_dir, start_time)
227
+
228
+ def _loom_strategy_embed(self, url, start_time):
229
+ """Try Loom embed URL format"""
230
+ # Extract video ID from Loom URL
231
+ import re
232
+ loom_id_match = re.search(r'loom\.com/share/([a-zA-Z0-9]+)', url)
233
+ if loom_id_match:
234
+ video_id = loom_id_match.group(1)
235
+ embed_url = f"https://www.loom.com/embed/{video_id}"
236
+ return self._loom_strategy_basic(embed_url, start_time)
237
+ return None
238
+
239
+ def _loom_strategy_api(self, url, start_time):
240
+ """Try to get direct video URL from Loom"""
241
+ # This is a placeholder for a more sophisticated approach
242
+ # You might need to inspect Loom's network requests to find direct video URLs
243
+ return None
244
+
245
+ def _extract_with_ytdlp_robust(self, url, start_time):
246
+ """Robust yt-dlp extraction with multiple strategies"""
247
+ strategies = [
248
+ self._ytdlp_strategy_basic,
249
+ self._ytdlp_strategy_with_headers,
250
+ self._ytdlp_strategy_low_quality,
251
+ self._ytdlp_strategy_audio_only,
252
+ ]
253
+
254
+ for i, strategy in enumerate(strategies):
255
+ try:
256
+ print(f"Trying yt-dlp strategy {i+1}...")
257
+ result = strategy(url, start_time)
258
+ if result:
259
+ return result
260
+ time.sleep(random.uniform(1, 3))
261
+ except Exception as e:
262
+ print(f"yt-dlp strategy {i+1} failed: {str(e)}")
263
+ continue
264
+
265
+ raise Exception("Failed to extract audio with all yt-dlp strategies")
266
+
267
+ def _ytdlp_strategy_basic(self, url, start_time):
268
+ """Basic yt-dlp strategy"""
269
+ temp_dir = tempfile.mkdtemp()
270
+ ydl_opts = {
271
+ 'format': 'bestaudio[abr<=64]/worst',
272
+ 'postprocessors': [{
273
+ 'key': 'FFmpegExtractAudio',
274
+ 'preferredcodec': 'wav',
275
+ 'preferredquality': '192',
276
+ }],
277
+ 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
278
+ 'quiet': True,
279
+ 'no_warnings': True,
280
+ 'noplaylist': True,
281
+ }
282
+
283
+ with suppress_stdout_stderr():
284
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
285
+ ydl.download([url])
286
+
287
+ return self._find_audio_file(temp_dir, start_time)
288
+
289
+ def _ytdlp_strategy_with_headers(self, url, start_time):
290
+ """yt-dlp with browser-like headers"""
291
+ temp_dir = tempfile.mkdtemp()
292
+ ydl_opts = {
293
+ 'format': 'bestaudio[abr<=64]/worst',
294
+ 'postprocessors': [{
295
+ 'key': 'FFmpegExtractAudio',
296
+ 'preferredcodec': 'wav',
297
+ 'preferredquality': '192',
298
+ }],
299
+ 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
300
+ 'quiet': True,
301
+ 'no_warnings': True,
302
+ 'noplaylist': True,
303
+ 'http_headers': {
304
+ 'User-Agent': random.choice(self.user_agents),
305
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
306
+ 'Accept-Language': 'en-US,en;q=0.9',
307
+ 'Accept-Encoding': 'gzip, deflate',
308
+ 'Connection': 'keep-alive',
309
+ },
310
+ 'sleep_interval': 1,
311
+ 'max_sleep_interval': 3,
312
+ }
313
+
314
+ with suppress_stdout_stderr():
315
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
316
+ ydl.download([url])
317
+
318
+ return self._find_audio_file(temp_dir, start_time)
319
+
320
+ def _ytdlp_strategy_low_quality(self, url, start_time):
321
+ """yt-dlp with lowest quality to avoid detection"""
322
+ temp_dir = tempfile.mkdtemp()
323
+ ydl_opts = {
324
+ 'format': 'worstaudio/worst',
325
+ 'postprocessors': [{
326
+ 'key': 'FFmpegExtractAudio',
327
+ 'preferredcodec': 'wav',
328
+ 'preferredquality': '128',
329
+ }],
330
+ 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
331
+ 'quiet': True,
332
+ 'no_warnings': True,
333
+ 'noplaylist': True,
334
+ 'sleep_interval': 2,
335
+ 'max_sleep_interval': 5,
336
+ }
337
+
338
+ with suppress_stdout_stderr():
339
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
340
+ ydl.download([url])
341
+
342
+ return self._find_audio_file(temp_dir, start_time)
343
+
344
+ def _ytdlp_strategy_audio_only(self, url, start_time):
345
+ """yt-dlp targeting audio-only streams"""
346
+ temp_dir = tempfile.mkdtemp()
347
+ ydl_opts = {
348
+ 'format': 'bestaudio',
349
+ 'outtmpl': os.path.join(temp_dir, 'audio.%(ext)s'),
350
+ 'postprocessors': [{
351
+ 'key': 'FFmpegExtractAudio',
352
+ 'preferredcodec': 'wav',
353
+ 'preferredquality': '192',
354
+ }],
355
+ 'prefer_ffmpeg': True,
356
+ 'ignoreerrors': True,
357
+ 'quiet': True,
358
+ 'no_warnings': True,
359
+ }
360
+
361
+ with suppress_stdout_stderr():
362
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
363
+ ydl.download([url])
364
+
365
+ return self._find_audio_file(temp_dir, start_time)
366
+
367
+ def _extract_audio_from_video_file(self, video_file, start_time):
368
+ """Extract audio from video file using FFmpeg"""
369
+ temp_dir = tempfile.mkdtemp()
370
+ output_audio = os.path.join(temp_dir, 'extracted_audio.wav')
371
+
372
+ try:
373
+ import subprocess
374
+
375
+ # Use FFmpeg to extract audio
376
+ cmd = [
377
+ 'ffmpeg', '-i', video_file,
378
+ '-vn', # no video
379
+ '-acodec', 'pcm_s16le', # uncompressed WAV
380
+ '-ar', '16000', # 16kHz sample rate
381
+ '-ac', '1', # mono
382
+ '-y', # overwrite output file
383
+ output_audio
384
+ ]
385
+
386
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
387
+
388
+ if result.returncode == 0 and os.path.exists(output_audio):
389
+ end_time = time.time()
390
+ print(f"[⏱️] Audio extraction from video took {end_time - start_time:.2f} seconds.")
391
+ return output_audio
392
+ else:
393
+ raise Exception(f"FFmpeg failed: {result.stderr}")
394
+
395
+ except FileNotFoundError:
396
+ # Fallback to torchaudio if FFmpeg not available
397
+ return self._convert_to_wav(video_file, start_time)
398
+ except Exception as e:
399
+ raise Exception(f"Failed to extract audio from video: {str(e)}")
400
+
401
+ def _convert_to_wav(self, audio_file, start_time):
402
+ """Convert audio file to WAV format"""
403
+ try:
404
+ waveform, sample_rate = torchaudio.load(audio_file)
405
+
406
+ # Convert to mono if needed
407
+ if waveform.shape[0] > 1:
408
+ waveform = torch.mean(waveform, dim=0, keepdim=True)
409
+
410
+ # Resample to 16kHz if needed
411
+ if sample_rate != 16000:
412
+ waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
413
+
414
+ # Save as WAV
415
+ temp_dir = tempfile.mkdtemp()
416
+ output_wav = os.path.join(temp_dir, 'converted_audio.wav')
417
+ torchaudio.save(output_wav, waveform, 16000)
418
+
419
  end_time = time.time()
420
+ print(f"[⏱️] Audio conversion took {end_time - start_time:.2f} seconds.")
421
+ return output_wav
422
+
423
+ except Exception as e:
424
+ raise Exception(f"Failed to convert audio to WAV: {str(e)}")
425
 
426
+ def _find_audio_file(self, directory, start_time):
427
+ """Find the extracted audio file"""
428
+ audio_extensions = ['.wav', '.mp3', '.m4a', '.ogg', '.aac']
429
+
430
+ for file in os.listdir(directory):
431
+ if any(file.lower().endswith(ext) for ext in audio_extensions):
432
+ audio_path = os.path.join(directory, file)
433
+
434
+ # Convert to WAV if not already
435
+ if not file.lower().endswith('.wav'):
436
+ return self._convert_to_wav(audio_path, start_time)
437
+
438
+ end_time = time.time()
439
+ print(f"[⏱️] Audio extraction took {end_time - start_time:.2f} seconds.")
440
+ return audio_path
441
+
442
+ raise Exception("No audio file found after extraction")
443
 
444
def extract_audio_from_video_url(video_source):
    """Resolve *video_source* to a WAV file and return its path.

    Accepts every source kind RobustAudioExtractor understands:
    local file paths (uploaded files), direct media URLs, Loom share
    links, and any other URL yt-dlp can resolve.
    """
    return RobustAudioExtractor().extract_audio_from_source(video_source)
455
 
456
+ # Keep the existing chunking functions unchanged
457
  def smart_chunk_audio(waveform, sample_rate, duration_minutes):
458
  """Smart chunking based on video duration"""
459
  total_duration = waveform.size(1) / sample_rate
 
534
  print(f"📦 Strategic sampling: {len(chunks)} chunks from long video")
535
  return chunks
536
 
537
+ def prepare_audio(video_source):
538
  """Main function to extract and prepare audio chunks"""
539
  try:
540
+ print(f"🎵 Extracting audio from source...")
541
+ audio_path = extract_audio_from_video_url(video_source)
542
  print(f"✅ Audio extracted to: {audio_path}")
543
 
544
  print(f"🎯 Loading and preparing audio...")
 
557
  end = time.time()
558
  print(f"[⏱️] Audio preparation took {end - start:.2f} seconds.")
559
 
 
 
 
 
 
 
 
560
  # Calculate duration and apply smart chunking
561
  duration_minutes = waveform.size(1) / sample_rate / 60
562
 
requirements.txt CHANGED
@@ -8,6 +8,5 @@ IPython==7.34.0
8
  ffmpeg-python==0.2.0
9
  validators==0.35.0
10
  streamlit==1.45.1
11
- plotly==6.1.2
12
- pandas==2.2.3
13
- numpy==2.2.6
 
8
  ffmpeg-python==0.2.0
9
  validators==0.35.0
10
  streamlit==1.45.1
11
+
12
+