ignore cache
- .gitignore +3 -0
- chunck_time.py +0 -261
.gitignore CHANGED

@@ -106,3 +106,6 @@ env/
 # Misc
 *.bak
 *.swp
+
+chunk_time.py
+analyze.txt
chunck_time.py DELETED

@@ -1,261 +0,0 @@
import os
import sys
import warnings
import time
import statistics
from collections import Counter

import torch
import torchaudio
from speechbrain.inference.classifiers import EncoderClassifier

from audio_extractor import extract_audio_from_video_url

warnings.filterwarnings("ignore")
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'

def create_chunks_by_size(waveform, sample_rate, chunk_length_sec):
    """Create chunks of specific size"""
    chunk_samples = chunk_length_sec * sample_rate
    total_samples = waveform.size(1)
    chunks = []

    for start in range(0, total_samples, chunk_samples):
        end = min(start + chunk_samples, total_samples)
        chunk = waveform[:, start:end]
        if chunk.size(1) > sample_rate * 2:  # minimum 2 seconds
            chunks.append(chunk)
    return chunks

def predict_chunks_timing(chunks, classifier):
    """Time the prediction process for chunks"""
    if not chunks:
        return [], 0.0

    start_time = time.time()

    # Pad to same length
    max_len = max(chunk.size(1) for chunk in chunks)
    padded_chunks = [torch.nn.functional.pad(chunk, (0, max_len - chunk.size(1))) for chunk in chunks]
    batch = torch.cat(padded_chunks, dim=0).unsqueeze(1)
    batch = batch.squeeze(1)

    out_prob, score, index, text_lab = classifier.classify_batch(batch)

    end_time = time.time()
    prediction_time = end_time - start_time

    results = []
    for i in range(len(chunks)):
        results.append({
            "accent": text_lab[i],
            "confidence": score[i].item(),
        })

    return results, prediction_time

def analyze_chunk_size_performance(video_url, chunk_sizes=[10, 15, 20, 30, 60]):
    """Analyze performance for different chunk sizes"""
    print("🔍 Starting Chunk Size Performance Analysis")
    print("=" * 60)

    # Extract and prepare audio once
    print("🎵 Extracting and preparing audio...")
    audio_start = time.time()

    audio_path = extract_audio_from_video_url(video_url)
    waveform, sample_rate = torchaudio.load(audio_path)

    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
        sample_rate = 16000

    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # # Apply VAD
    # waveform = simple_vad(waveform, sample_rate)

    audio_end = time.time()
    audio_prep_time = audio_end - audio_start

    duration_minutes = waveform.size(1) / sample_rate / 60
    print(f"✅ Audio prepared in {audio_prep_time:.2f}s | Duration: {duration_minutes:.1f} minutes")

    # Load model once
    print("🧠 Loading model...")
    model_start = time.time()
    classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa")
    model_end = time.time()
    model_load_time = model_end - model_start
    print(f"✅ Model loaded in {model_load_time:.2f}s")

    print("\n" + "=" * 60)
    print("📊 CHUNK SIZE ANALYSIS RESULTS")
    print("=" * 60)

    results = []

    for chunk_size in chunk_sizes:
        print(f"\n🧩 Testing {chunk_size}-second chunks...")

        # Create chunks
        chunk_start = time.time()
        chunks = create_chunks_by_size(waveform, sample_rate, chunk_size)
        chunk_end = time.time()
        chunking_time = chunk_end - chunk_start

        if not chunks:
            print(f"❌ No valid chunks created for {chunk_size}s size")
            continue

        # Predict
        predictions, prediction_time = predict_chunks_timing(chunks, classifier)

        # Calculate statistics
        confidences = [p["confidence"] for p in predictions]
        accents = [p["accent"] for p in predictions]

        avg_confidence = statistics.mean(confidences) if confidences else 0
        max_confidence = max(confidences) if confidences else 0
        min_confidence = min(confidences) if confidences else 0
        std_confidence = statistics.stdev(confidences) if len(confidences) > 1 else 0

        # Most common accent
        accent_counts = Counter(accents)
        most_common_accent = accent_counts.most_common(1)[0] if accent_counts else ("Unknown", 0)

        # Calculate processing rates
        total_processing_time = chunking_time + prediction_time
        chunks_per_second = len(chunks) / total_processing_time if total_processing_time > 0 else 0
        seconds_per_chunk = total_processing_time / len(chunks) if len(chunks) > 0 else 0

        result = {
            "chunk_size": chunk_size,
            "num_chunks": len(chunks),
            "chunking_time": chunking_time,
            "prediction_time": prediction_time,
            "total_time": total_processing_time,
            "avg_confidence": avg_confidence,
            "max_confidence": max_confidence,
            "min_confidence": min_confidence,
            "std_confidence": std_confidence,
            "most_common_accent": most_common_accent[0],
            "accent_occurrence": most_common_accent[1],
            "chunks_per_second": chunks_per_second,
            "seconds_per_chunk": seconds_per_chunk,
            "confidence_consistency": 1 - (std_confidence / avg_confidence) if avg_confidence > 0 else 0
        }

        results.append(result)

        # Print results for this chunk size
        print(f" 📦 Chunks created: {len(chunks)}")
        print(f" ⏱️ Chunking time: {chunking_time:.3f}s")
        print(f" 🧠 Prediction time: {prediction_time:.3f}s")
        print(f" 🔄 Total processing: {total_processing_time:.3f}s")
        print(f" ⚡ Processing rate: {chunks_per_second:.1f} chunks/sec")
        print(f" 📈 Avg confidence: {avg_confidence:.3f}")
        print(f" 🎯 Most common: {most_common_accent[0]} ({most_common_accent[1]} times)")
        print(f" 📊 Confidence range: {min_confidence:.3f} - {max_confidence:.3f}")

    # Print summary comparison
    print("\n" + "=" * 80)
    print("📈 PERFORMANCE COMPARISON SUMMARY")
    print("=" * 80)

    if results:
        print(f"{'Size':<6} {'Chunks':<8} {'Total Time':<12} {'Rate':<12} {'Avg Conf':<10} {'Consistency':<12} {'Winner'}")
        print("-" * 80)

        for r in results:
            consistency = f"{r['confidence_consistency']:.2f}"
            print(f"{r['chunk_size']:<6} {r['num_chunks']:<8} {r['total_time']:<12.3f} {r['chunks_per_second']:<12.1f} {r['avg_confidence']:<10.3f} {consistency:<12} {r['most_common_accent']}")

    # Recommendations
    print("\n" + "=" * 60)
    print("🏆 RECOMMENDATIONS")
    print("=" * 60)

    if results:
        # Find best for speed
        fastest = min(results, key=lambda x: x['total_time'])
        print(f"⚡ Fastest processing: {fastest['chunk_size']}s chunks ({fastest['total_time']:.2f}s total)")

        # Find best for accuracy (highest average confidence)
        most_accurate = max(results, key=lambda x: x['avg_confidence'])
        print(f"🎯 Highest accuracy: {most_accurate['chunk_size']}s chunks ({most_accurate['avg_confidence']:.3f} avg confidence)")

        # Find most consistent
        most_consistent = max(results, key=lambda x: x['confidence_consistency'])
        print(f"📊 Most consistent: {most_consistent['chunk_size']}s chunks ({most_consistent['confidence_consistency']:.3f} consistency)")

        # Find best balance (speed + accuracy)
        for r in results:
            r['balance_score'] = (r['chunks_per_second'] * 0.4) + (r['avg_confidence'] * 100 * 0.6)

        best_balance = max(results, key=lambda x: x['balance_score'])
        print(f"⚖️ Best balance: {best_balance['chunk_size']}s chunks (score: {best_balance['balance_score']:.1f})")

    return results

def quick_test_multiple_videos(video_urls, chunk_sizes=[10, 15, 20, 30]):
    """Quick test on multiple videos to get average performance"""
    print("🔍 MULTI-VIDEO CHUNK SIZE ANALYSIS")
    print("=" * 60)

    all_results = {size: [] for size in chunk_sizes}

    for i, video_url in enumerate(video_urls, 1):
        print(f"\n📹 Testing Video {i}/{len(video_urls)}")
        try:
            video_results = analyze_chunk_size_performance(video_url, chunk_sizes)
            for result in video_results:
                all_results[result['chunk_size']].append(result)
        except Exception as e:
            print(f"❌ Error with video {i}: {str(e)}")
            continue

    # Calculate averages
    print("\n" + "=" * 60)
    print("📊 AVERAGE PERFORMANCE ACROSS ALL VIDEOS")
    print("=" * 60)

    avg_results = []
    for chunk_size in chunk_sizes:
        if all_results[chunk_size]:
            results = all_results[chunk_size]
            avg_result = {
                'chunk_size': chunk_size,
                'avg_total_time': statistics.mean([r['total_time'] for r in results]),
                'avg_chunks_per_sec': statistics.mean([r['chunks_per_second'] for r in results]),
                'avg_confidence': statistics.mean([r['avg_confidence'] for r in results]),
                'avg_consistency': statistics.mean([r['confidence_consistency'] for r in results]),
                'sample_count': len(results)
            }
            avg_results.append(avg_result)

    if avg_results:
        print(f"{'Size':<6} {'Samples':<8} {'Avg Time':<10} {'Avg Rate':<10} {'Avg Conf':<10} {'Consistency'}")
        print("-" * 60)
        for r in avg_results:
            print(f"{r['chunk_size']:<6} {r['sample_count']:<8} {r['avg_total_time']:<10.2f} {r['avg_chunks_per_sec']:<10.1f} {r['avg_confidence']:<10.3f} {r['avg_consistency']:.3f}")

    return avg_results

if __name__ == "__main__":
    # Test with single video
    video_url = "https://www.youtube.com/watch?v=-JTq1BFBwmo&list=PLDN4rrl48XKpZkf03iYFl-O29szjTrs_O&index=2"

    print("🚀 Starting Single Video Analysis...")
    results = analyze_chunk_size_performance(video_url)

    # Uncomment below to test multiple videos
    # print("\n" + "="*60)
    # print("🚀 Starting Multi-Video Analysis...")
    # video_urls = [
    #     "https://www.youtube.com/watch?v=VIDEO1",
    #     "https://www.youtube.com/watch?v=VIDEO2",
    #     # Add more video URLs here
    # ]
    # multi_results = quick_test_multiple_videos(video_urls)