Spaces:
Sleeping
Sleeping
add smaller chunks
Browse files- audio_extractor.py +64 -13
audio_extractor.py
CHANGED
@@ -254,25 +254,63 @@ class SimpleAudioExtractor:
|
|
254 |
except Exception as e:
|
255 |
raise Exception(f"Failed to convert audio to WAV: {str(e)}")
|
256 |
|
257 |
-
def
|
258 |
-
"""Create
|
259 |
-
chunk_length_sec = 60 # 1 minute chunks
|
260 |
-
chunk_samples = chunk_length_sec * sample_rate
|
261 |
total_samples = waveform.size(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
262 |
chunks = []
|
|
|
263 |
|
264 |
for start in range(0, total_samples, chunk_samples):
|
265 |
end = min(start + chunk_samples, total_samples)
|
266 |
chunk = waveform[:, start:end]
|
267 |
-
|
268 |
-
if
|
|
|
269 |
chunks.append(chunk)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
270 |
|
271 |
-
print(f"📦 Created {len(chunks)}
|
272 |
return chunks
|
273 |
|
274 |
-
def prepare_audio(video_source):
|
275 |
-
"""Main function to extract and prepare
|
276 |
try:
|
277 |
print(f"🎵 Extracting audio from source...")
|
278 |
extractor = SimpleAudioExtractor()
|
@@ -295,21 +333,34 @@ def prepare_audio(video_source):
|
|
295 |
end = time.time()
|
296 |
print(f"[⏱️] Audio preparation took {end - start:.2f} seconds.")
|
297 |
|
298 |
-
# Calculate duration
|
299 |
duration_minutes = waveform.size(1) / sample_rate / 60
|
300 |
|
301 |
-
print(f"🧩 Creating
|
302 |
start = time.time()
|
303 |
-
chunks =
|
304 |
end = time.time()
|
305 |
print(f"[⏱️] Chunking took {end - start:.2f} seconds. Total chunks: {len(chunks)}")
|
306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
307 |
return {
|
308 |
"success": True,
|
309 |
"chunks": chunks,
|
310 |
"audio_path": audio_path,
|
311 |
"duration_minutes": duration_minutes,
|
312 |
-
"total_chunks": len(chunks)
|
|
|
|
|
313 |
}
|
314 |
|
315 |
except Exception as e:
|
|
|
254 |
except Exception as e:
|
255 |
raise Exception(f"Failed to convert audio to WAV: {str(e)}")
|
256 |
|
257 |
+
def chunk_audio_adaptive(waveform, sample_rate, min_chunk_sec=2):
|
258 |
+
"""Create adaptive chunks from audio, handling small voices and short audio"""
|
|
|
|
|
259 |
total_samples = waveform.size(1)
|
260 |
+
duration_sec = total_samples / sample_rate
|
261 |
+
|
262 |
+
print(f"🎵 Audio duration: {duration_sec:.2f} seconds ({duration_sec/60:.2f} minutes)")
|
263 |
+
|
264 |
+
# For very short audio (less than 5 seconds), return as single chunk
|
265 |
+
if duration_sec <= 5:
|
266 |
+
print(f"📦 Audio is very short ({duration_sec:.2f}s), keeping as single chunk")
|
267 |
+
return [waveform]
|
268 |
+
|
269 |
+
# For short audio (5-30 seconds), create smaller chunks
|
270 |
+
elif duration_sec <= 30:
|
271 |
+
chunk_length_sec = max(min_chunk_sec, duration_sec / 3) # Split into ~3 chunks
|
272 |
+
print(f"📦 Short audio detected, using {chunk_length_sec:.1f}s chunks")
|
273 |
+
|
274 |
+
# For medium audio (30s-2min), use 30-second chunks
|
275 |
+
elif duration_sec <= 120:
|
276 |
+
chunk_length_sec = 30
|
277 |
+
print(f"📦 Medium audio detected, using {chunk_length_sec}s chunks")
|
278 |
+
|
279 |
+
# For long audio (>2min), use 1-minute chunks
|
280 |
+
else:
|
281 |
+
chunk_length_sec = 60
|
282 |
+
print(f"📦 Long audio detected, using {chunk_length_sec}s chunks")
|
283 |
+
|
284 |
+
chunk_samples = int(chunk_length_sec * sample_rate)
|
285 |
chunks = []
|
286 |
+
min_samples = int(min_chunk_sec * sample_rate) # Minimum chunk size
|
287 |
|
288 |
for start in range(0, total_samples, chunk_samples):
|
289 |
end = min(start + chunk_samples, total_samples)
|
290 |
chunk = waveform[:, start:end]
|
291 |
+
|
292 |
+
# Include chunk if it meets minimum size OR if it's the last chunk and we have no chunks yet
|
293 |
+
if chunk.size(1) >= min_samples or (len(chunks) == 0 and start + chunk_samples >= total_samples):
|
294 |
chunks.append(chunk)
|
295 |
+
chunk_dur = chunk.size(1) / sample_rate
|
296 |
+
print(f" β Chunk {len(chunks)}: {chunk_dur:.2f}s")
|
297 |
+
else:
|
298 |
+
# If chunk is too small, merge it with the previous chunk if possible
|
299 |
+
if chunks:
|
300 |
+
print(f" π Merging small chunk ({chunk.size(1) / sample_rate:.2f}s) with previous")
|
301 |
+
chunks[-1] = torch.cat([chunks[-1], chunk], dim=1)
|
302 |
+
merged_dur = chunks[-1].size(1) / sample_rate
|
303 |
+
print(f" β Merged chunk {len(chunks)}: {merged_dur:.2f}s")
|
304 |
+
else:
|
305 |
+
# If no previous chunks, keep it anyway (better than losing audio)
|
306 |
+
chunks.append(chunk)
|
307 |
+
print(f" ⚠️ Keeping small chunk {len(chunks)}: {chunk.size(1) / sample_rate:.2f}s (no other chunks)")
|
308 |
|
309 |
+
print(f"📦 Created {len(chunks)} adaptive chunks")
|
310 |
return chunks
|
311 |
|
312 |
+
def prepare_audio(video_source, min_chunk_seconds=2):
|
313 |
+
"""Main function to extract and prepare adaptive audio chunks for small voices"""
|
314 |
try:
|
315 |
print(f"🎵 Extracting audio from source...")
|
316 |
extractor = SimpleAudioExtractor()
|
|
|
333 |
end = time.time()
|
334 |
print(f"[⏱️] Audio preparation took {end - start:.2f} seconds.")
|
335 |
|
336 |
+
# Calculate duration
|
337 |
duration_minutes = waveform.size(1) / sample_rate / 60
|
338 |
|
339 |
+
print(f"🧩 Creating adaptive chunks (min {min_chunk_seconds}s)...")
|
340 |
start = time.time()
|
341 |
+
chunks = chunk_audio_adaptive(waveform, sample_rate, min_chunk_seconds)
|
342 |
end = time.time()
|
343 |
print(f"[⏱️] Chunking took {end - start:.2f} seconds. Total chunks: {len(chunks)}")
|
344 |
|
345 |
+
# Log chunk details
|
346 |
+
print("π Chunk Summary:")
|
347 |
+
total_chunk_duration = 0
|
348 |
+
for i, chunk in enumerate(chunks, 1):
|
349 |
+
chunk_duration = chunk.size(1) / sample_rate
|
350 |
+
total_chunk_duration += chunk_duration
|
351 |
+
print(f" Chunk {i}: {chunk_duration:.2f}s")
|
352 |
+
|
353 |
+
print(f" Total chunked duration: {total_chunk_duration:.2f}s")
|
354 |
+
print(f" Original duration: {duration_minutes * 60:.2f}s")
|
355 |
+
|
356 |
return {
|
357 |
"success": True,
|
358 |
"chunks": chunks,
|
359 |
"audio_path": audio_path,
|
360 |
"duration_minutes": duration_minutes,
|
361 |
+
"total_chunks": len(chunks),
|
362 |
+
"chunk_details": [{"chunk_id": i+1, "duration_seconds": chunk.size(1) / sample_rate}
|
363 |
+
for i, chunk in enumerate(chunks)]
|
364 |
}
|
365 |
|
366 |
except Exception as e:
|