Amr-h committed on
Commit
0c590b4
·
1 Parent(s): 5ac9990
Files changed (1) hide show
  1. audio_extractor.py +17 -60
audio_extractor.py CHANGED
@@ -254,63 +254,33 @@ class SimpleAudioExtractor:
254
  except Exception as e:
255
  raise Exception(f"Failed to convert audio to WAV: {str(e)}")
256
 
257
- def chunk_audio_adaptive(waveform, sample_rate, min_chunk_sec=2):
258
- """Create adaptive chunks from audio, handling small voices and short audio"""
259
  total_samples = waveform.size(1)
260
  duration_sec = total_samples / sample_rate
261
 
262
- print(f"🎡 Audio duration: {duration_sec:.2f} seconds ({duration_sec/60:.2f} minutes)")
263
-
264
- # For very short audio (less than 5 seconds), return as single chunk
265
- if duration_sec <= 5:
266
- print(f"πŸ“¦ Audio is very short ({duration_sec:.2f}s), keeping as single chunk")
267
  return [waveform]
268
 
269
- # For short audio (5-30 seconds), create smaller chunks
270
- elif duration_sec <= 30:
271
- chunk_length_sec = max(min_chunk_sec, duration_sec / 3) # Split into ~3 chunks
272
- print(f"πŸ“¦ Short audio detected, using {chunk_length_sec:.1f}s chunks")
273
-
274
- # For medium audio (30s-2min), use 30-second chunks
275
- elif duration_sec <= 120:
276
- chunk_length_sec = 30
277
- print(f"πŸ“¦ Medium audio detected, using {chunk_length_sec}s chunks")
278
-
279
- # For long audio (>2min), use 1-minute chunks
280
- else:
281
- chunk_length_sec = 60
282
- print(f"πŸ“¦ Long audio detected, using {chunk_length_sec}s chunks")
283
-
284
- chunk_samples = int(chunk_length_sec * sample_rate)
285
  chunks = []
286
- min_samples = int(min_chunk_sec * sample_rate) # Minimum chunk size
287
 
288
  for start in range(0, total_samples, chunk_samples):
289
  end = min(start + chunk_samples, total_samples)
290
  chunk = waveform[:, start:end]
291
-
292
- # Include chunk if it meets minimum size OR if it's the last chunk and we have no chunks yet
293
- if chunk.size(1) >= min_samples or (len(chunks) == 0 and start + chunk_samples >= total_samples):
294
  chunks.append(chunk)
295
- chunk_dur = chunk.size(1) / sample_rate
296
- print(f" βœ“ Chunk {len(chunks)}: {chunk_dur:.2f}s")
297
- else:
298
- # If chunk is too small, merge it with the previous chunk if possible
299
- if chunks:
300
- print(f" πŸ“Ž Merging small chunk ({chunk.size(1) / sample_rate:.2f}s) with previous")
301
- chunks[-1] = torch.cat([chunks[-1], chunk], dim=1)
302
- merged_dur = chunks[-1].size(1) / sample_rate
303
- print(f" βœ“ Merged chunk {len(chunks)}: {merged_dur:.2f}s")
304
- else:
305
- # If no previous chunks, keep it anyway (better than losing audio)
306
- chunks.append(chunk)
307
- print(f" ⚠️ Keeping small chunk {len(chunks)}: {chunk.size(1) / sample_rate:.2f}s (no other chunks)")
308
 
309
- print(f"πŸ“¦ Created {len(chunks)} adaptive chunks")
310
  return chunks
311
 
312
- def prepare_audio(video_source, min_chunk_seconds=2):
313
- """Main function to extract and prepare adaptive audio chunks for small voices"""
314
  try:
315
  print(f"🎡 Extracting audio from source...")
316
  extractor = SimpleAudioExtractor()
@@ -333,34 +303,21 @@ def prepare_audio(video_source, min_chunk_seconds=2):
333
  end = time.time()
334
  print(f"[⏱️] Audio preparation took {end - start:.2f} seconds.")
335
 
336
- # Calculate duration
337
  duration_minutes = waveform.size(1) / sample_rate / 60
338
 
339
- print(f"🧩 Creating adaptive chunks (min {min_chunk_seconds}s)...")
340
  start = time.time()
341
- chunks = chunk_audio_adaptive(waveform, sample_rate, min_chunk_seconds)
342
  end = time.time()
343
  print(f"[⏱️] Chunking took {end - start:.2f} seconds. Total chunks: {len(chunks)}")
344
 
345
- # Log chunk details
346
- print("πŸ“‹ Chunk Summary:")
347
- total_chunk_duration = 0
348
- for i, chunk in enumerate(chunks, 1):
349
- chunk_duration = chunk.size(1) / sample_rate
350
- total_chunk_duration += chunk_duration
351
- print(f" Chunk {i}: {chunk_duration:.2f}s")
352
-
353
- print(f" Total chunked duration: {total_chunk_duration:.2f}s")
354
- print(f" Original duration: {duration_minutes * 60:.2f}s")
355
-
356
  return {
357
  "success": True,
358
  "chunks": chunks,
359
  "audio_path": audio_path,
360
  "duration_minutes": duration_minutes,
361
- "total_chunks": len(chunks),
362
- "chunk_details": [{"chunk_id": i+1, "duration_seconds": chunk.size(1) / sample_rate}
363
- for i, chunk in enumerate(chunks)]
364
  }
365
 
366
  except Exception as e:
 
254
  except Exception as e:
255
  raise Exception(f"Failed to convert audio to WAV: {str(e)}")
256
 
257
def chunk_audio_1min(waveform, sample_rate, short_audio_threshold=30):
    """Split a waveform into 1-minute chunks, keeping short audio whole.

    Args:
        waveform: Object whose sample count is read with ``waveform.size(1)``
            and which is sliced along its second dimension (presumably a
            ``(channels, samples)`` tensor -- confirm against the loader).
        sample_rate: Samples per second. May be an int or a float; sample
            counts derived from it are truncated to integers.
        short_audio_threshold: Audio whose duration in seconds is at most
            this value is returned unsplit as a single chunk.

    Returns:
        A list of chunks. Short audio yields ``[waveform]``. Longer audio is
        cut every 60 seconds; a trailing chunk shorter than 10 seconds is
        dropped, except when it would be the only chunk, so that audio
        longer than the threshold never produces an empty list.
    """
    total_samples = waveform.size(1)
    duration_sec = total_samples / sample_rate

    # If audio is short (≤30 seconds by default), return as single chunk
    if duration_sec <= short_audio_threshold:
        print(f"πŸ“¦ Short audio ({duration_sec:.2f}s), keeping as single chunk")
        return [waveform]

    # For longer audio, use 1-minute chunks
    chunk_length_sec = 60
    # range() and slicing need integers; sample_rate may arrive as a float.
    chunk_samples = int(chunk_length_sec * sample_rate)
    # Chunks must be at least 10 seconds long (inclusive) to be kept.
    min_samples = int(10 * sample_rate)

    chunks = []
    for start in range(0, total_samples, chunk_samples):
        end = min(start + chunk_samples, total_samples)
        chunk = waveform[:, start:end]
        # Keep a too-short chunk only when it is the sole chunk; otherwise a
        # small short_audio_threshold could make the whole recording vanish.
        is_only_chunk = not chunks and end >= total_samples
        if chunk.size(1) >= min_samples or is_only_chunk:
            chunks.append(chunk)

    print(f"πŸ“¦ Created {len(chunks)} 1-minute chunks")
    return chunks
281
 
282
+ def prepare_audio(video_source, short_audio_threshold=30):
283
+ """Main function to extract and prepare audio chunks, handling short audio as single segment"""
284
  try:
285
  print(f"🎡 Extracting audio from source...")
286
  extractor = SimpleAudioExtractor()
 
303
  end = time.time()
304
  print(f"[⏱️] Audio preparation took {end - start:.2f} seconds.")
305
 
306
+ # Calculate duration and create chunks
307
  duration_minutes = waveform.size(1) / sample_rate / 60
308
 
309
+ print(f"🧩 Creating chunks (short audio threshold: {short_audio_threshold}s)...")
310
  start = time.time()
311
+ chunks = chunk_audio_1min(waveform, sample_rate, short_audio_threshold)
312
  end = time.time()
313
  print(f"[⏱️] Chunking took {end - start:.2f} seconds. Total chunks: {len(chunks)}")
314
 
 
 
 
 
 
 
 
 
 
 
 
315
  return {
316
  "success": True,
317
  "chunks": chunks,
318
  "audio_path": audio_path,
319
  "duration_minutes": duration_minutes,
320
+ "total_chunks": len(chunks)
 
 
321
  }
322
 
323
  except Exception as e: