Spaces:

cdactvm
/

Tamil_ASR_Demo

Sleeping

File size: 2,996 Bytes

# import torch
# import torchaudio
# from silero_vad import get_speech_timestamps, read_audio, save_audio

# def apply_silero_vad(audio_file_path):
#     """
#     Applies Silero VAD to an audio file and returns the processed audio
#     containing only the voiced segments.
#     """
#     # Load the Silero VAD model
#     model = torch.hub.load('snakers4/silero-vad', 'silero_vad', force_reload=True)
    
#     # Define helper utilities manually
#     def read_audio(path, sampling_rate=16000):
#         wav, sr = torchaudio.load(path)
#         if sr != sampling_rate:
#             wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sampling_rate)(wav)
#         return wav.squeeze(0)

#     def save_audio(path, tensor, sampling_rate=16000):
#         torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)

#     # Read the audio file
#     wav = read_audio(audio_file_path, sampling_rate=16000)

#     # Get timestamps for speech segments
#     speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=16000)

#     # If no speech detected, raise an exception
#     if not speech_timestamps:
#         raise Exception("No voiced frames detected using Silero VAD.")

#     # Combine the voiced segments
#     voiced_audio = torch.cat([wav[ts['start']:ts['end']] for ts in speech_timestamps])

#     # Save the processed audio if needed
#     save_audio('processed_voiced_audio.wav', voiced_audio, sampling_rate=16000)

#     # Convert to numpy bytes for further processing
#     return voiced_audio.numpy().tobytes()

# # Example usage
# try:
#     processed_audio = apply_silero_vad("path_to_your_audio.wav")
#     print("VAD completed successfully!")
# except Exception as e:
#     print(f"Error during Silero VAD processing: {e}")


import webrtcvad
import numpy as np
import librosa

def apply_vad(audio, sr, frame_duration=30, aggressiveness=3):
    '''
    Voice Activity Detection (VAD): keep only the speech frames of a signal.

    The signal is resampled to 16000 Hz when needed, converted to 16-bit
    PCM, cut into fixed-size frames, and every complete frame that WebRTC
    VAD classifies as speech is kept. The kept frames are concatenated
    back-to-back (silence between them is dropped, so timing is not
    preserved).

    Args:
        audio: Floating-point samples, nominally in [-1.0, 1.0]; values
            outside that range are clipped. Assumed to be a 1-D (mono)
            array -- framing slices along the first axis only.
        sr: Sample rate of ``audio`` in Hz.
        frame_duration: Frame length in milliseconds. WebRTC VAD itself
            only accepts 10, 20 or 30 ms frames.
        aggressiveness: Filtering mode handed to ``webrtcvad.Vad``
            (0 = least, 3 = most aggressive at rejecting non-speech).

    Returns:
        A float32 array holding the voiced frames. NOTE: the result is
        always at 16000 Hz, regardless of the ``sr`` that was passed in.

    Raises:
        ValueError: If ``frame_duration`` yields an empty frame, if the
            audio is shorter than one frame, or if no frame is voiced.
    '''
    # Resample to 16000 Hz if not already (recommended for better compatibility)
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Convert to the 16-bit PCM format expected by webrtcvad. Clip first:
    # samples beyond +/-1.0 (e.g. resampling overshoot) would otherwise wrap
    # around when cast to int16 and turn loud peaks into sign-flipped clicks.
    pcm = np.int16(np.clip(audio, -1.0, 1.0) * 32767)

    # Number of samples per frame for the requested duration.
    frame_size = int(sr * frame_duration / 1000)
    if frame_size % 2 != 0:
        frame_size -= 1  # Make sure it's even to avoid processing issues
    if frame_size <= 0:
        raise ValueError(
            f"frame_duration={frame_duration!r} ms yields an empty frame; "
            "WebRTC VAD expects 10, 20 or 30 ms."
        )

    # Only complete frames can be classified; a trailing partial frame is dropped.
    full_frames = len(pcm) // frame_size
    if full_frames == 0:
        raise ValueError(
            f"Audio is shorter than one {frame_duration} ms frame; "
            "nothing to run VAD on."
        )

    # Build the detector only once we know there is something to classify.
    vad = webrtcvad.Vad(aggressiveness)

    # Filter out non-speech frames
    voiced_frames = []
    for start in range(0, full_frames * frame_size, frame_size):
        frame = pcm[start:start + frame_size]
        if vad.is_speech(frame.tobytes(), sample_rate=sr):
            voiced_frames.append(frame)

    # np.concatenate([]) fails with an opaque message; report the real cause.
    if not voiced_frames:
        raise ValueError("No voiced frames detected by WebRTC VAD.")

    # Concatenate the voiced frames and scale back to floating point.
    voiced_audio = np.concatenate(voiced_frames)
    return np.float32(voiced_audio) / 32767