"""Extract BigVGAN mel spectrograms and folder-name embeddings into an HDF5 file.

For every audio file found under a root directory, the script stores one HDF5
group containing:

* ``mel``  -- the mel spectrogram returned by BigVGAN's ``get_mel_spectrogram``
* ``meta`` -- the SentenceTransformer embedding of the file's parent folder name
"""

import argparse
import json
import os
import random
from types import SimpleNamespace

# Third-party imports keep their original relative order on purpose.
import torch
import h5py
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import librosa
from torch import nn
from einops import rearrange

from bigvgan_v2_22khz_80band_256x.meldataset import get_mel_spectrogram

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Sampling rate used when a caller does not pass one explicitly. Kept at the
# historical value so existing library callers see no change; the command-line
# entry point always passes ``--sr`` instead.
DEFAULT_SAMPLE_RATE = 24000

# File suffixes (case-sensitive) that are treated as audio.
AUDIO_EXTENSIONS = ('.wav', '.mp3', '.flac')

# Load SentenceTransformer model
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


class AudioToMel_bigvgan(nn.Module):
    """Thin ``nn.Module`` wrapper around BigVGAN's ``get_mel_spectrogram``."""

    def __init__(self, config_path):
        """
        :param config_path: Path to the BigVGAN ``config.json`` file
        """
        super().__init__()
        # Nested JSON objects become SimpleNamespace instances so the settings
        # can be read as attributes of ``self.h``.
        with open(config_path, 'r', encoding='utf-8') as f:
            self.h = json.load(f, object_hook=lambda d: SimpleNamespace(**d))

    def forward(self, audio):
        """
        Extract the mel spectrogram of ``audio``.

        Defined as ``forward`` (not ``__call__``) so that ``model(audio)``
        goes through ``nn.Module.__call__`` and its hook machinery.

        :param audio: Audio batch indexed as (batch, channel, time)
        :return: Output of ``get_mel_spectrogram`` for the first channel
        """
        return self.audio_to_mel(audio)

    def audio_to_mel(self, audio):
        """
        Take the first channel of ``audio`` and compute its mel spectrogram.

        :param audio: Audio batch indexed as (batch, channel, time)
        :return: Output of ``get_mel_spectrogram``
        """
        # Keep only the first channel: (batch, channel, time) -> (batch, time)
        audio = audio[:, 0, :]
        # ``as_tensor`` accepts both arrays and tensors without the copy (and
        # the UserWarning) that ``torch.tensor(existing_tensor)`` produces;
        # ``detach`` mirrors the detaching behaviour of the old copy.
        audio = torch.as_tensor(audio).detach()
        return get_mel_spectrogram(wav=audio, h=self.h)


# Initialize BigVGAN Mel extraction model
audio_to_mel_model = None  # Placeholder, will be initialized later


def extract_mel_features(audio_path, sr=DEFAULT_SAMPLE_RATE):
    """
    Extract Mel features using BigVGAN model, with normalization.

    :param audio_path: Path to the audio file
    :param sr: Sampling rate the audio is resampled to on load (default 24000).
        NOTE(review): this should presumably match the sampling rate in the
        BigVGAN config -- confirm against the config in use.
    :return: Mel spectrogram as a NumPy array
    :raises RuntimeError: If ``audio_to_mel_model`` has not been initialized
    :raises ValueError: If the file decodes to zero samples
    """
    if audio_to_mel_model is None:
        raise RuntimeError(
            "audio_to_mel_model is not initialized; assign an "
            "AudioToMel_bigvgan instance to it before extracting features."
        )

    # Load (and resample) the audio
    wav, _ = librosa.load(audio_path, sr=sr)
    if wav.size == 0:
        # np.max would fail on an empty array with an unrelated message.
        raise ValueError(f"Audio file contains no samples: {audio_path}")

    # Only rescale when the peak exceeds full scale; quieter audio is kept as is.
    max_val = np.max(np.abs(wav))
    if max_val > 1.0:
        wav = wav / max_val

    # Shape: (1, 1, T)
    wav_tensor = (
        torch.as_tensor(wav, dtype=torch.float32)
        .unsqueeze(0)
        .unsqueeze(0)
        .to(device)
    )

    # Extract Mel spectrogram; no gradients are needed for feature extraction.
    with torch.no_grad():
        mel_spectrogram = audio_to_mel_model(wav_tensor)
    return mel_spectrogram.cpu().numpy()


def get_embedding_from_folder_name(folder_name):
    """
    Convert folder name into embedding using SentenceTransformer.

    :param folder_name: Name of the folder
    :return: Result of ``sentence_model.encode([folder_name])``, or ``None``
        if encoding fails (the error is printed)
    """
    try:
        return sentence_model.encode([folder_name])
    except Exception as e:
        # Deliberately best-effort: one bad label must not stop the whole run.
        print(f"Error encoding label for {folder_name}: {e}")
        return None


def process_single_file(file_info, sr=DEFAULT_SAMPLE_RATE):
    """
    Process a single audio file and return its key, mel features, and meta embedding.

    :param file_info: (root_dir, audio_path) tuple
    :param sr: Sampling rate forwarded to ``extract_mel_features``
    :return: (key, mel_features, embedding), or (None, None, None) on failure
    """
    root_dir, audio_path = file_info
    try:
        # The parent folder name is the text that gets embedded.
        folder_name = os.path.basename(os.path.dirname(audio_path))

        # Extract Mel features
        mel_features = extract_mel_features(audio_path, sr=sr)

        # Get embedding from folder name
        embedding = get_embedding_from_folder_name(folder_name)
        if embedding is None:
            return None, None, None

        # Flatten the relative path into one name: path separators would
        # otherwise be interpreted by HDF5 as nested groups.
        key = (
            os.path.relpath(audio_path, root_dir)
            .replace('/', '_')
            .replace('\\', '_')
        )
        return key, mel_features, embedding
    except Exception as e:
        # Best-effort per file: report and let the caller skip this entry.
        print(f"Error processing {audio_path}: {e}")
        return None, None, None


def process_and_save_files(audio_files, output_h5_file, sr=DEFAULT_SAMPLE_RATE):
    """
    Process audio files and save Mel features and meta embeddings to an HDF5 file.

    :param audio_files: List of (root_dir, audio_path) tuples
    :param output_h5_file: Path to the HDF5 output file (overwritten if present)
    :param sr: Sampling rate forwarded to ``process_single_file``
    """
    with h5py.File(output_h5_file, 'w') as h5f:
        for file_info in tqdm(audio_files, desc="Processing audio files"):
            key, mel_features, embedding = process_single_file(file_info, sr=sr)
            if key is None or mel_features is None or embedding is None:
                continue

            # Different paths can flatten to the same key (e.g. "a/b_c.wav"
            # and "a_b/c.wav"); create_group would raise and abort the export.
            if key in h5f:
                print(
                    f"Skipping {file_info[1]}: key '{key}' already exists "
                    f"in {output_h5_file}"
                )
                continue

            group = h5f.create_group(key)
            group.create_dataset('mel', data=mel_features)
            group.create_dataset('meta', data=embedding)


def process_audio_files(root_dir, output_h5_file, sr=DEFAULT_SAMPLE_RATE):
    """
    Walk through a directory and process all audio files, saving them to an HDF5 file.

    :param root_dir: Root directory containing audio files
    :param output_h5_file: Path to the HDF5 output file
    :param sr: Sampling rate forwarded to ``process_and_save_files``
    """
    audio_files = [
        (root_dir, os.path.join(subdir, file))
        for subdir, _, files in os.walk(root_dir)
        for file in files
        if file.endswith(AUDIO_EXTENSIONS)
    ]

    random.shuffle(audio_files)
    print(f"Processing {len(audio_files)} files...")
    process_and_save_files(audio_files, output_h5_file, sr=sr)


if __name__ == "__main__":
    # Argument parser for command line arguments
    parser = argparse.ArgumentParser(description="Process audio files and extract mel features.")
    parser.add_argument('--root_dir', type=str, required=True, help='Root directory of the audio files.')
    parser.add_argument('--output_h5_file', type=str, required=True, help='Output HDF5 file path.')
    parser.add_argument('--config_path', type=str, required=True, help='Path to the BigVGAN config.json file.')
    parser.add_argument('--sr', type=int, default=22050, help='Sampling rate (default: 22050).')

    args = parser.parse_args()

    # Initialize the BigVGAN Mel extraction model
    audio_to_mel_model = AudioToMel_bigvgan(args.config_path).to(device)

    # Warn when the requested rate differs from the one declared in the config.
    # NOTE(review): assumes the config exposes a ``sampling_rate`` field; the
    # check is skipped when it does not.
    config_sr = getattr(audio_to_mel_model.h, 'sampling_rate', None)
    if config_sr is not None and config_sr != args.sr:
        print(
            f"Warning: --sr {args.sr} differs from the config sampling_rate "
            f"{config_sr}; mel features may not match the vocoder."
        )

    # Process audio files
    process_audio_files(args.root_dir, args.output_h5_file, sr=args.sr)

    print(f"Processing completed. H5 file saved at: {args.output_h5_file}")

### how to use
# python process_audio.py --root_dir /path/to/audio/files --output_h5_file /path/to/output.h5 --config_path bigvgan_v2_22khz_80band_256x/config.json --sr 22050