Respair
/

XCodec2_24khz

Model card Files Files and versions Community

XCodec2_24khz / get_tsv.py

Respair's picture

Upload folder using huggingface_hub

59b7eeb verified 21 days ago

history blame contribute delete

3.3 kB

	from multiprocessing import Pool, cpu_count
	import os
	import torchaudio
	import torch
	from tqdm import tqdm

	def process_file(args):
	    """
	    Load one audio file and report its length, rejecting unusable files.

	    A file is rejected when it holds no samples, when any sample is NaN,
	    or when loading/inspecting it raises an exception.

	    Args:
	        args (tuple): ``(file_path, root_dir)``. Only ``file_path`` is used;
	            ``root_dir`` is unpacked but otherwise ignored.

	    Returns:
	        str or None: A newline-terminated, tab-separated record holding the
	        ABSOLUTE path and ``waveform.shape[1]``, or None if the file was
	        rejected.
	    """
	    file_path, root_dir = args
	    chunk_len = 10000  # number of samples inspected per NaN check

	    try:
	        resolved = os.path.abspath(file_path)
	        audio, _sample_rate = torchaudio.load(file_path)

	        total = audio.numel()
	        if total == 0:
	            return None

	        # Scan the flattened samples chunk by chunk and stop at the first NaN.
	        samples = audio.reshape(-1)
	        offset = 0
	        while offset < total:
	            # Slicing clamps at the end of the tensor, so the final chunk
	            # may simply be shorter than chunk_len.
	            if torch.isnan(samples[offset:offset + chunk_len]).any():
	                print(f"NaN found in: {resolved}")
	                return None
	            offset += chunk_len

	        return f"{resolved}\t{audio.shape[1]}\n"
	    except Exception as exc:
	        print(f"Error processing {file_path}: {exc}")
	        return None

	def _find_audio_files(root_dir, excluded):
	    """Return the sorted paths of .wav/.flac/.mp3 files under root_dir, skipping excluded subdirectories."""
	    found = []
	    for current, subdirs, filenames in os.walk(root_dir, topdown=True):
	        # Prune in place so os.walk never descends into an excluded directory.
	        subdirs[:] = [
	            d for d in subdirs
	            if os.path.abspath(os.path.join(current, d)) not in excluded
	        ]
	        found.extend(
	            os.path.join(current, name)
	            for name in filenames
	            if name.lower().endswith(('.wav', '.flac', '.mp3'))
	        )
	    found.sort()
	    return found


	def list_audio_files(root_dir, output_file, exclude_dirs=None):
	    """
	    Find audio files under a directory, process them in parallel with
	    ``process_file``, and write every non-empty result to ``output_file``.

	    Args:
	        root_dir (str): The root directory to search for audio files.
	        output_file (str): The path of the file to write (overwritten).
	        exclude_dirs (list, optional): Directories to skip. Entries are
	            compared as absolute paths against subdirectories met during
	            the walk. Empty entries (e.g. ``''``) are ignored, because
	            ``os.path.abspath('')`` would otherwise turn them into the
	            current working directory. Defaults to None (exclude nothing).
	    """
	    # A set gives constant-time membership tests during the walk; falsy
	    # entries are dropped so a placeholder '' cannot exclude the cwd.
	    excluded = {os.path.abspath(d) for d in (exclude_dirs or ()) if d}

	    print("Finding audio files...")
	    # process_file expects (file_path, root_dir) tuples.
	    audio_files = [(path, root_dir) for path in _find_audio_files(root_dir, excluded)]
	    print(f"Found {len(audio_files)} audio files to process.")

	    # Use half of the reported CPUs, but always at least one worker.
	    num_processes = max(1, cpu_count() // 2)
	    print(f"Starting processing with {num_processes} processes...")

	    with Pool(processes=num_processes) as pool:
	        # imap preserves input order, so the output follows the sorted paths.
	        results = list(tqdm(pool.imap(process_file, audio_files),
	                            total=len(audio_files),
	                            desc="Processing audio files"))

	    print(f"Writing results to {output_file}...")
	    with open(output_file, 'w', encoding='utf-8') as out:
	        # Rejected files come back as None and are skipped.
	        out.writelines(line for line in results if line)

	    print("Processing complete.")


	# Run the scan only when this file is executed as a script. Under the
	# multiprocessing "spawn" start method every worker re-imports this module,
	# so an unguarded call here would be re-executed inside each child process.
	if __name__ == "__main__":
	    root_directory = '/home/ubuntu/respair/test_wav'
	    output_tsv = '/home/ubuntu/X-Codec-2.0/audio_high_quality_TEST.txt'
	    # Nothing is excluded. An empty-string entry must not be used as a
	    # placeholder: os.path.abspath('') resolves to the current working directory.
	    exclude_folders = []

	    list_audio_files(root_directory, output_tsv, exclude_dirs=exclude_folders)