"""Build per-speaker transcript lists from a TSV line index and duration stats.

For every locutor given on the command line the script reads
``line_index_<locutor>.tsv`` and ``<locutor>_sil_stats.csv``, keeps the clips
shorter than the duration limit, writes ``ca_<speaker>_{test,val,train}.txt``
files, and reports the longer clips in CSV and JSON side files.
"""
import os
import re
import json
import argparse
import logging
import csv

import numpy as np

logger = logging.getLogger(__name__)

# Clips whose duration (same unit as the stats file; the summary prints treat
# it as seconds) reaches this limit go to the "long sentences" outputs.
_MAX_CLIP_DURATION = 10.0


def main():
    """Parse the command line and process every requested locutor."""
    my_parser = argparse.ArgumentParser()
    my_parser.add_argument('--tsv-path',
                           metavar='path',
                           type=str,
                           required=True,
                           help='the path to tsv file')
    my_parser.add_argument('--wavs-path',
                           metavar='path',
                           type=str,
                           required=True,
                           help='the path to wavs file')
    my_parser.add_argument('--locutors',
                           metavar='N',
                           type=str,
                           required=True,
                           help='list of speakers names/id separated with commas')
    args = my_parser.parse_args()

    # Drop empty names produced by stray commas ("a,,b" or "a,").
    locutors = [name for name in args.locutors.replace(" ", "").split(",") if name]

    for locutor in locutors:
        _process_locutor(args, locutor)


def _process_locutor(args, locutor):
    """Classify one locutor's clips, write its output files, print a summary."""
    wavs_path = args.wavs_path
    stats_path = os.path.join(wavs_path, '%s_sil_stats.csv' % locutor)
    durations = get_durations_dict(stats_path)

    aggregate_duration = 0
    rejected_duration = 0
    large_duration = 0
    total_duration = 0

    # Built from the untouched CLI value on every call: appending to a shared
    # variable would corrupt the path from the second locutor onwards.
    tsv_path = os.path.join(args.tsv_path, "line_index_%s.tsv" % locutor)
    target_path = os.path.join(wavs_path, 'ca_es_%s_22k_sil_pad' % locutor)
    source_dir = os.path.join(wavs_path, 'ca_es_%s_22k_sil' % locutor)

    files = []
    long_files = []
    with open(tsv_path, newline='') as tsv_file:
        for row in csv.reader(tsv_file, delimiter="\t"):
            if not row:
                # Blank line in the index: nothing to classify.
                continue
            audio_filename = row[0] + ".wav"
            sentence = row[-1]
            duration = durations[audio_filename]
            # Every indexed clip counts towards the total so the final
            # sanity line can be compared with the sum of the categories.
            total_duration += duration

            if not sentence:
                rejected_duration += duration
                continue

            source_filename = os.path.join(source_dir, audio_filename)
            if not os.path.isfile(source_filename):
                # Indexed clip with no audio on disk: report its name only.
                print(audio_filename)
                continue

            if duration < _MAX_CLIP_DURATION:
                aggregate_duration += duration
                files.append((os.path.join(target_path, audio_filename), sentence))
            else:
                long_files.append((audio_filename, sentence))
                large_duration += duration

    for speaker_id in find_speakers_id(stats_path):
        speaker_files = files_spliter(files=files, speaker_id=speaker_id)
        if speaker_files:
            out(args, speaker_id=speaker_id, files=speaker_files)

    out_long(args, locutor, long_files)
    out_long_json(args, locutor, long_files)

    print(locutor, aggregate_duration/3600, 'hours')
    print(locutor, 'rejected due to duration', large_duration/3600, 'hours')
    print(locutor, 'rejected', rejected_duration/60, 'minutes')
    print(locutor, total_duration, aggregate_duration+rejected_duration+large_duration)


def get_durations_dict(filename):
    """Return ``{audio file name: duration}`` read from a comma-separated file.

    The first field of each line is a path of which only the part after the
    last ``/`` is kept as key; the second field is parsed as a float.
    Blank lines are ignored.
    """
    durations = {}
    with open(filename) as stats_file:
        for line in stats_file:
            if not line.strip():
                continue
            fields = line.split(',')
            durations[fields[0].split('/')[-1]] = float(fields[1])
    return durations


def get_sentence(filename):
    """Extract the quoted sentence from an utterance file.

    Returns the sentence with dashes that precede an upper-case letter
    replaced by spaces. Returns ``None`` (after printing the file name) when
    the file holds no quoted sentence or when the sentence contains a digit.
    """
    with open(filename, encoding="ISO-8859-1") as utt_file:
        utt_all = utt_file.read()

    m = re.search('(\"\\\\\")(.+)(\\\\\"\")', utt_all)
    if m is None:
        # No quoted sentence found: treat like any other rejected file
        # instead of failing with an AttributeError on ``None``.
        print(filename)
        return None

    # delete interword dashes
    sentence = re.sub('-(?=([A-Z]))', ' ', m.groups()[1])
    if re.search(r'\d', sentence):
        print(filename, sentence)
        return None
    return sentence


def out(args, speaker_id, files):
    """Write the ``ca_<speaker_id>_{test,val,train}.txt`` files.

    All entries currently go to the train file; test and val are created
    empty. Each line is ``<file stem>|<sentence>``. Entries are consumed from
    the end of ``files``, so the list is emptied in place and lines appear in
    reverse input order.

    Raises:
        ValueError: if the split sizes do not add up to ``len(files)``.
    """
    outname_length = [
        ('ca_%s_test.txt' % speaker_id, 0),
        ('ca_%s_val.txt' % speaker_id, 0),
        ('ca_%s_train.txt' % speaker_id, len(files)),
    ]
    l_sum = sum(length for _, length in outname_length)
    if len(files) != l_sum:
        msg = 'train vs test val distribution wrong: %i' % l_sum
        raise ValueError(msg)

    for fout, length in outname_length:
        with open(os.path.join(args.wavs_path, fout), 'w') as handle:
            for _ in range(length):
                path, sentence = files.pop()
                # splitext keeps dots that belong to the stem ("a.b.wav" -> "a.b").
                stem = os.path.splitext(path.split("/")[-1])[0]
                handle.write('%s|%s\n' % (stem, sentence))
    print(len(files))


def out_long(args, locutor, files):
    """Write ``<locutor>_longsentences.csv`` with one ``audio,"text"`` line per clip."""
    outname_path = os.path.join(args.wavs_path, '%s_longsentences.csv' % locutor)
    with open(outname_path, 'w') as handle:
        for audio, text in files:
            # Double embedded quotes so the quoted field stays valid CSV.
            handle.write('%s,"%s"\n' % (audio, text.replace('"', '""')))


def out_long_json(args, locutor, files):
    """Write ``<locutor>_longsentences.json`` describing the long clips.

    The document is ``{"session": [...]}`` where every item carries a
    ``text`` and a ``urls`` list of ``(locutor, value)`` pairs; the pairs are
    serialised as JSON arrays.
    """
    source = os.path.join(args.wavs_path, 'ca_es_%s_22k_sil/' % locutor)
    outname_path = os.path.join(args.wavs_path, '%s_longsentences.json' % locutor)
    interventions = [
        {
            'text': [(locutor, text)],
            'urls': [(locutor, os.path.join(source, audio))],
        }
        for audio, text in files
    ]
    with open(outname_path, 'w') as handle:
        json.dump({'session': interventions}, handle, indent=2)


def find_speakers_id(path_tsv):
    """Return the sorted unique speaker ids found in a stats file.

    The id is the second ``_``-separated token of each audio file name.
    """
    durations = get_durations_dict(path_tsv)
    speaker_ids = [name.split("_")[1] for name in durations]
    return np.unique(np.array(speaker_ids)).tolist()


def files_spliter(files, speaker_id):
    """Return the ``(path, sentence)`` entries whose file name belongs to ``speaker_id``."""
    return [
        element for element in files
        if element[0].split("/")[-1].split("_")[1] == speaker_id
    ]


if __name__ == "__main__":
    main()