import argparse
import os
import re
import shutil
from glob import glob
from pathlib import Path
from subprocess import call

def main():
    """Convert the Google TTS and Festcat corpora into a VCTK-style layout.

    Command-line options:
        --google-path   root of the processed Google TTS data; its ``male/``
                        and ``female/`` subdirectories are passed to
                        ``convert_google``.
        --festcat-path  root of the processed Festcat data, passed to
                        ``convert_festcat``.
        --final-path    base directory that receives the converted data.

    A corpus whose path is omitted or does not exist on disk is reported and
    skipped. If at least one corpus is available but ``--final-path`` was not
    given, the parser reports a usage error and exits.
    """
    my_parser = argparse.ArgumentParser(
        description='Convert google_tts and festcat data to vctk format')
    my_parser.add_argument('--google-path',
                           metavar='path',
                           type=str,
                           help='root directory of the processed google_tts '
                                'data (contains male/ and female/)')
    my_parser.add_argument('--festcat-path',
                           metavar='path',
                           type=str,
                           help='root directory of the processed festcat data')
    # Common Voice conversion is currently disabled; re-enable together with
    # the convert_cv() call at the end of this function.
    #my_parser.add_argument('--cv-path',
    #                   metavar='path',
    #                   type=str,
    #                   help='root directory of the processed common voice data')
    my_parser.add_argument('--final-path',
                           metavar='path',
                           type=str,
                           help='base directory where the converted data '
                                'is written')
    args = my_parser.parse_args()
    google_path = args.google_path
    festcat_path = args.festcat_path
    #common_voice_path = args.cv_path
    target_base_path = args.final_path

    # An omitted option is None; os.path.exists(None) and None + str both
    # raise TypeError, so treat a missing option like a missing directory.
    google_found = google_path is not None and os.path.exists(google_path)
    festcat_found = festcat_path is not None and os.path.exists(festcat_path)

    # The output directory is only needed when something will be converted.
    if (google_found or festcat_found) and target_base_path is None:
        my_parser.error('--final-path is required when there is data to convert')

    if google_found:
        google_tts_male = google_path + "/male/"
        google_tts_female = google_path + "/female/"
        google_tts_paths = [google_tts_male, google_tts_female]
        print("Converting google_tts data to vctk format")
        convert_google(google_tts_paths, target_base_path)
    else:
        print("Google_tts processed data not found")

    if festcat_found:
        print("Converting festcat data to vctk format")
        convert_festcat(festcat_path, target_base_path)
    else:
        print("Festcat processed data not found")

    #convert_cv(common_voice_path, target_base_path)

def convert_google(google_tts_paths, target_base_path):
    """Convert Google TTS metadata and audio into the VCTK-style layout.

    For each processed directory, every ``*_*.txt`` metadata file is read
    line by line. Each non-blank line must have the form ``text_id|text``.
    The speaker id is the second ``_``-separated field of ``text_id``. The
    text is written to ``<target_base_path>/txt/<speaker>/<text_id>.txt`` and
    ``<g_path>/wavs/<text_id>.wav`` is copied to
    ``<target_base_path>/wav/<speaker>/<text_id>.wav`` via ``convert_meta``.

    Args:
        google_tts_paths: list of directories holding the metadata files and
            a ``wavs`` subdirectory.
        target_base_path: base directory of the converted data set.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            ``|``-separated fields.
        IndexError: if a ``text_id`` contains no underscore.
    """
    # Inverted punctuation marks are removed from the transcript.
    drop_inverted_marks = str.maketrans('', '', '¿¡')

    # NOTE(review): only the first entry is processed because of the [:1]
    # slice, so a second (e.g. female) directory is never converted.
    # Confirm whether this is intentional before removing the slice.
    for g_path in google_tts_paths[:1]:
        meta_files = glob(f"{g_path}/*_*.txt")
        for meta_file in meta_files:
            print(meta_file)
            with open(meta_file, encoding='utf-8') as meta:
                for line in meta:
                    line = line.strip()
                    if not line:
                        # Blank (e.g. trailing) lines carry no utterance.
                        continue
                    text_id, text = line.split('|')
                    # str is immutable: the cleaned value must be rebound.
                    text = text.translate(drop_inverted_marks)
                    #speaker_id =  '_'.join(text_id.split('_')[:2])
                    speaker_id = text_id.split('_')[1]
                    target_text_file = os.path.join(target_base_path, 'txt',
                                                    speaker_id, text_id+'.txt')
                    target_wav_file = os.path.join(target_base_path, 'wav',
                                                   speaker_id, text_id+'.wav')
                    source_wav_file = os.path.join(g_path, 'wavs',
                                                   text_id+'.wav')

                    speaker_paths = [os.path.dirname(target_text_file),
                                     os.path.dirname(target_wav_file)]

                    convert_meta(target_text_file, target_wav_file,
                                 source_wav_file, speaker_paths, text)

def convert_meta(target_text_file,
                 target_wav_file,
                 source_wav_file,
                 speaker_paths, text):
    """Write one utterance (transcript + audio) into the target layout.

    Args:
        target_text_file: path of the transcript file to (over)write.
        target_wav_file: path the audio file is copied to; an already
            existing file is left untouched.
        source_wav_file: path of the audio file to copy.
        speaker_paths: directories that must exist before writing; they are
            created together with any missing parents.
        text: transcript written verbatim to ``target_text_file``.

    Raises:
        FileNotFoundError: if ``source_wav_file`` does not exist. This is a
            subclass of ``IOError``/``OSError``. The check runs before
            anything is written, so no orphan transcript is left behind.
    """
    # os.path.isfile() returns False instead of raising, so the result has
    # to be tested explicitly.
    if not os.path.isfile(source_wav_file):
        raise FileNotFoundError('{} does not exist'.format(source_wav_file))

    # create directories (including missing parents such as <base>/txt)
    for speaker_path in speaker_paths:
        os.makedirs(speaker_path, exist_ok=True)

    # write text file
    with open(target_text_file, 'w', encoding='utf-8') as out:
        out.write(text)

    # copy wav file, never overwriting one that is already in place
    if not os.path.isfile(target_wav_file):
        shutil.copy(source_wav_file, target_wav_file)

def convert_festcat(festcat_path, target_base_path):
    """Convert Festcat metadata and audio into the VCTK-style layout.

    Every ``<festcat_path>/<speaker>/*_train.txt`` file is read line by line.
    The speaker id is the name of the directory containing the metadata file.
    Lines containing ``[`` are reported and skipped; every other non-blank
    line must have the form ``text_id|text``. The text is written to
    ``<target_base_path>/txt/<speaker>/<text_id>.txt`` and
    ``<festcat_path>/<speaker>/wavs/<text_id>.wav`` is copied to
    ``<target_base_path>/wav/<speaker>/<text_id>.wav`` via ``convert_meta``.

    Args:
        festcat_path: root directory with one subdirectory per speaker.
        target_base_path: base directory of the converted data set.

    Raises:
        ValueError: if a processed line does not split into exactly two
            ``|``-separated fields.
    """
    # Inverted punctuation marks are removed from the transcript.
    drop_inverted_marks = str.maketrans('', '', '¿¡')

    meta_files = glob(f"{festcat_path}/*/*_train.txt")
    for meta_file in meta_files:
        speaker_name = os.path.basename(os.path.dirname(meta_file))
        print(meta_file)
        with open(meta_file, encoding='utf-8') as meta:
            for line in meta:
                if not line.strip():
                    # Blank (e.g. trailing) lines carry no utterance.
                    continue
                if '[' in line:
                    # presumably bracketed annotations - TODO confirm
                    print('line: {} skipped'.format(line))
                    continue

                text_id, text = line.strip().split('|')
                # str is immutable: the cleaned value must be rebound.
                text = text.translate(drop_inverted_marks)
                #speaker_id =  '_'.join(text_id.split('_')[:3])
                target_text_file = os.path.join(target_base_path, 'txt',
                                                speaker_name, text_id+'.txt')
                target_wav_file = os.path.join(target_base_path, 'wav',
                                               speaker_name, text_id+'.wav')
                source_wav_file = os.path.join(festcat_path, speaker_name,
                                               'wavs', text_id+'.wav')

                speaker_paths = [os.path.dirname(target_text_file),
                                 os.path.dirname(target_wav_file)]

                convert_meta(target_text_file, target_wav_file,
                             source_wav_file, speaker_paths, text)

def convert_cv(common_voice_path, target_base_path):
    """Convert Common Voice metadata and audio into the VCTK-style layout.

    Every ``<common_voice_path>/*.txt`` file is read line by line. The
    speaker id is the file name with every ``ca_`` and ``.txt`` occurrence
    removed. Each non-blank line must have the form ``text_id|text``. The
    text is written to ``<target_base_path>/txt/<speaker>/<text_id>.txt`` and
    ``<common_voice_path>/wavs/<text_id>.wav`` is copied to
    ``<target_base_path>/wav/<speaker>/<text_id>.wav`` via ``convert_meta``.

    Args:
        common_voice_path: directory holding the per-speaker metadata files
            and a ``wavs`` subdirectory.
        target_base_path: base directory of the converted data set.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            ``|``-separated fields.
    """
    meta_files = glob(f"{common_voice_path}/*.txt")
    for meta_file in meta_files:
        print(meta_file)
        # NOTE(review): replace() removes "ca_" / ".txt" anywhere in the
        # name, not only as prefix / suffix - confirm names never contain
        # them elsewhere.
        speaker_id = meta_file.split(os.sep)[-1].replace("ca_","").replace(".txt","")
        with open(meta_file, encoding='utf-8') as meta:
            for line in meta:
                line = line.strip()
                if not line:
                    # Blank (e.g. trailing) lines carry no utterance.
                    continue
                text_id, text = line.split('|')

                target_text_file = os.path.join(target_base_path, 'txt',
                                                speaker_id, text_id+'.txt')
                target_wav_file = os.path.join(target_base_path, 'wav',
                                               speaker_id, text_id+'.wav')
                source_wav_file = os.path.join(common_voice_path,
                                               'wavs', text_id+'.wav')

                speaker_paths = [os.path.dirname(target_text_file),
                                 os.path.dirname(target_wav_file)]

                convert_meta(target_text_file, target_wav_file,
                             source_wav_file, speaker_paths, text)

# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()