Create short_audio_transcribe.py
short_audio_transcribe.py  ADDED  (+91 -0)
import whisper
import os
import json
import torchaudio
import argparse
import torch
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--whisper_size", default="large")
    args = parser.parse_args()
    # assert torch.cuda.is_available(), "Please enable GPU in order to run Whisper!"
    model = whisper.load_model(args.whisper_size, device="cpu")
    parent_dir = "./custom_character_voice/"
    speaker_names = list(os.walk(parent_dir))[0][1]
    speaker_annos = []
    total_files = sum([len(files) for r, d, files in os.walk(parent_dir)])
    # resample audios
    # 2023/4/21: get the target sampling rate from the fine-tuning config
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    processed_files = 0
    for speaker in speaker_names:
        filelist = list(os.walk(parent_dir + speaker))[0][2]
        for i, wavfile in tqdm(enumerate(filelist), desc="Processing Audio:", total=len(filelist)):
            # skip clips that were already processed on a previous run
            if wavfile.startswith("processed_"):
                continue
            # try:
            # load the file as audio and downmix to mono
            wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0,
                                      num_frames=-1, normalize=True, channels_first=True)
            wav = wav.mean(dim=0).unsqueeze(0)
            if sr != target_sr:
                wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
            # skip clips longer than 20 seconds (duration measured at the target rate)
            if wav.shape[1] / target_sr > 20:
                print(f"{wavfile} too long, ignoring\n")
                continue
            save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
            torchaudio.save(save_path, wav, target_sr, channels_first=True)
            # transcribe text
            # lang, text = transcribe_one(save_path)
            audio = whisper.load_audio(save_path)
            audio = whisper.pad_or_trim(audio)

            # make log-Mel spectrogram and move to the same device as the model
            mel = whisper.log_mel_spectrogram(audio).to(model.device)

            options = whisper.DecodingOptions(beam_size=5, language="ja", fp16=False)
            result = whisper.decode(model, mel, options)

            text = "[JA]" + result.text + "[JA]\n"
            speaker_annos.append(save_path + "|" + speaker + "|" + text)

            processed_files += 1
            # print(f"Processed: {processed_files}/{total_files}")
            # except:
            #     print(f"Error occurred: {wavfile}")
            #     continue

    # # clean annotation
    # import argparse
    # import text
    # from utils import load_filepaths_and_text
    # for i, line in enumerate(speaker_annos):
    #     path, sid, txt = line.split("|")
    #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
    #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
    #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text

    # write into annotation
    if len(speaker_annos) == 0:
        print("Warning: no short audios found. This IS expected if you have only uploaded long audios, videos, or video links.")
        print("This IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
    with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            f.write(line)

    # import json
    # # generate new config
    # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
    #     hps = json.load(f)
    # # modify n_speakers
    # hps['data']["n_speakers"] = 1000 + len(speaker2id)
    # # add speaker names
    # for speaker in speaker_names:
    #     hps['speakers'][speaker] = speaker2id[speaker]
    # # save modified config
    # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
    #     json.dump(hps, f, indent=2)
    # print("finished")
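For reference, a typical invocation, assuming the directory layout the script expects (short clips under ./custom_character_voice/<speaker_name>/ and a ./configs/finetune_speaker.json that provides data.sampling_rate); the speaker name and transcription below are illustrative, not taken from the commit:

    python short_audio_transcribe.py --whisper_size medium

Each processed clip contributes one line to short_character_anno.txt in the form path|speaker|text, e.g.:

    ./custom_character_voice/speaker0/processed_0.wav|speaker0|[JA]こんにちは[JA]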