Lycoris53 committed on
Commit
18893bc
·
1 Parent(s): 9346eed

Create short_audio_transcribe.py

Browse files
Files changed (1) hide show
  1. short_audio_transcribe.py +91 -0
short_audio_transcribe.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import whisper
import os
import json
import torchaudio
import argparse
import torch
from tqdm import tqdm

# Script: for every speaker folder under ./custom_character_voice/, convert
# each clip to mono at the sampling rate given in
# ./configs/finetune_speaker.json, save it as processed_<i>.wav, transcribe it
# with Whisper (language fixed to "ja"), and write one
# "<path>|<speaker>|[JA]<text>[JA]" line per clip to short_character_anno.txt.
#
# Command line:
#   --whisper_size  name of the Whisper model to load (default: "large").

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--whisper_size", default="large")
    args = parser.parse_args()
    # assert (torch.cuda.is_available()), "Please enable GPU in order to run Whisper!"
    model = whisper.load_model(args.whisper_size, device="cpu")

    parent_dir = "./custom_character_voice/"
    # Only the top level is needed: its sub-directories are the speakers.
    # next() avoids walking the whole tree, and the default keeps a missing
    # directory from raising so the "no short audios" warning below is shown.
    speaker_names = next(os.walk(parent_dir), (parent_dir, [], []))[1]
    speaker_annos = []

    # resample audios
    # 2023/4/21: Get the target sampling rate
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']

    # The decoding options never change between files, so build them once.
    options = whisper.DecodingOptions(beam_size=5, language="ja", fp16=False)

    for speaker in speaker_names:
        speaker_dir = parent_dir + speaker + "/"
        # Snapshot of the files directly inside the speaker folder; the
        # processed_*.wav files written below are not part of this list.
        filelist = next(os.walk(speaker_dir), (speaker_dir, [], []))[2]
        for i, wavfile in tqdm(enumerate(filelist), desc="Processing Audio:", total=len(filelist)):
            # Outputs of a previous run live in the same folder; skip them.
            if wavfile.startswith("processed_"):
                continue

            # NOTE: loading is intentionally not wrapped in try/except (the
            # original handler is disabled), so an unreadable file aborts the
            # run instead of being skipped silently.
            wav, sr = torchaudio.load(speaker_dir + wavfile, frame_offset=0, num_frames=-1,
                                      normalize=True, channels_first=True)
            # Down-mix to a single channel while keeping the (1, samples) shape.
            wav = wav.mean(dim=0).unsqueeze(0)

            # Measure the duration with the rate the samples currently have
            # (before resampling), and really skip over-long clips as the
            # message says.
            if wav.shape[1] / sr > 20:
                print(f"{wavfile} too long, ignoring\n")
                continue

            if sr != target_sr:
                wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)

            save_path = speaker_dir + f"processed_{i}.wav"
            torchaudio.save(save_path, wav, target_sr, channels_first=True)

            # transcribe text
            audio = whisper.load_audio(save_path)
            audio = whisper.pad_or_trim(audio)

            # make log-Mel spectrogram and move to the same device as the model
            mel = whisper.log_mel_spectrogram(audio).to(model.device)
            result = whisper.decode(model, mel, options)

            text = "[JA]" + result.text + "[JA]\n"
            speaker_annos.append(save_path + "|" + speaker + "|" + text)

    # Disabled step kept for reference: clean annotation
    # import argparse
    # import text
    # from utils import load_filepaths_and_text
    # for i, line in enumerate(speaker_annos):
    #     path, sid, txt = line.split("|")
    #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
    #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
    #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text

    # write into annotation
    if not speaker_annos:
        print("Warning: no short audios found, this IS expected if you have only uploaded long audios, videos or video links.")
        print("this IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
    # The annotation file is written even when empty; every entry already
    # ends with a newline.
    with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
        f.writelines(speaker_annos)

    # Disabled step kept for reference: generate new config
    # import json
    # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
    #     hps = json.load(f)
    # # modify n_speakers
    # hps['data']["n_speakers"] = 1000 + len(speaker2id)
    # # add speaker names
    # for speaker in speaker_names:
    #     hps['speakers'][speaker] = speaker2id[speaker]
    # # save modified config
    # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
    #     json.dump(hps, f, indent=2)
    # print("finished")