Create short_audio_transcribe.py
short_audio_transcribe.py  ADDED  (+91 -0)
import whisper
import os
import json
import torchaudio
import argparse
import torch
from tqdm import tqdm

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--whisper_size", default="large")
    args = parser.parse_args()
    # assert torch.cuda.is_available(), "Please enable GPU in order to run Whisper!"
    model = whisper.load_model(args.whisper_size, device="cpu")
    parent_dir = "./custom_character_voice/"
    speaker_names = list(os.walk(parent_dir))[0][1]
    speaker_annos = []
    total_files = sum([len(files) for r, d, files in os.walk(parent_dir)])
    # resample audios
    # 2023/4/21: get the target sampling rate from the fine-tuning config
    with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
        hps = json.load(f)
    target_sr = hps['data']['sampling_rate']
    processed_files = 0
    for speaker in speaker_names:
        filelist = list(os.walk(parent_dir + speaker))[0][2]
        for i, wavfile in tqdm(enumerate(filelist), desc="Processing Audio:", total=len(filelist)):
            # skip clips that were already processed on a previous run
            if wavfile.startswith("processed_"):
                continue
            # try:
            # load the file as audio and downmix to mono
            wav, sr = torchaudio.load(parent_dir + speaker + "/" + wavfile, frame_offset=0,
                                      num_frames=-1, normalize=True, channels_first=True)
            wav = wav.mean(dim=0).unsqueeze(0)
            if sr != target_sr:
                wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(wav)
            # skip clips longer than 20 seconds (duration measured at the target rate)
            if wav.shape[1] / target_sr > 20:
                print(f"{wavfile} too long, ignoring\n")
                continue
            save_path = parent_dir + speaker + "/" + f"processed_{i}.wav"
            torchaudio.save(save_path, wav, target_sr, channels_first=True)
            # transcribe text
            # lang, text = transcribe_one(save_path)
            audio = whisper.load_audio(save_path)
            audio = whisper.pad_or_trim(audio)

            # make log-Mel spectrogram and move to the same device as the model
            mel = whisper.log_mel_spectrogram(audio).to(model.device)

            options = whisper.DecodingOptions(beam_size=5, language="ja", fp16=False)
            result = whisper.decode(model, mel, options)

            text = "[JA]" + result.text + "[JA]\n"
            speaker_annos.append(save_path + "|" + speaker + "|" + text)

            processed_files += 1
            # print(f"Processed: {processed_files}/{total_files}")
            # except:
            #     print(f"Error occurred: {wavfile}")
            #     continue

    # # clean annotation
    # import argparse
    # import text
    # from utils import load_filepaths_and_text
    # for i, line in enumerate(speaker_annos):
    #     path, sid, txt = line.split("|")
    #     cleaned_text = text._clean_text(txt, ["cjke_cleaners2"])
    #     cleaned_text += "\n" if not cleaned_text.endswith("\n") else ""
    #     speaker_annos[i] = path + "|" + sid + "|" + cleaned_text

    # write into annotation
    if len(speaker_annos) == 0:
        print("Warning: no short audios found. This IS expected if you have only uploaded long audios, videos, or video links.")
        print("This IS NOT expected if you have uploaded a zip file of short audios. Please check your file structure or make sure your audio language is supported.")
    with open("short_character_anno.txt", 'w', encoding='utf-8') as f:
        for line in speaker_annos:
            f.write(line)

    # import json
    # # generate new config
    # with open("./configs/finetune_speaker.json", 'r', encoding='utf-8') as f:
    #     hps = json.load(f)
    # # modify n_speakers
    # hps['data']["n_speakers"] = 1000 + len(speaker2id)
    # # add speaker names
    # for speaker in speaker_names:
    #     hps['speakers'][speaker] = speaker2id[speaker]
    # # save modified config
    # with open("./configs/modified_finetune_speaker.json", 'w', encoding='utf-8') as f:
    #     json.dump(hps, f, indent=2)
    # print("finished")
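For reference, a typical invocation, assuming the directory layout the script expects (short clips under ./custom_character_voice/<speaker_name>/ and a ./configs/finetune_speaker.json that provides data.sampling_rate); the speaker name and transcription below are illustrative, not taken from the commit:

    python short_audio_transcribe.py --whisper_size medium

Each processed clip contributes one line to short_character_anno.txt in the form path|speaker|text, e.g.:

    ./custom_character_voice/speaker0/processed_0.wav|speaker0|[JA]こんにちは[JA]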