Hemant0000 committed
Commit 6544c6d · verified · 1 Parent(s): cb84bf3

Create speech_edit.py

Files changed (1)
  speech_edit.py  +189 -0
speech_edit.py ADDED
@@ -0,0 +1,189 @@
import os

import torch
import torch.nn.functional as F
import torchaudio
from vocos import Vocos

from model import CFM, UNetT, DiT
from model.utils import (
    load_checkpoint,
    get_tokenizer,
    convert_char_to_pinyin,
    save_spectrogram,
)

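# Pick the best available backend: CUDA if present, then Apple MPS, otherwise CPU.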
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"


# --------------------- Dataset Settings -------------------- #

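# Mel-spectrogram settings; assumed to match the F5-TTS training configuration
# (24 kHz audio, 100 mel bins, hop length 256). target_rms is the reference loudness
# used to normalize quiet input audio below (restored after generation).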
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
target_rms = 0.1

tokenizer = "pinyin"
dataset_name = "Emilia_ZH_EN"


# ---------------------- infer setting ---------------------- #

seed = None  # int | None

exp_name = "F5TTS_Base"  # F5TTS_Base | E2TTS_Base
ckpt_step = 1200000

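# Sampling knobs: nfe_step is the number of ODE solver steps (function evaluations),
# cfg_strength the classifier-free guidance scale, and sway_sampling_coef the sway
# sampling coefficient from the F5-TTS paper (negative values spend more steps early
# in the flow).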
nfe_step = 32  # 16, 32
cfg_strength = 2.0
ode_method = "euler"  # euler | midpoint
sway_sampling_coef = -1.0
speed = 1.0

if exp_name == "F5TTS_Base":
    model_cls = DiT
    model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)

elif exp_name == "E2TTS_Base":
    model_cls = UNetT
    model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)

ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.safetensors"
output_dir = "tests"

# [leverage https://github.com/MahmoudAshraf97/ctc-forced-aligner to get char-level alignment]
# pip install git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
# [write the origin_text into a file, e.g. tests/test_edit.txt]
# ctc-forced-aligner --audio_path "tests/ref_audio/test_en_1_ref_short.wav" --text_path "tests/test_edit.txt" --language "zho" --romanize --split_size "char"
# [the result will be saved next to the audio file]
# [--language "zho" for Chinese, "eng" for English]
# [for a local ckpt, set --alignment_model "../checkpoints/mms-300m-1130-forced-aligner"]

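# Editing spec: origin_text is what the reference audio currently says, target_text is what
# it should say after editing. parts_to_edit lists [start, end] spans in seconds (taken from
# the forced alignment above) to regenerate; fix_duration optionally pins the length of each
# regenerated span in seconds (None keeps each original span's duration).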
audio_to_edit = "tests/ref_audio/test_en_1_ref_short.wav"
origin_text = "Some call me nature, others call me mother nature."
target_text = "Some call me optimist, others call me realist."
parts_to_edit = [
    [1.42, 2.44],
    [4.04, 4.9],
]  # start_ends of "nature" & "mother nature", in seconds
fix_duration = [
    1.2,
    1,
]  # fix duration for "optimist" & "realist", in seconds

# audio_to_edit = "tests/ref_audio/test_zh_1_ref_short.wav"
# origin_text = "对,这就是我,万人敬仰的太乙真人。"
# target_text = "对,那就是你,万人敬仰的太白金星。"
# parts_to_edit = [[0.84, 1.4], [1.92, 2.4], [4.26, 6.26]]
# fix_duration = None  # use origin text duration


# -------------------------------------------------#

use_ema = True

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Vocoder model
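# (Vocos decodes the generated 100-bin mel-spectrogram back to a 24 kHz waveform; set
#  local = True to load a local checkpoint instead of pulling charactr/vocos-mel-24khz
#  from the Hugging Face Hub.)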
local = False
if local:
    vocos_local_path = "../checkpoints/charactr/vocos-mel-24khz"
    vocos = Vocos.from_hparams(f"{vocos_local_path}/config.yaml")
    state_dict = torch.load(f"{vocos_local_path}/pytorch_model.bin", weights_only=True, map_location=device)
    vocos.load_state_dict(state_dict)

    vocos.eval()
else:
    vocos = Vocos.from_pretrained("charactr/vocos-mel-24khz")

# Tokenizer
vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)

# Model
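# (CFM wraps the chosen backbone (DiT or UNetT) as a conditional flow-matching model;
#  mel_spec_kwargs mirrors the dataset settings above, and odeint_kwargs selects the solver.)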
model = CFM(
    transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
    mel_spec_kwargs=dict(
        target_sample_rate=target_sample_rate,
        n_mel_channels=n_mel_channels,
        hop_length=hop_length,
    ),
    odeint_kwargs=dict(
        method=ode_method,
    ),
    vocab_char_map=vocab_char_map,
).to(device)

model = load_checkpoint(model, ckpt_path, device, use_ema=use_ema)

# Audio
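# (Mix multi-channel audio down to mono, boost quiet audio up to target_rms, and resample
#  to 24 kHz so the reference matches the model's expected input.)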
audio, sr = torchaudio.load(audio_to_edit)
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)
rms = torch.sqrt(torch.mean(torch.square(audio)))
if rms < target_rms:
    audio = audio * target_rms / rms
if sr != target_sample_rate:
    resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
    audio = resampler(audio)
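# Build a frame-level edit mask over the conditioning audio: True frames are kept from the
# original recording, False frames (the spans in parts_to_edit, stretched to fix_duration if
# given) are left for the model to regenerate. audio_ is the spliced waveform with silence
# in place of the edited spans.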
offset = 0
audio_ = torch.zeros(1, 0)
edit_mask = torch.zeros(1, 0, dtype=torch.bool)
for part in parts_to_edit:
    start, end = part
    part_dur = end - start if fix_duration is None else fix_duration.pop(0)
    part_dur = part_dur * target_sample_rate
    start = start * target_sample_rate
    audio_ = torch.cat((audio_, audio[:, round(offset) : round(start)], torch.zeros(1, round(part_dur))), dim=-1)
    edit_mask = torch.cat(
        (
            edit_mask,
            torch.ones(1, round((start - offset) / hop_length), dtype=torch.bool),
            torch.zeros(1, round(part_dur / hop_length), dtype=torch.bool),
        ),
        dim=-1,
    )
    offset = end * target_sample_rate
# audio = torch.cat((audio_, audio[:, round(offset):]), dim = -1)
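# Pad the mask with True so every frame after the last edited span is kept as-is. Note the
# spliced audio_ is left unused here (the concatenation above is commented out); the original
# audio is passed as cond, and edit_mask alone marks which frames to regenerate.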
edit_mask = F.pad(edit_mask, (0, audio.shape[-1] // hop_length - edit_mask.shape[-1] + 1), value=True)
audio = audio.to(device)
edit_mask = edit_mask.to(device)

# Text
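# (Generation is conditioned on target_text; with the pinyin tokenizer, characters are
#  converted to pinyin before tokenization.)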
text_list = [target_text]
if tokenizer == "pinyin":
    final_text_list = convert_char_to_pinyin(text_list)
else:
    final_text_list = [text_list]
print(f"text  : {text_list}")
print(f"pinyin: {final_text_list}")

# Duration
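# (ref_audio_len = 0 means nothing is trimmed from the generated mel as a reference prompt;
#  duration is the total output length in mel frames, taken from the conditioning audio.)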
ref_audio_len = 0
duration = audio.shape[-1] // hop_length

# Inference
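# (model.sample integrates the flow ODE for nfe_step steps with classifier-free guidance and
#  sway sampling; seed = None draws a fresh random seed, and trajectory, the returned ODE path,
#  is unused here.)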
with torch.inference_mode():
    generated, trajectory = model.sample(
        cond=audio,
        text=final_text_list,
        duration=duration,
        steps=nfe_step,
        cfg_strength=cfg_strength,
        sway_sampling_coef=sway_sampling_coef,
        seed=seed,
        edit_mask=edit_mask,
    )
    print(f"Generated mel: {generated.shape}")

    # Final result
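    # (Permute the mel to [batch, n_mels, frames] for Vocos, decode to a waveform on CPU,
    #  and undo the earlier RMS boost so output loudness matches the source recording.)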
    generated = generated.to(torch.float32)
    generated = generated[:, ref_audio_len:, :]
    generated_mel_spec = generated.permute(0, 2, 1)
    generated_wave = vocos.decode(generated_mel_spec.cpu())
    if rms < target_rms:
        generated_wave = generated_wave * rms / target_rms

    save_spectrogram(generated_mel_spec[0].cpu().numpy(), f"{output_dir}/speech_edit_out.png")
    torchaudio.save(f"{output_dir}/speech_edit_out.wav", generated_wave, target_sample_rate)
    print(f"Generated wav: {generated_wave.shape}")