Update app.py
app.py
CHANGED
@@ -9,7 +9,7 @@ import numpy as np
 from pydub import AudioSegment
 import spaces

-#
+# Load model and config
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
@@ -22,7 +22,7 @@ model = build_model(model_params, stage='DiT')
 hop_length = config['preprocess_params']['spect_params']['hop_length']
 sr = config['preprocess_params']['sr']

-#
+# Load checkpoint
 model, _, _, _ = load_checkpoint(model, None, dit_checkpoint_path,
 load_only_params=True, ignore_modules=[], is_distributed=False)
 for key in model:
@@ -30,7 +30,7 @@ for key in model:
 model[key].to(device)
 model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)

-#
+# Load additional modules
 from modules.campplus.DTDNN import CAMPPlus

 campplus_ckpt_path = load_custom_model_from_hf("funasr/campplus", "campplus_cn_common.bin", config_filename=None)
@@ -55,7 +55,7 @@ from modules.bigvgan import bigvgan

 bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_v2_22khz_80band_256x', use_cuda_kernel=False)

-#
+# Remove weight norm from the model and set it to eval mode
 bigvgan_model.remove_weight_norm()
 bigvgan_model = bigvgan_model.eval().to(device)

@@ -79,7 +79,7 @@ elif speech_tokenizer_type == 'facodec':
 _ = [codec_encoder[key].eval() for key in codec_encoder]
 _ = [codec_encoder[key].to(device) for key in codec_encoder]

-#
+# Generate mel spectrograms
 mel_fn_args = {
 "n_fft": config['preprocess_params']['spect_params']['n_fft'],
 "win_size": config['preprocess_params']['spect_params']['win_length'],
@@ -105,7 +105,7 @@ from modules.audio import mel_spectrogram
 to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)
 to_mel_f0 = lambda x: mel_spectrogram(x, **mel_fn_args_f0)

-# f0
+# f0-conditioned model
 dit_checkpoint_path, dit_config_path = load_custom_model_from_hf("Plachta/Seed-VC",
 "DiT_seed_v2_uvit_facodec_small_wavenet_f0_bigvgan_pruned.pth",
 "config_dit_mel_seed_facodec_small_wavenet_f0.yml")
@@ -116,7 +116,7 @@ model_f0 = build_model(model_params, stage='DiT')
 hop_length = config['preprocess_params']['spect_params']['hop_length']
 sr = config['preprocess_params']['sr']

-#
+# Load checkpoint
 model_f0, _, _, _ = load_checkpoint(model_f0, None, dit_checkpoint_path,
 load_only_params=True, ignore_modules=[], is_distributed=False)
 for key in model_f0:
@@ -124,7 +124,7 @@ for key in model_f0:
 model_f0[key].to(device)
 model_f0.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)

-# f0
+# f0 extractor
 from modules.rmvpe import RMVPE

 model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
@@ -140,7 +140,7 @@ def crossfade(chunk1, chunk2, overlap):
 chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
 return chunk2

-#
+# Streaming and chunk-processing parameters
 max_context_window = sr // hop_length * 30
 overlap_frame_len = 64
 overlap_wave_len = overlap_frame_len * hop_length
@@ -152,19 +152,19 @@ bitrate = "320k"
 def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):
 inference_module = model if not f0_condition else model_f0
 mel_fn = to_mel if not f0_condition else to_mel_f0
-#
+# Load audio
 source_audio = librosa.load(source, sr=sr)[0]
 ref_audio = librosa.load(target, sr=sr)[0]

-#
+# Process audio
 source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
 ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)

-#
+# Resample
 source_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
 ref_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)

-#
+# Extract features
 if speech_tokenizer_type == 'cosyvoice':
 S_alt = cosyvoice_frontend.extract_speech_token(source_waves_16k)[0]
 S_ori = cosyvoice_frontend.extract_speech_token(ref_waves_16k)[0]
@@ -189,7 +189,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):
 S_alt_chunks.append(S_alt)
 S_alt = torch.cat(S_alt_chunks, dim=-1)

-# S_ori
+# S_ori should be extracted in the same way
 waves_24k = torchaudio.functional.resample(ref_audio, sr, 24000)
 waves_input = waves_24k.unsqueeze(1)
 z = codec_encoder.encoder(waves_input)
@@ -235,7 +235,7 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):
 # mean_log_f0_ori = torch.mean(voiced_log_f0_ori)
 # mean_log_f0_alt = torch.mean(voiced_log_f0_alt)

-#
+# Shift the alt log f0 level to the ori log f0 level
 shifted_log_f0_alt = log_f0_alt.clone()
 if auto_f0_adjust:
 shifted_log_f0_alt[F0_alt > 1] = log_f0_alt[F0_alt > 1] - median_log_f0_alt + median_log_f0_ori
@@ -247,20 +247,20 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):
 F0_alt = None
 shifted_f0_alt = None

-#
+# Length regulation
 cond = inference_module.length_regulator(S_alt, ylens=target_lengths, n_quantizers=int(n_quantizers), f0=shifted_f0_alt)[0]
 prompt_condition = inference_module.length_regulator(S_ori, ylens=target2_lengths, n_quantizers=int(n_quantizers), f0=F0_ori)[0]

 max_source_window = max_context_window - mel2.size(2)
-#
+# Split the source condition (cond) into chunks
 processed_frames = 0
 generated_wave_chunks = []
-#
+# Generate chunk by chunk and stream the output
 while processed_frames < cond.size(1):
 chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
 is_last_chunk = processed_frames + max_source_window >= cond.size(1)
 cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
-#
+# Voice conversion
 vc_target = inference_module.cfm.inference(cat_condition,
 torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
 mel2, style2, None, diffusion_steps,
@@ -316,19 +316,19 @@ def voice_conversion(source, target, diffusion_steps, length_adjust, inference_cfg_rate, n_quantizers, f0_condition, auto_f0_adjust, pitch_shift):


 if __name__ == "__main__":
-description = ("Please upload reference audio within 25 seconds, and under 30 seconds at most."
+description = ("Please upload reference audio within 25 seconds, and under 30 seconds at most.")

 inputs = [
-gr.Audio(type="filepath", label="
+gr.Audio(type="filepath", label="Upload source audio"),
-gr.Audio(type="filepath", label="
+gr.Audio(type="filepath", label="Upload reference voice"),
-gr.Slider(minimum=1, maximum=200, value=10, step=1, label="
+gr.Slider(minimum=1, maximum=200, value=10, step=1, label="Diffusion steps", info="10 by default; 50~100 for best quality"),
-gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="
+gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Length adjust", info="<1.0 for faster speech, >1.0 for slower speech"),
-gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="
+gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.7, label="Inference CFG rate", info="has a subtle effect"),
-gr.Slider(minimum=1, maximum=3, step=1, value=3, label="
+gr.Slider(minimum=1, maximum=3, step=1, value=3, label="Number of FAcodec quantizers", info="the fewer quantizers used, the more of the source audio's prosody is preserved"),
-gr.Checkbox(label="
+gr.Checkbox(label="Use F0-conditioned model", value=False, info="Must be checked for singing voice conversion"),
-gr.Checkbox(label="
+gr.Checkbox(label="Auto F0 adjust", value=True,
-info="
+info="Roughly adjusts F0 to match the target voice. Only works when the F0-conditioned model is used"),
-gr.Slider(label='
+gr.Slider(label='Pitch shift', minimum=-24, maximum=24, step=1, value=0, info="Pitch shift in semitones; only works when the F0-conditioned model is used"),
 ]

 examples = [["examples/source/yae_0.wav", "examples/reference/dingzhen_0.wav", 25, 1.0, 0.7, 1, False, True, 0],
@@ -339,14 +339,14 @@ if __name__ == "__main__":
 "examples/reference/trump_0.wav", 50, 1.0, 0.7, 1, True, False, -12],
 ]

-outputs = [gr.Audio(label="
+outputs = [gr.Audio(label="Streaming output audio", streaming=True, format='mp3'),
-gr.Audio(label="
+gr.Audio(label="Full output audio", streaming=False, format='wav')]

 gr.Interface(fn=voice_conversion,
 description=description,
 inputs=inputs,
 outputs=outputs,
-title="Seed
+title="Seed Voice Conversion",
 examples=examples,
 cache_examples=False,
-).launch()
+).launch()
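For context on the streaming loop touched by this diff: the crossfade helper and overlap_wave_len are used to stitch consecutively generated chunks into one continuous waveform. Below is a minimal, self-contained sketch of that stitching pattern, assuming linear fade ramps and a hypothetical stitch helper of my own; the app itself yields each chunk incrementally rather than collecting them in a list.

import numpy as np

def crossfade(chunk1: np.ndarray, chunk2: np.ndarray, overlap: int) -> np.ndarray:
    # Linear ramps are an assumption here; the app defines fade_in/fade_out elsewhere.
    fade_out = np.linspace(1.0, 0.0, overlap)
    fade_in = np.linspace(0.0, 1.0, overlap)
    chunk2 = chunk2.copy()
    # Blend the tail of the previous chunk into the head of the current one.
    chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2

def stitch(chunks: list[np.ndarray], overlap: int) -> np.ndarray:
    # Hypothetical helper: keep the first chunk minus its overlapping tail, then
    # append each crossfaded successor, trimming the tail of every chunk except the last.
    out = [chunks[0][:-overlap]]
    for prev, cur in zip(chunks, chunks[1:]):
        faded = crossfade(prev, cur, overlap)
        out.append(faded if cur is chunks[-1] else faded[:-overlap])
    return np.concatenate(out)

if __name__ == "__main__":
    # Toy usage: two chunks of a sine wave sharing a 64-sample overlap
    # (cf. overlap_wave_len = overlap_frame_len * hop_length in the app).
    t = np.arange(1024) / 16000
    wave = np.sin(2 * np.pi * 220 * t)
    a, b = wave[:576], wave[512:]
    print(stitch([a, b], overlap=64).shape)  # -> (1024,)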