Staticaliza committed on
Commit
8615f3d
·
verified ·
1 Parent(s): 31f10c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -39,6 +39,9 @@ torch.set_grad_enabled(False)
39
  device = torch.device("cpu")
40
  print(f"[DEVICE] | Using device: {device}")
41
 
 
 
 
42
  # ----------------------------
43
  # Load Models and Configuration
44
  # ----------------------------
@@ -75,7 +78,7 @@ model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
75
 
76
  # Ensure 'CAMPPlus' is correctly imported and defined
77
  try:
78
- campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
79
  print("[INFO] | CAMPPlus model instantiated.")
80
  except NameError:
81
  print("[ERROR] | CAMPPlus is not defined. Please check the import path and ensure CAMPPlus is correctly defined.")
@@ -90,7 +93,7 @@ campplus_model.to(device)
90
  print("[INFO] | CAMPPlus model loaded, set to eval mode, and moved to CPU.")
91
 
92
  # Load BigVGAN model
93
- bigvgan_model = bigvgan.BigVGAN.from_pretrained('nvidia/bigvgan_24khz_100band', use_cuda_kernel=False)
94
  bigvgan_model.remove_weight_norm()
95
  bigvgan_model = bigvgan_model.eval().to(device)
96
  print("[INFO] | BigVGAN model loaded, weight norm removed, set to eval mode, and moved to CPU.")
@@ -118,7 +121,7 @@ mel_fn_args = {
118
  "n_fft": 1024,
119
  "win_size": 1024,
120
  "hop_size": 256,
121
- "num_mels": 80,
122
  "sampling_rate": sr,
123
  "fmin": 0,
124
  "fmax": None,
@@ -153,7 +156,7 @@ mel_fn_args_f0 = {
153
  "n_fft": config_f0['preprocess_params']['spect_params']['n_fft'],
154
  "win_size": config_f0['preprocess_params']['spect_params']['win_length'],
155
  "hop_size": config_f0['preprocess_params']['spect_params']['hop_length'],
156
- "num_mels": 80, # Ensure this matches the primary model
157
  "sampling_rate": sr_f0,
158
  "fmin": 0,
159
  "fmax": None,
@@ -273,7 +276,7 @@ def voice_conversion(input, reference, steps, guidance, pitch, speed):
273
 
274
  # Extract style features
275
  print("[INFO] | Extracting style features from reference audio.")
276
- feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=80, dither=0, sample_frequency=sampling_rate)
277
  feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
278
  style2 = campplus_model(feat2.unsqueeze(0))
279
  print(f"[INFO] | Style2 shape: {style2.shape}")
 
39
  device = torch.device("cpu")
40
  print(f"[DEVICE] | Using device: {device}")
41
 
42
+ channel_numbers = 100 # 80 by default
43
+ main_model = "nvidia/bigvgan_24khz_100band" # nvidia/bigvgan_v2_22khz_80band_256x
44
+
45
  # ----------------------------
46
  # Load Models and Configuration
47
  # ----------------------------
 
78
 
79
  # Ensure 'CAMPPlus' is correctly imported and defined
80
  try:
81
+ campplus_model = CAMPPlus(feat_dim=channel_numbers, embedding_size=192)
82
  print("[INFO] | CAMPPlus model instantiated.")
83
  except NameError:
84
  print("[ERROR] | CAMPPlus is not defined. Please check the import path and ensure CAMPPlus is correctly defined.")
 
93
  print("[INFO] | CAMPPlus model loaded, set to eval mode, and moved to CPU.")
94
 
95
  # Load BigVGAN model
96
+ bigvgan_model = bigvgan.BigVGAN.from_pretrained(main_model, use_cuda_kernel=False)
97
  bigvgan_model.remove_weight_norm()
98
  bigvgan_model = bigvgan_model.eval().to(device)
99
  print("[INFO] | BigVGAN model loaded, weight norm removed, set to eval mode, and moved to CPU.")
 
121
  "n_fft": 1024,
122
  "win_size": 1024,
123
  "hop_size": 256,
124
+ "num_mels": channel_numbers,
125
  "sampling_rate": sr,
126
  "fmin": 0,
127
  "fmax": None,
 
156
  "n_fft": config_f0['preprocess_params']['spect_params']['n_fft'],
157
  "win_size": config_f0['preprocess_params']['spect_params']['win_length'],
158
  "hop_size": config_f0['preprocess_params']['spect_params']['hop_length'],
159
+ "num_mels": channel_numbers,
160
  "sampling_rate": sr_f0,
161
  "fmin": 0,
162
  "fmax": None,
 
276
 
277
  # Extract style features
278
  print("[INFO] | Extracting style features from reference audio.")
279
+ feat2 = torchaudio.compliance.kaldi.fbank(ref_waves_16k, num_mel_bins=channel_numbers, dither=0, sample_frequency=sampling_rate)
280
  feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
281
  style2 = campplus_model(feat2.unsqueeze(0))
282
  print(f"[INFO] | Style2 shape: {style2.shape}")