Spaces:

cantabile-kwok
/

vec2wav2.0-demo

Running

cantabile-kwok committed on Nov 12, 2024

Commit

eda4a4a

1 Parent(s): 17d49c8

change UI and auto convert to mono

Files changed (2) hide show

app.py CHANGED Viewed

@@ -23,7 +23,10 @@ def create_interface():
     )
     with gr.Blocks(title="Voice Conversion") as demo:
         gr.Markdown("# vec2wav 2.0 Voice Conversion Demo")
-        gr.Markdown("Upload source audio and target speaker audio to convert the voice.")
         with gr.Row():
             source_audio = gr.Audio(label="Source Audio", type="filepath")

     )
     with gr.Blocks(title="Voice Conversion") as demo:
         gr.Markdown("# vec2wav 2.0 Voice Conversion Demo")
+        gr.Markdown("Upload source audio and target speaker audio to convert the voice.\n"
+                    "Note that this space could be slow since it's running on a free CPU server. We recommend running this locally for faster results.\n"
+                    "For more information, visit the [vec2wav 2.0 GitHub repository](https://github.com/cantabile-kwok/vec2wav2.0)\n"
+                    "MPEG format is not supported. Please convert it to WAV format before uploading.")
         with gr.Row():
             source_audio = gr.Audio(label="Source Audio", type="filepath")

vec2wav2/utils/utils.py CHANGED Viewed

@@ -28,17 +28,22 @@ def read_wav_16k(audio_path):
         sr = audio_path[0]
     else:  # Regular file path
         assert os.path.exists(audio_path), f"File not found: {audio_path}"
-        wav, sr = sf.read(audio_path)
-    if sr != 16000:
-        audio_tensor = torch.tensor(wav, dtype=torch.float32)
-        resampler = transforms.Resample(orig_freq=sr, new_freq=16000)
-        wav = resampler(audio_tensor)
-        wav = wav.numpy()
     return wav
 def find_files(root_dir, query="*.wav", include_root_dir=True):
     """Find files recursively.

         sr = audio_path[0]
     else:  # Regular file path
         assert os.path.exists(audio_path), f"File not found: {audio_path}"
+        if audio_path.endswith(".wav"):
+            wav, sr = sf.read(audio_path)
+            if wav.ndim > 1:
+                wav = wav.mean(axis=-1)  # Convert to mono
+            if sr != 16000:
+                audio_tensor = torch.tensor(wav, dtype=torch.float32)
+                resampler = transforms.Resample(orig_freq=sr, new_freq=16000)
+                wav = resampler(audio_tensor)
+                wav = wav.numpy()
+        else:
+            import librosa
+            wav, sr = librosa.load(audio_path, sr=16000, mono=True)
     return wav
 def find_files(root_dir, query="*.wav", include_root_dir=True):
     """Find files recursively.