Spaces:

Artificial-superintelligence
/

Algorithmvoice

Running

App Files Files Community

Artificial-superintelligence committed on Oct 17, 2024

Commit

2158d6f

verified ·

1 Parent(s): ce9b606

Update app.py

Browse files

Files changed (1) hide show

app.py +157 -160

app.py CHANGED Viewed

@@ -1,186 +1,183 @@
 import streamlit as st
-import librosa
-import soundfile as sf
-import numpy as np
-import scipy.signal as signal
-from scipy.io import wavfile
-import pyworld as world
 import torch
 import torchaudio
 from io import BytesIO
 import tempfile
-def enhance_harmonics(y, sr):
-    # Extract harmonics using harmonic-percussive source separation
-    y_harmonic = librosa.effects.hpss(y)[0]
-    # Enhance the harmonics
-    y_enhanced = y_harmonic * 1.2 + y * 0.3
-    return librosa.util.normalize(y_enhanced)
-def modify_formants(y, sr, formant_shift_factor=1.2):
-    # Get the power spectrum
-    D = librosa.stft(y)
-    S = np.abs(D)
-    # Estimate formants using LPC
-    order = 12
-    a = librosa.lpc(y, order)
-    # Shift formants
-    new_a = np.zeros_like(a)
-    new_a[0] = a[0]
-    for i in range(1, len(a)):
-        new_a[i] = a[i] * (formant_shift_factor ** i)
-    # Apply modified LPC filter
-    y_formant = signal.lfilter([1], new_a, y)
-    return librosa.util.normalize(y_formant)
-def process_audio_advanced(audio_file, settings):
-    # Load audio
-    y, sr = librosa.load(audio_file)
-    # Extract F0 and spectral envelope using WORLD vocoder
-    _f0, t = librosa.piptrack(y=y, sr=sr)
-    f0 = np.mean(_f0[_f0 > 0], axis=0)
-    # Pitch shifting with formant preservation
-    y_shifted = librosa.effects.pitch_shift(
-        y,
-        sr=sr,
-        n_steps=settings['pitch_shift']
-    )
-    # Modify formants
-    y_formant = modify_formants(
-        y_shifted,
-        sr,
-        settings['formant_shift']
-    )
-    # Enhance harmonics
-    y_harmonic = enhance_harmonics(y_formant, sr)
-    # Apply vocal tract length normalization
-    y_vtln = librosa.effects.time_stretch(
-        y_harmonic,
-        rate=settings['vtln_factor']
-    )
-    # Smooth the output
-    y_smooth = signal.savgol_filter(y_vtln, 1001, 2)
-    # Final normalization
-    y_final = librosa.util.normalize(y_smooth)
-    return y_final, sr
-def create_voice_preset(preset_name):
-    presets = {
-        'Young Female': {
-            'pitch_shift': 8.0,
-            'formant_shift': 1.3,
-            'vtln_factor': 1.1,
-            'breathiness': 0.3
-        },
-        'Mature Female': {
-            'pitch_shift': 6.0,
-            'formant_shift': 1.2,
-            'vtln_factor': 1.05,
-            'breathiness': 0.2
-        },
-        'Soft Female': {
-            'pitch_shift': 7.0,
-            'formant_shift': 1.25,
-            'vtln_factor': 1.15,
-            'breathiness': 0.4
-        }
-    }
-    return presets.get(preset_name)
-def add_breathiness(y, sr, amount=0.3):
-    # Generate breath noise
-    noise = np.random.normal(0, 0.01, len(y))
-    noise_filtered = signal.lfilter([1], [1, -0.98], noise)
-    # Mix with original signal
-    y_breathy = y * (1 - amount) + noise_filtered * amount
-    return librosa.util.normalize(y_breathy)
-st.title("Advanced Female Voice Converter")
-# File uploader
 uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
 if uploaded_file is not None:
     # Save uploaded file temporarily
     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
         tmp_file.write(uploaded_file.getvalue())
         tmp_path = tmp_file.name
-    # Voice preset selector
-    preset_name = st.selectbox(
-        "Select Voice Preset",
-        ['Young Female', 'Mature Female', 'Soft Female', 'Custom']
-    )
-    if preset_name == 'Custom':
-        settings = {
-            'pitch_shift': st.slider("Pitch Shift", 0.0, 12.0, 8.0, 0.5),
-            'formant_shift': st.slider("Formant Shift", 1.0, 1.5, 1.2, 0.05),
-            'vtln_factor': st.slider("Vocal Tract Length", 0.9, 1.2, 1.1, 0.05),
-            'breathiness': st.slider("Breathiness", 0.0, 1.0, 0.3, 0.1)
-        }
-    else:
-        settings = create_voice_preset(preset_name)
     if st.button("Convert Voice"):
-        with st.spinner("Processing audio..."):
-            try:
-                # Process audio
-                processed_audio, sr = process_audio_advanced(tmp_path, settings)
-                # Add breathiness
-                processed_audio = add_breathiness(
-                    processed_audio,
-                    sr,
-                    settings['breathiness']
                 )
-                # Save to buffer
-                buffer = BytesIO()
-                sf.write(buffer, processed_audio, sr, format='WAV')
                 # Display audio player
-                st.audio(buffer, format='audio/wav')
                 # Download button
                 st.download_button(
                     label="Download Converted Audio",
-                    data=buffer,
-                    file_name="female_voice_converted.wav",
                     mime="audio/wav"
                 )
-            except Exception as e:
-                st.error(f"Error processing audio: {str(e)}")
 st.markdown("""
-### Advanced Features:
-- Formant preservation and shifting
-- Harmonic enhancement
-- Vocal tract length normalization
-- Natural breathiness addition
-- Multiple voice presets
-- Custom parameter adjustment
 ### Tips for Best Results:
-1. Use high-quality input audio
-2. Start with presets and adjust if needed
-3. For custom settings:
-   - Pitch shift: 6-8 for natural female voice
-   - Formant shift: 1.1-1.3 for feminine resonance
-   - Vocal tract length: 1.05-1.15 for realistic results
-   - Breathiness: 0.2-0.4 for natural sound
-""")

 import streamlit as st
 import torch
 import torchaudio
+import numpy as np
+import librosa
+import soundfile as sf
+from TTS.api import TTS
+from fairseq import checkpoint_utils
+import wget
+import os
 from io import BytesIO
 import tempfile
+import huggingface_hub
+class VoiceConverter:
+    def __init__(self):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.load_models()
+    def load_models(self):
+        # Download pre-trained models if not exists
+        models_dir = "pretrained_models"
+        os.makedirs(models_dir, exist_ok=True)
+        # Load Coqui TTS model
+        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
+        # Load VITS model
+        vits_path = os.path.join(models_dir, "vits_female.pth")
+        if not os.path.exists(vits_path):
+            # Download VITS pre-trained model
+            wget.download(
+                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
+                vits_path
+            )
+        self.vits_model = torch.load(vits_path, map_location=self.device)
+        self.vits_model.eval()
+    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
+        # Load audio
+        wav, sr = librosa.load(audio_path)
+        # Resample if needed
+        if sr != 22050:
+            wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
+            sr = 22050
+        # Convert to tensor
+        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)
+        # Process with VITS
+        with torch.no_grad():
+            converted = self.vits_model.voice_conversion(
+                wav_tensor,
+                speaker_id=speaker_id
+            )
+        # Process with Coqui TTS for emotion
+        wav_path = "temp.wav"
+        sf.write(wav_path, converted.cpu().numpy(), sr)
+        emotional_wav = self.tts.tts_with_vc(
+            wav_path,
+            speaker_wav=wav_path,
+            emotion=emotion
+        )
+        return emotional_wav, sr
+def save_audio(audio_data, sr):
+    buffer = BytesIO()
+    sf.write(buffer, audio_data, sr, format='WAV')
+    return buffer
+# Streamlit Interface
+st.title("AI Voice Converter - Female Voice Transformation")
+# Model selection
+model_type = st.selectbox(
+    "Select Voice Model",
+    ["VITS Female", "YourTTS Female", "Mixed Model"]
+)
+# Voice character selection
+voice_character = st.selectbox(
+    "Select Voice Character",
+    ["Anime Female", "Natural Female", "Young Female", "Mature Female"]
+)
+# Emotion selection
+emotion = st.selectbox(
+    "Select Emotion",
+    ["Happy", "Sad", "Angry", "Neutral", "Excited"]
+)
+# Additional parameters
+with st.expander("Advanced Settings"):
+    pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
+    clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
+    speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)
+# File upload
 uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])
 if uploaded_file is not None:
+    # Initialize converter
+    converter = VoiceConverter()
     # Save uploaded file temporarily
     with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
         tmp_file.write(uploaded_file.getvalue())
         tmp_path = tmp_file.name
     if st.button("Convert Voice"):
+        try:
+            with st.spinner("Converting voice... This may take a few moments."):
+                # Get speaker ID based on voice character
+                speaker_id = {
+                    "Anime Female": 0,
+                    "Natural Female": 1,
+                    "Young Female": 2,
+                    "Mature Female": 3
+                }[voice_character]
+                # Convert voice
+                converted_audio, sr = converter.convert_voice(
+                    tmp_path,
+                    speaker_id=speaker_id,
+                    emotion=emotion
                 )
+                # Create audio buffer
+                audio_buffer = save_audio(converted_audio, sr)
                 # Display audio player
+                st.audio(audio_buffer, format='audio/wav')
                 # Download button
                 st.download_button(
                     label="Download Converted Audio",
+                    data=audio_buffer,
+                    file_name="ai_converted_voice.wav",
                     mime="audio/wav"
                 )
+        except Exception as e:
+            st.error(f"Error during conversion: {str(e)}")
+# Add information about the models
 st.markdown("""
+### Model Information:
+1. **VITS Female**: Pre-trained on a large dataset of female voices
+2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
+3. **Mixed Model**: Combination of multiple models for better quality
+### Voice Characters:
+- **Anime Female**: High-pitched, animated style voice
+- **Natural Female**: Realistic female voice
+- **Young Female**: Young adult female voice
+- **Mature Female**: Mature female voice
 ### Tips for Best Results:
+- Use clear audio input with minimal background noise
+- Short audio clips (5-30 seconds) work best
+- Experiment with different emotions and voice characters
+- Adjust advanced settings for fine-tuning
+""")
+# Requirements
+"""
+pip install requirements:
+TTS
+fairseq
+torch
+torchaudio
+streamlit
+librosa
+soundfile
+numpy
+wget
+huggingface_hub
+"""