|
import contextlib
import os
import tempfile
from io import BytesIO

import huggingface_hub
import librosa
import numpy as np
import soundfile as sf
import streamlit as st
import torch
import torchaudio
import wget
from fairseq import checkpoint_utils
from TTS.api import TTS
|
|
|
class VoiceConverter:
    """Two-stage voice conversion: a VITS checkpoint followed by a Coqui TTS pass.

    Constructing an instance selects the torch device and immediately loads
    both models (downloading the VITS checkpoint when it is not cached on
    disk), so construction is slow and may touch the network.
    """

    def __init__(self):
        # Prefer the GPU whenever torch can see one.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.load_models()

    def load_models(self):
        """Populate ``self.tts`` and ``self.vits_model``.

        The VITS checkpoint is cached as ``pretrained_models/vits_female.pth``
        and is downloaded only when that file does not exist yet.

        Raises:
            TypeError: If the checkpoint does not unpickle to a
                ``torch.nn.Module`` (for example a weights-only state dict),
                since such an object cannot be put in eval mode or called.
        """
        models_dir = "pretrained_models"
        os.makedirs(models_dir, exist_ok=True)

        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)

        vits_path = os.path.join(models_dir, "vits_female.pth")
        if not os.path.exists(vits_path):
            wget.download(
                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
                vits_path,
            )

        # NOTE(security): torch.load unpickles the file, and this file comes
        # from a remote URL. Only keep this if the source is trusted.
        checkpoint = torch.load(vits_path, map_location=self.device)
        if not isinstance(checkpoint, torch.nn.Module):
            # Fail with an actionable message instead of an AttributeError on .eval().
            raise TypeError(
                f"{vits_path} contains a {type(checkpoint).__name__}, not a "
                "torch.nn.Module; a weights-only checkpoint must be loaded "
                "into its model class before it can be used here"
            )
        self.vits_model = checkpoint
        self.vits_model.eval()

    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
        """Run ``audio_path`` through the VITS model and then the TTS model.

        Args:
            audio_path: Path of the audio file to convert.
            speaker_id: Forwarded to ``self.vits_model.voice_conversion``.
            emotion: Forwarded to ``self.tts.tts_with_vc``.

        Returns:
            A ``(audio, sample_rate)`` tuple: whatever ``tts_with_vc`` returns,
            and the 22050 Hz rate the input was decoded at.

        Raises:
            ValueError: If the input decodes to zero samples.
        """
        target_sr = 22050
        # librosa resamples while decoding, so no separate resample step is needed.
        wav, sr = librosa.load(audio_path, sr=target_sr)
        if wav.size == 0:
            raise ValueError(f"No audio samples could be read from {audio_path!r}")

        # Add a leading batch axis before handing the waveform to the model.
        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)

        with torch.no_grad():
            # NOTE(review): confirm the loaded model really exposes
            # voice_conversion(waveform, speaker_id=...) with this signature.
            converted = self.vits_model.voice_conversion(
                wav_tensor,
                speaker_id=speaker_id,
            )

        # soundfile reads 2-D data as (frames, channels). Presumably the model
        # keeps the batch axis added above, so drop singleton axes; this is a
        # no-op when the output is already 1-D.
        samples = np.squeeze(converted.cpu().numpy())

        # A unique temp file replaces the shared "temp.wav" in the working
        # directory, which concurrent sessions would overwrite and nobody removed.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = tmp.name
        try:
            sf.write(wav_path, samples, sr)
            # NOTE(review): confirm against the installed TTS version that
            # tts_with_vc accepts a wav path as its first argument and an
            # `emotion` keyword; also confirm its output rate matches `sr`.
            emotional_wav = self.tts.tts_with_vc(
                wav_path,
                speaker_wav=wav_path,
                emotion=emotion,
            )
        finally:
            with contextlib.suppress(OSError):
                os.remove(wav_path)

        return emotional_wav, sr
|
|
|
def save_audio(audio_data, sr):
    """Encode ``audio_data`` as a WAV file held in memory.

    Args:
        audio_data: Samples in any form ``soundfile.write`` accepts.
        sr: Sample rate passed to ``soundfile.write``.

    Returns:
        A ``BytesIO`` containing the encoded file, rewound to position 0 so
        both ``read()`` and ``getvalue()`` yield the complete WAV data.
    """
    buffer = BytesIO()
    sf.write(buffer, audio_data, sr, format='WAV')
    # Writing moves the cursor away from the start; rewind so a consumer that
    # simply calls read() does not get an empty or truncated result.
    buffer.seek(0)
    return buffer
|
|
|
|
|
# ---- Page header -----------------------------------------------------------
st.title("AI Voice Converter - Female Voice Transformation")

# ---- Main choices ----------------------------------------------------------
# NOTE(review): model_type is collected here but is not read anywhere else in
# the visible part of this file.
_model_options = ["VITS Female", "YourTTS Female", "Mixed Model"]
model_type = st.selectbox(label="Select Voice Model", options=_model_options)

# The selected character is later translated into a speaker id.
_character_options = ["Anime Female", "Natural Female", "Young Female", "Mature Female"]
voice_character = st.selectbox(label="Select Voice Character", options=_character_options)

# The selected emotion is passed straight through to the converter.
_emotion_options = ["Happy", "Sad", "Angry", "Neutral", "Excited"]
emotion = st.selectbox(label="Select Emotion", options=_emotion_options)

# ---- Optional fine-tuning --------------------------------------------------
# NOTE(review): none of the three slider values below is read anywhere else in
# the visible part of this file.
with st.expander("Advanced Settings"):
    pitch_adjust = st.slider(label="Pitch Adjustment", min_value=-10, max_value=10, value=0)
    clarity = st.slider(label="Voice Clarity", min_value=0.0, max_value=1.0, value=0.8)
    speed = st.slider(label="Speaking Speed", min_value=0.5, max_value=2.0, value=1.0)

# ---- Input -----------------------------------------------------------------
uploaded_file = st.file_uploader(label="Upload an audio file", type=['wav', 'mp3'])
|
|
|
# Speaker id handed to the converter for each "Select Voice Character" choice.
_SPEAKER_IDS = {
    "Anime Female": 0,
    "Natural Female": 1,
    "Young Female": 2,
    "Mature Female": 3,
}


@st.cache_resource
def _get_converter():
    """Return one shared VoiceConverter instead of rebuilding it on every rerun.

    Streamlit re-executes this script on each widget interaction, and
    constructing VoiceConverter loads (and may download) its models.
    """
    return VoiceConverter()


if uploaded_file is not None:
    if st.button("Convert Voice"):
        # Keep the upload's real extension so an mp3 is not stored as ".wav".
        upload_suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
        tmp_path = None
        try:
            with st.spinner("Converting voice... This may take a few moments."):
                converter = _get_converter()

                # The converter takes a path, so spill the upload to disk; this
                # now happens only when a conversion is actually requested.
                with tempfile.NamedTemporaryFile(delete=False, suffix=upload_suffix) as tmp_file:
                    tmp_file.write(uploaded_file.getvalue())
                    tmp_path = tmp_file.name

                converted_audio, sr = converter.convert_voice(
                    tmp_path,
                    speaker_id=_SPEAKER_IDS[voice_character],
                    emotion=emotion,
                )
                audio_buffer = save_audio(converted_audio, sr)

            # Offer the result both for playback and as a download.
            st.audio(audio_buffer, format='audio/wav')
            st.download_button(
                label="Download Converted Audio",
                data=audio_buffer,
                file_name="ai_converted_voice.wav",
                mime="audio/wav",
            )
        except Exception as e:
            # Top-level UI boundary: report the failure instead of crashing the page.
            st.error(f"Error during conversion: {str(e)}")
        finally:
            # The temp file is created with delete=False, so remove it explicitly.
            if tmp_path is not None:
                with contextlib.suppress(OSError):
                    os.remove(tmp_path)
|
|
|
|
|
# Static help text rendered below the controls. The stray "|" characters that
# had leaked into the literal are removed so the markdown renders as headings
# and lists.
st.markdown("""
### Model Information:
1. **VITS Female**: Pre-trained on a large dataset of female voices
2. **YourTTS**: Multi-speaker, multi-lingual voice conversion model
3. **Mixed Model**: Combination of multiple models for better quality

### Voice Characters:
- **Anime Female**: High-pitched, animated style voice
- **Natural Female**: Realistic female voice
- **Young Female**: Young adult female voice
- **Mature Female**: Mature female voice

### Tips for Best Results:
- Use clear audio input with minimal background noise
- Short audio clips (5-30 seconds) work best
- Experiment with different emotions and voice characters
- Adjust advanced settings for fine-tuning
""")
|
|
|
|
"""
Requirements (install each package with pip):
TTS
fairseq
torch
torchaudio
streamlit
librosa
soundfile
numpy
wget
huggingface_hub
"""