Spaces:

Artificial-superintelligence
/

Algorithmvoice

Running

App Files Files Community

Algorithmvoice / app.py

Artificial-superintelligence

Update app.py

2158d6f verified 9 months ago

raw

history blame

5.36 kB

	import streamlit as st
	import torch
	import torchaudio
	import numpy as np
	import librosa
	import soundfile as sf
	from TTS.api import TTS
	from fairseq import checkpoint_utils
	import wget
	import os
	from io import BytesIO
	import tempfile
	import huggingface_hub

	class VoiceConverter:
	    """Two-stage voice converter: a VITS checkpoint followed by Coqui YourTTS.

	    Attributes:
	        device: ``"cuda"`` when torch reports an available GPU, else ``"cpu"``.
	        tts: Coqui ``TTS`` instance for the YourTTS multilingual model.
	        vits_model: Object returned by ``torch.load`` for the VITS checkpoint.
	    """

	    # Sample rate (Hz) every input is loaded at and every output is reported
	    # with. librosa.load resamples to 22050 Hz by default; naming the rate
	    # makes that explicit instead of relying on the library default.
	    TARGET_SAMPLE_RATE = 22050

	    def __init__(self):
	        """Pick the compute device and load both models."""
	        self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        self.load_models()

	    def load_models(self):
	        """Load the YourTTS model and the VITS checkpoint.

	        The checkpoint is cached as ``pretrained_models/vits_female.pth``
	        (relative to the working directory) and downloaded only when that
	        file does not exist yet.
	        """
	        models_dir = "pretrained_models"
	        os.makedirs(models_dir, exist_ok=True)

	        # Coqui TTS model
	        self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)

	        # VITS checkpoint: fetch once, then reuse the local copy.
	        vits_path = os.path.join(models_dir, "vits_female.pth")
	        if not os.path.exists(vits_path):
	            wget.download(
	                "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
	                vits_path,
	            )

	        # SECURITY NOTE(review): torch.load unpickles the downloaded file, so
	        # the remote source must be trusted.
	        self.vits_model = torch.load(vits_path, map_location=self.device)
	        # NOTE(review): .eval() only exists if the file stores a full module
	        # object; if it stores a checkpoint dict this raises AttributeError.
	        # Confirm what the checkpoint actually contains.
	        self.vits_model.eval()

	    def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
	        """Convert the audio at *audio_path* and return ``(waveform, sample_rate)``.

	        Args:
	            audio_path: Path of the input audio file, loaded with librosa.
	            speaker_id: Speaker index forwarded to the VITS model's
	                ``voice_conversion`` call.
	            emotion: Value forwarded as the ``emotion`` keyword of
	                ``tts_with_vc``.

	        Returns:
	            Tuple of the value returned by ``tts_with_vc`` and the sample
	            rate, which is always ``TARGET_SAMPLE_RATE``.
	        """
	        sr = self.TARGET_SAMPLE_RATE
	        # Passing sr explicitly makes librosa resample on load, so no separate
	        # resampling step is needed afterwards.
	        wav, _ = librosa.load(audio_path, sr=sr)

	        # Add a leading axis before handing the samples to the model.
	        wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)

	        with torch.no_grad():
	            converted = self.vits_model.voice_conversion(
	                wav_tensor,
	                speaker_id=speaker_id,
	            )

	        # Use a unique temporary file instead of a fixed "temp.wav" in the
	        # working directory: concurrent conversions no longer overwrite each
	        # other's intermediate audio, and the file is removed afterwards.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
	            wav_path = tmp_file.name
	        try:
	            sf.write(wav_path, converted.cpu().numpy(), sr)
	            # NOTE(review): the first positional argument of tts_with_vc is
	            # presumably the text to synthesize, and the method may not accept
	            # an ``emotion`` keyword - confirm against the installed TTS version.
	            emotional_wav = self.tts.tts_with_vc(
	                wav_path,
	                speaker_wav=wav_path,
	                emotion=emotion,
	            )
	        finally:
	            # Best-effort cleanup; a failed delete must not mask the result
	            # or the original error.
	            try:
	                os.remove(wav_path)
	            except OSError:
	                pass

	        return emotional_wav, sr

	def save_audio(audio_data, sr):
	    """Encode *audio_data* as WAV into an in-memory buffer.

	    Args:
	        audio_data: Samples to encode, in a form accepted by ``soundfile.write``.
	        sr: Sample rate in Hz written into the WAV header.

	    Returns:
	        A ``BytesIO`` holding the WAV data, rewound to the start so that
	        callers using ``read()`` get the whole file rather than nothing.
	    """
	    buffer = BytesIO()
	    sf.write(buffer, audio_data, sr, format='WAV')
	    # sf.write leaves the position at end-of-stream; rewind for readers.
	    buffer.seek(0)
	    return buffer

	# ---- Streamlit interface: page title and conversion options ----
	st.title("AI Voice Converter - Female Voice Transformation")

	# NOTE(review): model_type, pitch_adjust, clarity and speed are collected
	# here but are not referenced anywhere else in the visible part of this
	# file; only voice_character, emotion and uploaded_file are used later.

	# Which model family the user wants.
	model_type = st.selectbox(
	    label="Select Voice Model",
	    options=["VITS Female", "YourTTS Female", "Mixed Model"],
	)

	# Target voice character (mapped to a speaker id at conversion time).
	voice_character = st.selectbox(
	    label="Select Voice Character",
	    options=["Anime Female", "Natural Female", "Young Female", "Mature Female"],
	)

	# Emotion passed to the converter.
	emotion = st.selectbox(
	    label="Select Emotion",
	    options=["Happy", "Sad", "Angry", "Neutral", "Excited"],
	)

	# Fine-tuning sliders, tucked into a collapsible section.
	with st.expander("Advanced Settings"):
	    pitch_adjust = st.slider("Pitch Adjustment", min_value=-10, max_value=10, value=0)
	    clarity = st.slider("Voice Clarity", min_value=0.0, max_value=1.0, value=0.8)
	    speed = st.slider("Speaking Speed", min_value=0.5, max_value=2.0, value=1.0)

	# Input audio supplied by the user.
	uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])

	@st.cache_resource
	def _get_converter():
	    """Build the VoiceConverter once and reuse it across script reruns.

	    Streamlit re-executes this script on every widget interaction; without
	    caching, the models would be reloaded on each rerun.
	    """
	    return VoiceConverter()


	if uploaded_file is not None:
	    if st.button("Convert Voice"):
	        # Keep the upload's real extension (the uploader accepts both WAV
	        # and MP3) instead of always naming the temp file ".wav".
	        suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
	        tmp_path = None
	        try:
	            with st.spinner("Converting voice... This may take a few moments."):
	                # Model loading happens inside the try block so that a
	                # loading failure is reported through st.error as well.
	                converter = _get_converter()

	                # Write the upload to disk only when a conversion is
	                # requested, so plain reruns do not create temp files.
	                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
	                    tmp_file.write(uploaded_file.getvalue())
	                    tmp_path = tmp_file.name

	                # Map the selected voice character to its speaker id.
	                speaker_id = {
	                    "Anime Female": 0,
	                    "Natural Female": 1,
	                    "Young Female": 2,
	                    "Mature Female": 3,
	                }[voice_character]

	                converted_audio, sr = converter.convert_voice(
	                    tmp_path,
	                    speaker_id=speaker_id,
	                    emotion=emotion,
	                )

	                # Encode the result as WAV in memory.
	                audio_buffer = save_audio(converted_audio, sr)

	            # Playback widget.
	            st.audio(audio_buffer, format='audio/wav')

	            # Download widget.
	            st.download_button(
	                label="Download Converted Audio",
	                data=audio_buffer,
	                file_name="ai_converted_voice.wav",
	                mime="audio/wav",
	            )

	        except Exception as e:
	            # Top-level UI boundary: show the failure instead of crashing.
	            st.error(f"Error during conversion: {e}")
	        finally:
	            # Best-effort removal of the temporary copy of the upload.
	            if tmp_path is not None:
	                try:
	                    os.remove(tmp_path)
	                except OSError:
	                    pass

	# Static help text rendered with st.markdown: describes the selectable
	# models and voice characters and lists usage tips.
	st.markdown("""
	### Model Information:
	1. VITS Female: Pre-trained on a large dataset of female voices
	2. YourTTS: Multi-speaker, multi-lingual voice conversion model
	3. Mixed Model: Combination of multiple models for better quality

	### Voice Characters:
	- Anime Female: High-pitched, animated style voice
	- Natural Female: Realistic female voice
	- Young Female: Young adult female voice
	- Mature Female: Mature female voice

	### Tips for Best Results:
	- Use clear audio input with minimal background noise
	- Short audio clips (5-30 seconds) work best
	- Experiment with different emotions and voice characters
	- Adjust advanced settings for fine-tuning
	""")

	# Requirements (install with pip):
	#   TTS
	#   fairseq
	#   torch
	#   torchaudio
	#   streamlit
	#   librosa
	#   soundfile
	#   numpy
	#   wget
	#   huggingface_hub
	#
	# NOTE: this list is kept as comments rather than a bare triple-quoted
	# string, because Streamlit "magic" writes any standalone literal
	# expression in the script to the app page.