Artificial-superintelligence committed on
Commit 2158d6f · verified · 1 Parent(s): ce9b606

Update app.py

Files changed (1)
  app.py (+157 -160)
app.py CHANGED
@@ -1,186 +1,183 @@
  import streamlit as st
- import librosa
- import soundfile as sf
- import numpy as np
- import scipy.signal as signal
- from scipy.io import wavfile
- import pyworld as world
  import torch
  import torchaudio
  from io import BytesIO
  import tempfile

- def enhance_harmonics(y, sr):
-     # Extract harmonics using harmonic-percussive source separation
-     y_harmonic = librosa.effects.hpss(y)[0]
-
-     # Enhance the harmonics
-     y_enhanced = y_harmonic * 1.2 + y * 0.3
-     return librosa.util.normalize(y_enhanced)
-
- def modify_formants(y, sr, formant_shift_factor=1.2):
-     # Get the power spectrum
-     D = librosa.stft(y)
-     S = np.abs(D)
-
-     # Estimate formants using LPC
-     order = 12
-     a = librosa.lpc(y, order)
-
-     # Shift formants
-     new_a = np.zeros_like(a)
-     new_a[0] = a[0]
-     for i in range(1, len(a)):
-         new_a[i] = a[i] * (formant_shift_factor ** i)
-
-     # Apply modified LPC filter
-     y_formant = signal.lfilter([1], new_a, y)
-     return librosa.util.normalize(y_formant)
-
- def process_audio_advanced(audio_file, settings):
-     # Load audio
-     y, sr = librosa.load(audio_file)
-
-     # Extract F0 and spectral envelope using WORLD vocoder
-     _f0, t = librosa.piptrack(y=y, sr=sr)
-     f0 = np.mean(_f0[_f0 > 0], axis=0)
-
-     # Pitch shifting with formant preservation
-     y_shifted = librosa.effects.pitch_shift(
-         y,
-         sr=sr,
-         n_steps=settings['pitch_shift']
-     )
-
-     # Modify formants
-     y_formant = modify_formants(
-         y_shifted,
-         sr,
-         settings['formant_shift']
-     )
-
-     # Enhance harmonics
-     y_harmonic = enhance_harmonics(y_formant, sr)
-
-     # Apply vocal tract length normalization
-     y_vtln = librosa.effects.time_stretch(
-         y_harmonic,
-         rate=settings['vtln_factor']
-     )
-
-     # Smooth the output
-     y_smooth = signal.savgol_filter(y_vtln, 1001, 2)
-
-     # Final normalization
-     y_final = librosa.util.normalize(y_smooth)
-
-     return y_final, sr
-
- def create_voice_preset(preset_name):
-     presets = {
-         'Young Female': {
-             'pitch_shift': 8.0,
-             'formant_shift': 1.3,
-             'vtln_factor': 1.1,
-             'breathiness': 0.3
-         },
-         'Mature Female': {
-             'pitch_shift': 6.0,
-             'formant_shift': 1.2,
-             'vtln_factor': 1.05,
-             'breathiness': 0.2
-         },
-         'Soft Female': {
-             'pitch_shift': 7.0,
-             'formant_shift': 1.25,
-             'vtln_factor': 1.15,
-             'breathiness': 0.4
-         }
-     }
-     return presets.get(preset_name)
-
- def add_breathiness(y, sr, amount=0.3):
-     # Generate breath noise
-     noise = np.random.normal(0, 0.01, len(y))
-     noise_filtered = signal.lfilter([1], [1, -0.98], noise)
-
-     # Mix with original signal
-     y_breathy = y * (1 - amount) + noise_filtered * amount
-     return librosa.util.normalize(y_breathy)
-
- st.title("Advanced Female Voice Converter")
-
- # File uploader
  uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

  if uploaded_file is not None:
      # Save uploaded file temporarily
      with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
          tmp_file.write(uploaded_file.getvalue())
          tmp_path = tmp_file.name

-     # Voice preset selector
-     preset_name = st.selectbox(
-         "Select Voice Preset",
-         ['Young Female', 'Mature Female', 'Soft Female', 'Custom']
-     )
-
-     if preset_name == 'Custom':
-         settings = {
-             'pitch_shift': st.slider("Pitch Shift", 0.0, 12.0, 8.0, 0.5),
-             'formant_shift': st.slider("Formant Shift", 1.0, 1.5, 1.2, 0.05),
-             'vtln_factor': st.slider("Vocal Tract Length", 0.9, 1.2, 1.1, 0.05),
-             'breathiness': st.slider("Breathiness", 0.0, 1.0, 0.3, 0.1)
-         }
-     else:
-         settings = create_voice_preset(preset_name)
-
      if st.button("Convert Voice"):
-         with st.spinner("Processing audio..."):
-             try:
-                 # Process audio
-                 processed_audio, sr = process_audio_advanced(tmp_path, settings)
-
-                 # Add breathiness
-                 processed_audio = add_breathiness(
-                     processed_audio,
-                     sr,
-                     settings['breathiness']
                  )
-
-                 # Save to buffer
-                 buffer = BytesIO()
-                 sf.write(buffer, processed_audio, sr, format='WAV')
-
                  # Display audio player
-                 st.audio(buffer, format='audio/wav')
-
                  # Download button
                  st.download_button(
                      label="Download Converted Audio",
-                     data=buffer,
-                     file_name="female_voice_converted.wav",
                      mime="audio/wav"
                  )
-
-             except Exception as e:
-                 st.error(f"Error processing audio: {str(e)}")

  st.markdown("""
- ### Advanced Features:
- - Formant preservation and shifting
- - Harmonic enhancement
- - Vocal tract length normalization
- - Natural breathiness addition
- - Multiple voice presets
- - Custom parameter adjustment

  ### Tips for Best Results:
- 1. Use high-quality input audio
- 2. Start with presets and adjust if needed
- 3. For custom settings:
-    - Pitch shift: 6-8 for natural female voice
-    - Formant shift: 1.1-1.3 for feminine resonance
-    - Vocal tract length: 1.05-1.15 for realistic results
-    - Breathiness: 0.2-0.4 for natural sound
- """)
  import streamlit as st
  import torch
  import torchaudio
+ import numpy as np
+ import librosa
+ import soundfile as sf
+ from TTS.api import TTS
+ from fairseq import checkpoint_utils
+ import wget
+ import os
  from io import BytesIO
  import tempfile
+ import huggingface_hub
+
+ class VoiceConverter:
+     def __init__(self):
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+         self.load_models()
+
+     def load_models(self):
+         # Download pre-trained models if they do not exist yet
+         models_dir = "pretrained_models"
+         os.makedirs(models_dir, exist_ok=True)
+
+         # Load Coqui TTS model
+         self.tts = TTS("tts_models/multilingual/multi-dataset/your_tts", progress_bar=False)
+
+         # Load VITS model
+         vits_path = os.path.join(models_dir, "vits_female.pth")
+         if not os.path.exists(vits_path):
+             # Download VITS pre-trained checkpoint
+             wget.download(
+                 "https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/G_953000.pth",
+                 vits_path
+             )
+
+         # NOTE: this assumes the checkpoint unpickles to a full nn.Module;
+         # VITS checkpoints of this kind are often plain state dicts, which
+         # would instead need load_state_dict() on a constructed model (see
+         # the loading sketch at the end of this page).
+         self.vits_model = torch.load(vits_path, map_location=self.device)
+         self.vits_model.eval()
+
+     def convert_voice(self, audio_path, speaker_id=1, emotion="Happy"):
+         # Load audio
+         wav, sr = librosa.load(audio_path)
+
+         # Resample if needed
+         if sr != 22050:
+             wav = librosa.resample(wav, orig_sr=sr, target_sr=22050)
+             sr = 22050
+
+         # Convert to tensor
+         wav_tensor = torch.FloatTensor(wav).unsqueeze(0).to(self.device)
+
+         # Process with VITS
+         with torch.no_grad():
+             converted = self.vits_model.voice_conversion(
+                 wav_tensor,
+                 speaker_id=speaker_id
+             )
+
+         # Process with Coqui TTS for emotion; squeeze off the batch
+         # dimension so soundfile receives a 1-D array rather than a single
+         # frame with thousands of channels
+         wav_path = "temp.wav"
+         sf.write(wav_path, converted.squeeze(0).cpu().numpy(), sr)
+
+         # NOTE: in the published Coqui API, tts_with_vc() takes the text to
+         # synthesize as its first argument; passing a wav path and an
+         # `emotion` kwarg here relies on a customized TTS build.
+         emotional_wav = self.tts.tts_with_vc(
+             wav_path,
+             speaker_wav=wav_path,
+             emotion=emotion
+         )
+
+         return emotional_wav, sr
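
A usage sketch for the class above, outside the Streamlit flow. It is hedged on the same assumption as `load_models()`, namely that the downloaded checkpoint really does yield a model exposing `voice_conversion()`; "sample.wav" is a placeholder path:

```python
# Usage sketch for VoiceConverter; see the checkpoint caveat in load_models().
converter = VoiceConverter()   # first call downloads the pretrained checkpoints
wav, sr = converter.convert_voice("sample.wav", speaker_id=1, emotion="Happy")
sf.write("sample_converted.wav", np.asarray(wav), sr)
```
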
+ def save_audio(audio_data, sr):
+     buffer = BytesIO()
+     sf.write(buffer, audio_data, sr, format='WAV')
+     buffer.seek(0)  # rewind so st.audio and st.download_button read from the start
+     return buffer
+
+ # Streamlit Interface
+ st.title("AI Voice Converter - Female Voice Transformation")
+
+ # Model selection
+ model_type = st.selectbox(
+     "Select Voice Model",
+     ["VITS Female", "YourTTS Female", "Mixed Model"]
+ )
+
+ # Voice character selection
+ voice_character = st.selectbox(
+     "Select Voice Character",
+     ["Anime Female", "Natural Female", "Young Female", "Mature Female"]
+ )
+
+ # Emotion selection
+ emotion = st.selectbox(
+     "Select Emotion",
+     ["Happy", "Sad", "Angry", "Neutral", "Excited"]
+ )
+
+ # Additional parameters (collected here but not yet consumed by convert_voice)
+ with st.expander("Advanced Settings"):
+     pitch_adjust = st.slider("Pitch Adjustment", -10, 10, 0)
+     clarity = st.slider("Voice Clarity", 0.0, 1.0, 0.8)
+     speed = st.slider("Speaking Speed", 0.5, 2.0, 1.0)
+
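
`pitch_adjust`, `clarity`, and `speed` are collected above but not consumed by `convert_voice()`. A sketch of one way to wire pitch and speed into the output as a post-process; the wiring itself is an assumption of this sketch, not something the commit does:

```python
import librosa

def apply_advanced_settings(wav, sr, pitch_adjust=0, speed=1.0):
    # Sketch: apply the advanced sliders to the converted waveform.
    # "clarity" has no direct librosa analogue and is left out here.
    wav = librosa.effects.pitch_shift(wav, sr=sr, n_steps=pitch_adjust)
    wav = librosa.effects.time_stretch(wav, rate=speed)
    return wav
```
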
+ # File upload
  uploaded_file = st.file_uploader("Upload an audio file", type=['wav', 'mp3'])

  if uploaded_file is not None:
+     # Initialize converter
+     converter = VoiceConverter()
+
      # Save uploaded file temporarily
      with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
          tmp_file.write(uploaded_file.getvalue())
          tmp_path = tmp_file.name

      if st.button("Convert Voice"):
+         try:
+             with st.spinner("Converting voice... This may take a few moments."):
+                 # Get speaker ID based on voice character
+                 speaker_id = {
+                     "Anime Female": 0,
+                     "Natural Female": 1,
+                     "Young Female": 2,
+                     "Mature Female": 3
+                 }[voice_character]
+
+                 # Convert voice
+                 converted_audio, sr = converter.convert_voice(
+                     tmp_path,
+                     speaker_id=speaker_id,
+                     emotion=emotion
                  )
+
+                 # Create audio buffer
+                 audio_buffer = save_audio(converted_audio, sr)
+
                  # Display audio player
+                 st.audio(audio_buffer, format='audio/wav')
+
                  # Download button
                  st.download_button(
                      label="Download Converted Audio",
+                     data=audio_buffer,
+                     file_name="ai_converted_voice.wav",
                      mime="audio/wav"
                  )

+         except Exception as e:
+             st.error(f"Error during conversion: {str(e)}")
+
+ # Add information about the models
  st.markdown("""
+ ### Model Information:
+ 1. **VITS Female**: Pre-trained on a large dataset of female voices
+ 2. **YourTTS**: Multi-speaker, multilingual voice conversion model
+ 3. **Mixed Model**: Combination of multiple models for better quality
+
+ ### Voice Characters:
+ - **Anime Female**: High-pitched, animated-style voice
+ - **Natural Female**: Realistic female voice
+ - **Young Female**: Young adult female voice
+ - **Mature Female**: Mature female voice

  ### Tips for Best Results:
+ - Use clear audio input with minimal background noise
+ - Short audio clips (5-30 seconds) work best
+ - Experiment with different emotions and voice characters
+ - Adjust the advanced settings for fine-tuning
+ """)
+
+ # Requirements
+ """
+ Install the dependencies with pip:
+ TTS
+ fairseq
+ torch
+ torchaudio
+ streamlit
+ librosa
+ soundfile
+ numpy
+ wget
+ huggingface_hub
+ """