File size: 4,638 Bytes
a24e4a5
 
 
 
 
 
 
 
 
3a938ac
dd83b45
a24e4a5
 
 
 
 
 
 
 
 
 
3a938ac
 
 
a24e4a5
 
3a938ac
 
 
 
 
 
a24e4a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a938ac
a24e4a5
3a938ac
a24e4a5
3a938ac
a24e4a5
 
 
 
 
 
 
 
 
 
 
3a938ac
 
 
a24e4a5
 
 
3a938ac
 
 
 
a24e4a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import gradio as gr
import numpy as np
import torch
import os
import time
from scipy.io import wavfile

# Explicitly import Bark components
from bark import generate_audio, SAMPLE_RATE
from bark.generation import preload_models

class VoiceCloningApp:
    """Backend for the Gradio UI: stores reference clips and runs Bark TTS.

    All files produced by the app are written to a ``working_files``
    directory located next to this source file.

    NOTE(review): the saved reference clip is not fed into generation --
    ``generate_speech`` always calls Bark with ``history_prompt=None``.
    """

    def __init__(self):
        """Create the working directory and preload the Bark models.

        Raises:
            RuntimeError: If the Bark models cannot be loaded. The original
                exception is chained as ``__cause__``.
        """
        # Create working directory
        self.base_dir = os.path.dirname(os.path.abspath(__file__))
        self.working_dir = os.path.join(self.base_dir, "working_files")
        os.makedirs(self.working_dir, exist_ok=True)

        # Explicit model loading with error handling
        try:
            print("Attempting to load Bark models...")
            preload_models()
            print("Bark models loaded successfully.")
        except Exception as e:
            print(f"Error loading Bark models: {e}")
            # Log the full error for debugging
            import traceback
            traceback.print_exc()

            # Re-raise with a clearer message, keeping the original cause.
            raise RuntimeError(f"Could not load Bark models. Error: {e}") from e

    def process_reference_audio(self, audio_data):
        """Peak-normalize a reference clip and save it as a WAV file.

        Args:
            audio_data: ``None`` or a ``(sample_rate, samples)`` pair, the
                shape delivered by the ``gr.Audio(type="numpy")`` input this
                method is bound to.

        Returns:
            A human-readable status string. Failures are reported in the
            returned text rather than raised.
        """
        if audio_data is None:
            return "Please provide an audio input"

        try:
            # Unpack audio data
            sample_rate, audio_array = audio_data

            # Work in floating point: np.abs on int16 wraps -32768 back to
            # -32768, which would underestimate the peak.
            samples = np.asarray(audio_array, dtype=np.float64)
            if samples.size == 0:
                return "Error processing audio: the recording is empty"

            # Normalize to a peak of 1.0. A silent (all-zero) or non-finite
            # clip is left as is instead of being turned into NaNs by 0/0.
            peak = float(np.max(np.abs(samples)))
            if np.isfinite(peak) and peak > 0.0:
                samples = samples / peak

            # Save reference audio as 32-bit float, which is far more widely
            # readable than the 64-bit float WAV a float64 array produces.
            filename = f"reference_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, sample_rate, samples.astype(np.float32))

            return "✅ Audio captured successfully!"

        except Exception as e:
            return f"Error processing audio: {str(e)}"

    def generate_speech(self, text):
        """Generate speech for ``text`` with Bark and save it as a WAV file.

        Args:
            text: The text to speak.

        Returns:
            A ``(filepath, error_message)`` pair. On success ``filepath`` is
            the path of the written WAV and ``error_message`` is ``None``;
            on failure (including empty or whitespace-only text)
            ``filepath`` is ``None`` and ``error_message`` says why.
        """
        if not text or not text.strip():
            return None, "Please enter some text to speak"

        try:
            print(f"Generating speech for text: {text}")

            # No history prompt: Bark uses its default voice.
            audio_array = generate_audio(
                text,
                history_prompt=None,
                temp=0.7
            )

            # Save generated audio
            filename = f"generated_speech_{int(time.time())}.wav"
            filepath = os.path.join(self.working_dir, filename)
            wavfile.write(filepath, SAMPLE_RATE, audio_array)

            return filepath, None

        except Exception as e:
            print(f"Speech generation error: {e}")
            # Log the full error for debugging
            import traceback
            traceback.print_exc()
            return None, f"Error generating speech: {str(e)}"

def create_interface():
    """Build the Gradio Blocks UI and wire it to a ``VoiceCloningApp``.

    Constructing ``VoiceCloningApp`` creates the ``working_files`` directory
    and preloads the Bark models, so this call may take a while and raises
    ``RuntimeError`` if the models cannot be loaded.

    Returns:
        The assembled ``gr.Blocks`` interface; it is not launched here.
    """
    # The app constructor creates the working directory itself, so no
    # separate makedirs call is needed here.
    app = VoiceCloningApp()

    # Use the most basic Gradio theme to avoid font issues
    with gr.Blocks() as interface:
        gr.Markdown("# 🎙️ Voice Cloning App")

        with gr.Row():
            # Left column: record or upload a reference clip.
            with gr.Column():
                gr.Markdown("## 1. Capture Reference Voice")
                reference_audio = gr.Audio(sources=["microphone", "upload"], type="numpy")
                process_btn = gr.Button("Process Reference Voice")
                process_output = gr.Textbox(label="Processing Result")

            # Right column: enter text and play back the generated speech.
            with gr.Column():
                gr.Markdown("## 2. Generate Speech")
                text_input = gr.Textbox(label="Enter Text to Speak")
                generate_btn = gr.Button("Generate Speech")
                audio_output = gr.Audio(label="Generated Speech")
                error_output = gr.Textbox(label="Errors", visible=True)

        # Bind functions
        process_btn.click(
            fn=app.process_reference_audio,
            inputs=reference_audio,
            outputs=process_output
        )

        # generate_speech returns (filepath, error), matching the two outputs.
        generate_btn.click(
            fn=app.generate_speech,
            inputs=text_input,
            outputs=[audio_output, error_output]
        )

    return interface

if __name__ == "__main__":
    # Keyword arguments handed unchanged to the interface's launch() call.
    launch_options = {
        "share": False,
        "debug": True,
        "show_error": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }

    interface = create_interface()
    interface.launch(**launch_options)