sablab committed on
Commit
950142f
·
verified ·
1 Parent(s): 5cefbe6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -0
app.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from transformers import VitsModel, VitsTokenizer
4
+
5
# --- 1. Load Model and Tokenizer ---
# Both objects are created a single time when the module is imported, so
# every prediction reuses them instead of loading the weights again.
# NOTE(review): the checkpoint is named F5-TTS but is loaded through the VITS
# classes -- confirm this repository actually ships VITS-compatible weights.
MODEL_ID = "SWivid/F5-TTS"

print("Loading F5-TTS model and tokenizer...")
model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = VitsTokenizer.from_pretrained(MODEL_ID)
print("Model and tokenizer loaded successfully.")
12
+
13
+ # --- 2. Define the Speech Synthesis Function ---
14
def synthesize_speech(text: str):
    """Convert ``text`` to speech with the module-level model and tokenizer.

    Args:
        text: The text to synthesize.

    Returns:
        A ``(sampling_rate, waveform)`` tuple. ``sampling_rate`` is read from
        ``model.config.sampling_rate`` and ``waveform`` is the model output
        converted to a NumPy array with singleton dimensions squeezed out.
        This is the pair returned to the Gradio Audio output component.

    Raises:
        gr.Error: If ``text`` is ``None``, empty, or only whitespace.
    """
    # Reject blank input up front: there is nothing to synthesize, and
    # gr.Error lets Gradio show a readable message to the user rather than
    # handing empty input to the tokenizer and model.
    if text is None or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # `return_tensors="pt"` asks the tokenizer for PyTorch tensors.
    inputs = tokenizer(text, return_tensors="pt")

    # No gradients are needed for inference, so skip autograd bookkeeping.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # Move to CPU, convert to NumPy, and drop any size-1 dimensions.
    waveform_numpy = waveform.cpu().numpy().squeeze()

    sampling_rate = model.config.sampling_rate

    return (sampling_rate, waveform_numpy)
35
+
36
# --- 3. Build the Gradio Interface ---
# The input/output components and the example prompts are created as named
# values first, then handed to gr.Interface in a single call.
text_input = gr.Textbox(
    label="Text to Synthesize",
    info="Enter the text you want to convert to speech.",
    value="Hello, this is a demonstration of the F5 text to speech model.",
)

# type="numpy" matches the NumPy waveform returned by synthesize_speech.
audio_output = gr.Audio(
    label="Synthesized Audio",
    type="numpy",
)

example_prompts = [
    ["The quick brown fox jumps over the lazy dog."],
    ["To be, or not to be, that is the question."],
    ["Artificial intelligence will shape our future in profound ways."],
]

demo = gr.Interface(
    fn=synthesize_speech,
    inputs=text_input,
    outputs=audio_output,
    title="🗣️ F5-TTS Text-to-Speech",
    description="A simple Gradio app to run the `SWivid/F5-TTS` model for text-to-speech conversion. Built by Gemini.",
    examples=example_prompts,
    cache_examples=True,  # Cache example results for a faster demo
)

# --- 4. Launch the App ---
# Only start the server when this file is run as a script.
if __name__ == "__main__":
    demo.launch()