NLPV committed on
Commit
7001f7d
·
verified ·
1 Parent(s): c93e5ae

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -0
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from parler_tts import ParlerTTSForConditionalGeneration
3
+ from transformers import AutoTokenizer
4
+ import gradio as gr
5
+ import numpy as np
6
+
7
# Run on the first CUDA device when one is visible; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# The speech model and the prompt tokenizer are loaded from the same checkpoint id.
model = ParlerTTSForConditionalGeneration.from_pretrained(
    "ai4bharat/indic-parler-tts"
).to(device)
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")

# Voice descriptions use a separate tokenizer, looked up from the text encoder
# path recorded in the model config.
description_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)
14
+
15
def generate_audio(prompt: str, description: str):
    """
    Generate synthesized speech audio based on the input prompt and description.

    This function is used as the Gradio callback feeding a ``gr.Audio`` output,
    so the result is ordered the way that component expects.

    Args:
        prompt (str): The text prompt to be spoken.
        description (str): A description to guide the voice characteristics.

    Returns:
        tuple: ``(sampling_rate, audio_array)`` -- the sampling rate read from
        the model config, followed by the generated audio as a numpy array.
    """
    # Tokenize both inputs and move the encodings to the model's device.
    description_tokens = description_tokenizer(description, return_tensors="pt").to(device)
    prompt_tokens = tokenizer(prompt, return_tensors="pt").to(device)

    # The description conditions the voice; the prompt is the text to speak.
    generation = model.generate(
        input_ids=description_tokens.input_ids,
        attention_mask=description_tokens.attention_mask,
        prompt_input_ids=prompt_tokens.input_ids,
        prompt_attention_mask=prompt_tokens.attention_mask,
    )

    # Bring the result back to host memory and drop size-1 dimensions.
    audio_arr = generation.cpu().numpy().squeeze()

    # Retrieve the sampling rate from the model config.
    sampling_rate = model.config.sampling_rate

    # gr.Audio takes (sample_rate, data); returning (data, sample_rate) makes
    # Gradio treat the array as the rate and fail when rendering the output.
    return (sampling_rate, audio_arr)
44
+
45
# Build the Gradio interface: two text inputs (what to say, and how the voice
# should sound) wired to generate_audio, with a single audio output.
iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        # Default prompt is Hindi in Devanagari script (roughly "Hey, how are
        # you today?"). Keep this file saved as UTF-8 so the text stays intact.
        gr.Textbox(label="Prompt", value="अरे, तुम आज कैसे हो?"),
        gr.Textbox(label="Description", value="Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise."),
    ],
    outputs=gr.Audio(label="Generated Audio"),
    title="Indic Parler TTS",
    description="Generate synthesized speech using the Indic Parler TTS model from ai4bharat.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()