Spaces:

NLPV
/

Maithli_TTS

Runtime error

App Files Files Community

NLPV committed on Mar 23

Commit

7001f7d

verified ·

1 Parent(s): c93e5ae

Create app.py

Browse files

Files changed (1) hide show

app.py +58 -0

app.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import torch
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import gradio as gr
+import numpy as np
+# Set device to GPU if available, else CPU
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+# Load the TTS model and tokenizers
+model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to(device)
+tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
+description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
+def generate_audio(prompt: str, description: str):
+    """
+    Generate synthesized speech audio based on the input prompt and description.
+    Args:
+        prompt (str): The text prompt to be spoken.
+        description (str): A description to guide the voice characteristics.
+    Returns:
+        tuple: A tuple containing the audio numpy array and the sampling rate.
+    """
+    # Tokenize inputs for the description and prompt
+    description_tokens = description_tokenizer(description, return_tensors="pt").to(device)
+    prompt_tokens = tokenizer(prompt, return_tensors="pt").to(device)
+    # Generate the audio tensor using the model
+    generation = model.generate(
+        input_ids=description_tokens.input_ids,
+        attention_mask=description_tokens.attention_mask,
+        prompt_input_ids=prompt_tokens.input_ids,
+        prompt_attention_mask=prompt_tokens.attention_mask
+    )
+    # Convert the generated tensor to a numpy array and remove extra dimensions
+    audio_arr = generation.cpu().numpy().squeeze()
+    # Retrieve the sampling rate from the model config
+    sampling_rate = model.config.sampling_rate
+    return (audio_arr, sampling_rate)
+# Build the Gradio interface
+iface = gr.Interface(
+    fn=generate_audio,
+    inputs=[
+        gr.Textbox(label="Prompt", value="अरे, तुम आज कैसे हो?"),
+        gr.Textbox(label="Description", value="Divya's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.")
+    ],
+    outputs=gr.Audio(label="Generated Audio"),
+    title="Indic Parler TTS",
+    description="Generate synthesized speech using the Indic Parler TTS model from ai4bharat."
+)
+if __name__ == "__main__":
+    iface.launch()