Spaces:
Running
Running
File size: 4,497 Bytes
8aa8920 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 |
import base64
import io
import os
import tempfile
import wave
from pathlib import Path

import gradio as gr
import numpy as np
import requests
from dotenv import load_dotenv
# Load environment variables from the .env file up front so that values read
# later through os.getenv (KRUTRIM_API_KEY in generate_audio) can be supplied there.
load_dotenv()
# Upper bound on the number of characters accepted in a single request.
MAX_TEXT_LENGTH = 4_000

# Supported language codes, each paired with its human-readable display name.
LANGUAGES = dict(
    en="English",
    hi="Hindi",
    bn="Bengali",
    ta="Tamil",
    te="Telugu",
    ml="Malayalam",
    mr="Marathi",
    gu="Gujarati",
    kn="Kannada",
)
# (connect, read) timeouts in seconds for the TTS request; without a timeout
# a stalled connection would block the handler forever.
_REQUEST_TIMEOUT = (10, 120)

# Sample rate assumed only when the payload has no WAV header to read it from.
_FALLBACK_SAMPLE_RATE = 16000

# Little-endian NumPy dtype for each PCM sample width (in bytes) we can decode.
_PCM_DTYPES = {1: "u1", 2: "<i2", 4: "<i4"}


def _decode_audio(audio_data):
    """Turn raw audio bytes into the ``(sample_rate, samples)`` tuple Gradio plays.

    WAV data (recognised by its ``RIFF`` magic) is parsed with the ``wave``
    module so that the header is not played as samples and the real sample
    rate and channel count are used. Anything else is interpreted as
    headerless 16-bit PCM at ``_FALLBACK_SAMPLE_RATE``.

    Raises:
        wave.Error, EOFError: the WAV container is malformed or not plain PCM.
        ValueError: the sample width is unsupported or the byte count does
            not match the sample layout.
    """
    if audio_data[:4] != b"RIFF":
        return _FALLBACK_SAMPLE_RATE, np.frombuffer(audio_data, dtype="<i2")

    with wave.open(io.BytesIO(audio_data), "rb") as wav_file:
        sample_rate = wav_file.getframerate()
        channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        frames = wav_file.readframes(wav_file.getnframes())

    dtype = _PCM_DTYPES.get(sample_width)
    if dtype is None:
        raise ValueError(f"unsupported sample width of {sample_width} bytes")

    samples = np.frombuffer(frames, dtype=dtype)
    if channels > 1:
        # Frames are interleaved; expose them as (samples, channels).
        samples = samples.reshape(-1, channels)
    return sample_rate, samples


def _save_audio_file(audio_data):
    """Write *audio_data* to a uniquely named temporary ``.wav`` file and return its path."""
    # A unique name per call keeps concurrent requests from overwriting each
    # other's audio, which a single fixed file name would allow.
    with tempfile.NamedTemporaryFile(
        prefix="krutrim_output_", suffix=".wav", delete=False
    ) as audio_file:
        audio_file.write(audio_data)
    return audio_file.name


def generate_audio(input_text, input_language, input_speaker):
    """Generate audio from text using the Krutrim TTS API.

    Args:
        input_text: Text to synthesise; must be non-empty and at most
            ``MAX_TEXT_LENGTH`` characters.
        input_language: Language code sent to the API as ``input_language``.
        input_speaker: Speaker name sent to the API as ``input_speaker``.

    Returns:
        A ``(audio, status_message)`` pair. ``audio`` is a
        ``(sample_rate, numpy_array)`` tuple on success, the path of a
        temporary ``.wav`` file when the bytes could not be decoded
        in-process, or ``None`` on any error. Errors are reported through
        ``status_message`` rather than raised.
    """
    if not input_text or not input_text.strip():
        return None, "Error: Please enter some text to convert to speech."

    # Check if text exceeds the character limit
    if len(input_text) > MAX_TEXT_LENGTH:
        return None, f"Error: Input text exceeds the maximum limit of {MAX_TEXT_LENGTH} characters. Your text has {len(input_text)} characters."

    # Get API key from environment variable
    api_key = os.getenv("KRUTRIM_API_KEY")
    if not api_key:
        return None, "Error: KRUTRIM_API_KEY environment variable not found. Please check your .env file."

    url = "https://cloud.olakrutrim.com/v1/audio/generations/krutrim-tts"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = {
        "modelName": "tts",
        "input_text": input_text,
        "input_language": input_language,
        "input_speaker": input_speaker,
    }

    try:
        response = requests.post(
            url, json=payload, headers=headers, timeout=_REQUEST_TIMEOUT
        )
        response.raise_for_status()  # Raise an error for HTTP failure codes
        result = response.json()
    except requests.exceptions.RequestException as e:
        return None, f"Error: {e}"
    except ValueError as e:
        # Older versions of requests raise a plain ValueError for a non-JSON body.
        return None, f"Error: The TTS API returned a response that is not valid JSON: {e}"

    try:
        audio_data = base64.b64decode(result["output"])
    except (KeyError, TypeError, ValueError) as e:
        # Missing "output" field, non-object JSON, or invalid base64.
        return None, f"Error: Unexpected response from the TTS API: {e!r}"
    if not audio_data:
        return None, "Error: The TTS API returned no audio data."

    try:
        return _decode_audio(audio_data), "Audio generated successfully! Click the play button to listen."
    except (wave.Error, EOFError, ValueError) as e:
        # Fallback to a file path if in-process decoding fails; the file is
        # only written here because the success path never uses it.
        output_path = _save_audio_file(audio_data)
        return output_path, f"Audio generated but playback in browser might not work. You can download the file. Error: {e}"
# Create Gradio interface: text, language and speaker inputs on the left,
# generated audio and a status line on the right.
with gr.Blocks(title="Krutrim Text-to-Speech") as demo:
    gr.Markdown("# Krutrim Text-to-Speech Generator")
    gr.Markdown("Enter your text below and get it converted to speech using Krutrim's TTS API.")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to convert to speech",
                # Derived from the constant so the hint cannot drift from the enforced limit.
                placeholder=f"Type your text here (maximum {MAX_TEXT_LENGTH} characters)...",
                lines=5,
            )
            language = gr.Dropdown(
                label="Language",
                # (display name, value) pairs: users see "English" while the
                # callback still receives the language code "en".
                choices=[(name, code) for code, name in LANGUAGES.items()],
                value="en",
            )
            speaker = gr.Dropdown(
                label="Speaker",
                choices=["male", "female"],
                value="male",
            )
            submit_btn = gr.Button("Generate Audio")
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="numpy",  # Explicitly set to numpy for better browser compatibility
                autoplay=True,  # Auto play the audio when generated
                show_download_button=True,  # Show the download button for the audio
                waveform_options={"waveform_color": "blue", "waveform_progress_color": "red"},  # Customize waveform appearance
                format="wav",  # Specify output format
            )
            output_message = gr.Textbox(label="Status")
    # generate_audio returns (audio, status message), matching the two outputs.
    submit_btn.click(
        fn=generate_audio,
        inputs=[text_input, language, speaker],
        outputs=[audio_output, output_message],
    )
if __name__ == "__main__":
    # Start the Gradio server only when executed as a script, not on import.
    demo.launch()