"""Streamlit app: Tamil speech -> transcription -> English -> story and image.

Pipeline, in order:
  1. Audio is uploaded or recorded in the browser.
  2. Groq's Whisper endpoint transcribes it as Tamil.
  3. A Groq chat model translates the transcript to English.
  4. A Groq chat model writes a short story about the English text.
  5. A local SDXL-Turbo pipeline renders an image from the English text.
"""

import os
import tempfile

import soundfile as sf
import streamlit as st
import torch
from diffusers import AutoPipelineForText2Image
from groq import Groq

# Load API keys
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HF_API_KEY = os.getenv("HF_API_KEY")

# Model identifiers. Defaults are the ids this script has always used; each can
# be overridden through the environment without touching the code.
# NOTE(review): hosted model ids get retired over time -- confirm these are
# still served by Groq before deploying.
TRANSCRIPTION_MODEL = os.getenv("GROQ_TRANSCRIPTION_MODEL", "whisper-large-v3")
TRANSLATION_MODEL = os.getenv("GROQ_TRANSLATION_MODEL", "gemma-7b-it")
STORY_MODEL = os.getenv("GROQ_STORY_MODEL", "deepseek-coder-r1-7b")
IMAGE_MODEL = "stabilityai/sdxl-turbo"

# Initialize Groq client with API key
client = Groq(api_key=GROQ_API_KEY)


@st.cache_resource
def _load_image_pipeline():
    """Load the text-to-image pipeline once and move it to the best device.

    Streamlit re-executes this script on every widget interaction; caching
    the pipeline as a resource avoids reloading the weights on each rerun.
    """
    # NOTE(review): newer diffusers releases prefer `token=` over
    # `use_auth_token=`; kept as-is to match the installed version -- confirm.
    pipeline = AutoPipelineForText2Image.from_pretrained(
        IMAGE_MODEL, use_auth_token=HF_API_KEY
    )
    pipeline.to("cuda" if torch.cuda.is_available() else "cpu")
    return pipeline


# Load lightweight Hugging Face image generation model
image_gen = _load_image_pipeline()


def _chat(model, prompt):
    """Send a single-turn user prompt to a Groq chat model and return its reply text."""
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    # Non-streaming responses expose the reply on `.message`; `.delta` only
    # exists on streamed chunks.
    return response.choices[0].message.content


def transcribe(audio_path, model=TRANSCRIPTION_MODEL):
    """Transcribe a Tamil audio file using Groq's Whisper endpoint.

    Args:
        audio_path: Path to the audio file on disk.
        model: Groq transcription model id.

    Returns:
        The transcribed text.
    """
    with open(audio_path, "rb") as file:
        transcription = client.audio.transcriptions.create(
            # Send only the base name; its extension tells the API the format.
            file=(os.path.basename(audio_path), file.read()),
            model=model,
            language="ta",  # Tamil
            response_format="verbose_json",
        )
    # The SDK returns a response object, not a dict.
    return transcription.text


def translate_text(tamil_text, model=TRANSLATION_MODEL):
    """Translate Tamil text to English using a Groq chat model.

    Args:
        tamil_text: Text to translate.
        model: Groq chat model id.

    Returns:
        The model's reply, expected to be the English translation.
    """
    return _chat(model, f"Translate this Tamil text to English: {tamil_text}")


def generate_text(prompt, model=STORY_MODEL):
    """Generate a short story about ``prompt`` using a Groq chat model.

    Args:
        prompt: Subject of the story.
        model: Groq chat model id.

    Returns:
        The generated story text.
    """
    return _chat(model, f"Write a short story about: {prompt}")


def generate_image(prompt, num_inference_steps=1, guidance_scale=0.0):
    """Generate one image for ``prompt`` with the loaded diffusion pipeline.

    The defaults follow the SDXL-Turbo model card: a single denoising step
    with classifier-free guidance disabled.

    Args:
        prompt: Text description of the image.
        num_inference_steps: Number of denoising steps.
        guidance_scale: Guidance strength; 0.0 disables guidance.

    Returns:
        The first image produced by the pipeline.
    """
    result = image_gen(
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
    )
    return result.images[0]


def _save_upload(uploaded_file):
    """Write an uploaded file to a temp file, keeping its extension; return the path."""
    # Keep the real extension (.mp3/.m4a/.wav) so the audio is not mislabelled
    # as WAV when it is sent for transcription.
    suffix = os.path.splitext(uploaded_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_audio:
        temp_audio.write(uploaded_file.getvalue())
    return temp_audio.name


def _save_recording(recorded_audio):
    """Re-encode a recorded audio buffer as a temp WAV file; return the path."""
    recorded_audio.seek(0)
    audio_data, samplerate = sf.read(recorded_audio)
    # Reserve a path and close the handle before soundfile opens it, so the
    # write also works on platforms that forbid opening a file twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        wav_path = temp_audio.name
    sf.write(wav_path, audio_data, samplerate)
    return wav_path


# Streamlit UI
st.title("Tamil Speech to Image & Story Generator")

# Audio input - Recording or Uploading
st.subheader("Upload or Record Audio")
# `st.audio` only plays audio back; recording needs `st.audio_input`, which
# older Streamlit releases lack. Without it, only uploading is offered.
recorded_audio = st.audio_input("Record audio") if hasattr(st, "audio_input") else None
uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "m4a"])

audio_path = None

if st.button("Generate"):
    if uploaded_file is None and recorded_audio is None:
        st.warning("Please upload or record audio first.")
        st.stop()

    # Materialise the audio on disk only when it is needed, and always remove
    # it afterwards so reruns do not accumulate temp files.
    if uploaded_file is not None:
        audio_path = _save_upload(uploaded_file)
    else:
        audio_path = _save_recording(recorded_audio)

    try:
        with st.spinner("Transcribing Tamil speech..."):
            tamil_text = transcribe(audio_path)
    finally:
        os.remove(audio_path)

    if not tamil_text or not tamil_text.strip():
        st.warning("No speech could be transcribed from the audio.")
        st.stop()

    with st.spinner("Translating to English..."):
        english_text = translate_text(tamil_text)

    with st.spinner("Generating story..."):
        story = generate_text(english_text)

    with st.spinner("Generating image..."):
        image = generate_image(english_text)

    st.subheader("Tamil Transcription")
    st.write(tamil_text)

    st.subheader("English Translation")
    st.write(english_text)

    st.subheader("Generated Story")
    st.write(story)

    st.subheader("Generated Image")
    st.image(image)