# app.py — Tamil Speech → Image & Story Generator (Streamlit)
# (Hugging Face Spaces page residue removed: uploader "Kishorekumar7",
#  commit 60b0b0c "Update app.py", 3.12 kB — it was not valid Python.)
import os
import torch
import streamlit as st
from groq import Groq
from diffusers import AutoPipelineForText2Image
import tempfile
import soundfile as sf
# Load API keys from the environment (configure these as host/Space secrets).
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HF_API_KEY = os.getenv("HF_API_KEY")
# Initialize Groq client with API key
client = Groq(api_key=GROQ_API_KEY)
# Load lightweight Hugging Face image generation model.
# BUG FIX: `use_auth_token` is deprecated and removed in recent diffusers
# releases; the accepted keyword for gated/private model access is `token`.
image_gen = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", token=HF_API_KEY
)
# Run the pipeline on GPU when available, otherwise fall back to CPU.
image_gen.to("cuda" if torch.cuda.is_available() else "cpu")
# Function to transcribe Tamil audio using Groq's Whisper
def transcribe(audio_path):
    """Transcribe a Tamil audio file to text with Groq's Whisper large-v3.

    Args:
        audio_path: Filesystem path to the audio file (wav/mp3/m4a).

    Returns:
        The transcribed text as a string.
    """
    with open(audio_path, "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=(audio_path, file.read()),
            model="whisper-large-v3",
            language="ta",  # Tamil
            response_format="verbose_json"
        )
    # BUG FIX: the Groq SDK returns a Transcription object, not a dict, so
    # transcription["text"] raised TypeError. Use the .text attribute.
    return transcription.text
# Function to translate Tamil to English using Groq's Gemma
def translate_text(tamil_text):
    """Translate Tamil text to English using a Groq-hosted chat model.

    Args:
        tamil_text: The Tamil source text.

    Returns:
        The English translation as a string.
    """
    response = client.chat.completions.create(
        model="gemma-7b-it",
        messages=[{"role": "user", "content": f"Translate this Tamil text to English: {tamil_text}"}]
    )
    # BUG FIX: non-streaming completions expose choices[0].message; .delta
    # only exists on streaming chunks, so the original raised AttributeError.
    return response.choices[0].message.content
# Function to generate text using Groq's DeepSeek R1
def generate_text(prompt):
    """Generate a short story for *prompt* using a Groq-hosted model.

    Args:
        prompt: English text to base the story on.

    Returns:
        The generated story as a string.
    """
    response = client.chat.completions.create(
        # NOTE(review): "deepseek-coder-r1-7b" does not look like a valid Groq
        # model id — confirm against Groq's current model list (e.g. the
        # DeepSeek R1 distill models) before deploying.
        model="deepseek-coder-r1-7b",
        messages=[{"role": "user", "content": f"Write a short story about: {prompt}"}]
    )
    # BUG FIX: non-streaming completions expose choices[0].message; .delta
    # only exists on streaming chunks, so the original raised AttributeError.
    return response.choices[0].message.content
# Function to generate an image
def generate_image(prompt):
    """Render one image for *prompt* with the SDXL-Turbo pipeline.

    Args:
        prompt: Text description of the desired image.

    Returns:
        The first generated PIL image.
    """
    # The pipeline result object carries a .images list; only the first
    # image is ever used by this app.
    result = image_gen(prompt=prompt)
    return result.images[0]
# Streamlit UI
st.title("Tamil Speech to Image & Story Generator")
# Audio input - Recording or Uploading
st.subheader("Upload or Record Audio")
# BUG FIX: st.audio() is an *output* widget (an audio player) — it displays
# audio and returns nothing usable, so the original recording path could
# never work (sf.read on its return value would crash). st.audio_input
# (Streamlit >= 1.39) is the microphone-recording widget; it returns an
# UploadedFile of WAV bytes, or None until the user records.
recorded_audio = st.audio_input("Record your Tamil speech")
uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "m4a"])
audio_path = None
if uploaded_file is not None:
    # Persist the upload to a temp file so transcribe() can reopen it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio.write(uploaded_file.read())
        audio_path = temp_audio.name
elif recorded_audio is not None:
    # The recording is already WAV-encoded bytes; write them out directly
    # (no need to round-trip through soundfile).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio.write(recorded_audio.read())
        audio_path = temp_audio.name
if st.button("Generate") and audio_path:
    with st.spinner("Transcribing Tamil speech..."):
        tamil_text = transcribe(audio_path)
    with st.spinner("Translating to English..."):
        english_text = translate_text(tamil_text)
    with st.spinner("Generating story..."):
        story = generate_text(english_text)
    with st.spinner("Generating image..."):
        image = generate_image(english_text)
    st.subheader("Tamil Transcription")
    st.write(tamil_text)
    st.subheader("English Translation")
    st.write(english_text)
    st.subheader("Generated Story")
    st.write(story)
    st.subheader("Generated Image")
    st.image(image)