import streamlit as st
import tempfile
import torch
from transformers import pipeline
from diffusers import StableDiffusionPipeline
from pydub import AudioSegment

st.set_page_config(page_title="Tamil Audio to Story & Image", layout="centered")


# Load and cache the models once per session
@st.cache_resource
def load_models():
    # whisper-tiny for Tamil speech recognition, opus-mt-ta-en for Tamil->English
    # translation, tiny-gpt2 as a very small placeholder story generator, and
    # Stable Diffusion v1-4 for image generation.
    whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
    translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ta-en")
    text_gen = pipeline("text-generation", model="sshleifer/tiny-gpt2")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    image_gen = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    )
    image_gen.to(device)
    return whisper, translator, text_gen, image_gen


whisper, translator, text_gen, image_gen = load_models()

st.title("🎙️ Tamil Audio to Story & Image")
st.write("Upload or record Tamil audio to generate an English story and an image.")

input_mode = st.radio("Choose Input Mode", ["Upload Audio", "Record Live Audio"])

audio_bytes = None
audio_suffix = ".wav"  # recorded audio is WAV; uploads may be WAV or MP3

if input_mode == "Upload Audio":
    uploaded_file = st.file_uploader(
        "Upload Tamil Audio (.wav, .mp3)", type=["wav", "mp3"], key="upload"
    )
    if uploaded_file:
        audio_bytes = uploaded_file.read()
        audio_suffix = "." + uploaded_file.name.rsplit(".", 1)[-1].lower()
else:
    # st.audio_input requires Streamlit >= 1.39; it returns the recording as a
    # WAV UploadedFile-like object.
    recorded_audio = st.audio_input("Record your audio", key="recorder")
    if recorded_audio:
        audio_bytes = recorded_audio.read()

if audio_bytes:
    st.audio(audio_bytes, format="audio/mp3" if audio_suffix == ".mp3" else "audio/wav")

    # Write the raw bytes to a temporary file with the matching extension
    with tempfile.NamedTemporaryFile(delete=False, suffix=audio_suffix) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name

    # Convert MP3 to WAV so the speech-recognition pipeline always gets WAV input
    if tmp_path.endswith(".mp3"):
        sound = AudioSegment.from_mp3(tmp_path)
        tmp_path = tmp_path[: -len(".mp3")] + ".wav"
        sound.export(tmp_path, format="wav")

    with st.spinner("Transcribing..."):
        # Force Tamil transcription instead of relying on language auto-detection
        transcription = whisper(
            tmp_path, generate_kwargs={"language": "tamil", "task": "transcribe"}
        )["text"]
    st.text_area("Transcribed Tamil Text", transcription)

    with st.spinner("Translating..."):
        translation = translator(transcription)[0]["translation_text"]
    st.text_area("Translated English Text", translation)

    with st.spinner("Generating Story..."):
        # max_new_tokens limits the continuation length regardless of prompt size
        story = text_gen(translation, max_new_tokens=100)[0]["generated_text"]
    st.text_area("Generated Story", story)

    with st.spinner("Generating Image..."):
        image = image_gen(prompt=translation).images[0]
    st.image(image, caption="Generated Image")
else:
    st.warning("Please upload or record audio to proceed.")
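
# --- Running the app: a minimal setup sketch. The package list mirrors the
# imports above ("sentencepiece" is assumed to be needed by the Marian
# translation tokenizer), and "app.py" is a placeholder for whatever name this
# file is saved under.
#
#   pip install streamlit torch transformers diffusers pydub sentencepiece
#   streamlit run app.py
#
# pydub shells out to ffmpeg for MP3 decoding, so ffmpeg must be on the PATH;
# the Hugging Face models are downloaded on first run.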