import streamlit as st
import tempfile
import torch
from transformers import pipeline
from diffusers import StableDiffusionPipeline
from pydub import AudioSegment

st.set_page_config(page_title="Tamil Audio to Story & Image", layout="centered")


# Load and cache the models once per session
@st.cache_resource
def load_models():
    # whisper-tiny for Tamil speech recognition, opus-mt-ta-en for Tamil->English
    # translation, tiny-gpt2 as a very small placeholder story generator, and
    # Stable Diffusion v1-4 for image generation.
    whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
    translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ta-en")
    text_gen = pipeline("text-generation", model="sshleifer/tiny-gpt2")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    image_gen = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    )
    image_gen.to(device)
    return whisper, translator, text_gen, image_gen


whisper, translator, text_gen, image_gen = load_models()

st.title("🎙️ Tamil Audio to Story & Image")
st.write("Upload or record Tamil audio to generate an English story and an image.")

input_mode = st.radio("Choose Input Mode", ["Upload Audio", "Record Live Audio"])

audio_bytes = None
audio_suffix = ".wav"  # recorded audio is WAV; uploads may be WAV or MP3

if input_mode == "Upload Audio":
    uploaded_file = st.file_uploader(
        "Upload Tamil Audio (.wav, .mp3)", type=["wav", "mp3"], key="upload"
    )
    if uploaded_file:
        audio_bytes = uploaded_file.read()
        audio_suffix = "." + uploaded_file.name.rsplit(".", 1)[-1].lower()
else:
    # st.audio_input requires Streamlit >= 1.39; it returns the recording as a
    # WAV UploadedFile-like object.
    recorded_audio = st.audio_input("Record your audio", key="recorder")
    if recorded_audio:
        audio_bytes = recorded_audio.read()

if audio_bytes:
    st.audio(audio_bytes, format="audio/mp3" if audio_suffix == ".mp3" else "audio/wav")

    # Write the raw bytes to a temporary file with the matching extension
    with tempfile.NamedTemporaryFile(delete=False, suffix=audio_suffix) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name

    # Convert MP3 to WAV so the speech-recognition pipeline always gets WAV input
    if tmp_path.endswith(".mp3"):
        sound = AudioSegment.from_mp3(tmp_path)
        tmp_path = tmp_path[: -len(".mp3")] + ".wav"
        sound.export(tmp_path, format="wav")

    with st.spinner("Transcribing..."):
        # Force Tamil transcription instead of relying on language auto-detection
        transcription = whisper(
            tmp_path, generate_kwargs={"language": "tamil", "task": "transcribe"}
        )["text"]
    st.text_area("Transcribed Tamil Text", transcription)

    with st.spinner("Translating..."):
        translation = translator(transcription)[0]["translation_text"]
    st.text_area("Translated English Text", translation)

    with st.spinner("Generating Story..."):
        # max_new_tokens limits the continuation length regardless of prompt size
        story = text_gen(translation, max_new_tokens=100)[0]["generated_text"]
    st.text_area("Generated Story", story)

    with st.spinner("Generating Image..."):
        image = image_gen(prompt=translation).images[0]
    st.image(image, caption="Generated Image")
else:
    st.warning("Please upload or record audio to proceed.")
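
# --- Running the app: a minimal setup sketch. The package list mirrors the
# imports above ("sentencepiece" is assumed to be needed by the Marian
# translation tokenizer), and "app.py" is a placeholder for whatever name this
# file is saved under.
#
#   pip install streamlit torch transformers diffusers pydub sentencepiece
#   streamlit run app.py
#
# pydub shells out to ffmpeg for MP3 decoding, so ffmpeg must be on the PATH;
# the Hugging Face models are downloaded on first run.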