|
import base64
import os
import tempfile

import streamlit as st
import torch
from diffusers import StableDiffusionPipeline
from pydub import AudioSegment
from transformers import pipeline
|
|
|
# Configure the page title and use the centered page layout.
st.set_page_config(
    page_title="Tamil Audio to Story & Image",
    layout="centered",
)
|
|
|
|
|
@st.cache_resource
def load_models():
    """Instantiate every model the app uses and return them as a 4-tuple.

    Returns:
        ``(whisper, translator, text_gen, image_gen)`` in that order: the
        speech-recognition, translation and text-generation pipelines,
        followed by the Stable Diffusion pipeline (moved to CUDA when
        ``torch.cuda.is_available()`` reports a GPU, otherwise to CPU).
    """
    # (task, checkpoint) pairs, loaded in this order.
    hf_models = (
        ("automatic-speech-recognition", "openai/whisper-tiny"),
        ("translation", "Helsinki-NLP/opus-mt-ta-en"),
        ("text-generation", "sshleifer/tiny-gpt2"),
    )
    speech_to_text, ta_to_en, story_writer = (
        pipeline(task, model=checkpoint) for task, checkpoint in hf_models
    )

    diffusion = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4"
    )
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    diffusion.to(device)

    return speech_to_text, ta_to_en, story_writer, diffusion
|
|
|
# Main page flow: collect Tamil audio (upload or live recording), transcribe
# it, translate the transcript to English, then generate a short story and an
# image from the translation.
whisper, translator, text_gen, image_gen = load_models()

# The emoji is written with escapes so it survives re-encoding of this file.
# NOTE(review): the original glyph was mangled; it is presumably the
# studio-microphone emoji (U+1F399 U+FE0F) -- confirm.
st.title("\U0001F399\uFE0F Tamil Audio to Story & Image")
st.write("Upload or record Tamil audio to generate English story and image.")

input_mode = st.radio("Choose Input Mode", ["Upload Audio", "Record Live Audio"])

audio_bytes = None
audio_suffix = ".wav"  # extension of the incoming audio; recordings are WAV
if input_mode == "Upload Audio":
    uploaded_file = st.file_uploader(
        "Upload Tamil Audio (.wav, .mp3)", type=["wav", "mp3"], key="upload"
    )
    if uploaded_file is not None:
        # getvalue() returns the whole buffer regardless of the read cursor.
        audio_bytes = uploaded_file.getvalue()
        audio_suffix = os.path.splitext(uploaded_file.name)[1].lower() or ".wav"
elif hasattr(st, "audio_input"):
    # st.audio_input is Streamlit's built-in microphone widget; it returns an
    # uploaded-file-like object, or None until something has been recorded.
    recording = st.audio_input("Record your audio", key="recorder")
    if recording is not None:
        audio_bytes = recording.getvalue()
else:
    st.error(
        "Live recording needs a Streamlit version that provides st.audio_input."
    )

if audio_bytes:
    is_mp3 = audio_suffix == ".mp3"
    st.audio(audio_bytes, format="audio/mpeg" if is_mp3 else "audio/wav")

    # Every temp file created below is tracked here and removed in `finally`,
    # because delete=False files are otherwise never cleaned up.
    temp_paths = []
    try:
        # Keep the real extension so the MP3 branch below can actually trigger.
        with tempfile.NamedTemporaryFile(delete=False, suffix=audio_suffix) as tmp:
            temp_paths.append(tmp.name)
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        if is_mp3:
            # Convert MP3 uploads to WAV before transcription.
            sound = AudioSegment.from_mp3(tmp_path)
            tmp_path = os.path.splitext(tmp_path)[0] + ".wav"
            temp_paths.append(tmp_path)
            sound.export(tmp_path, format="wav")

        with st.spinner("Transcribing..."):
            transcription = whisper(tmp_path)["text"]
    finally:
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)

    st.text_area("Transcribed Tamil Text", transcription)

    if not transcription.strip():
        # Nothing to translate; stop before running the remaining models.
        st.warning("No speech could be transcribed from the audio.")
        st.stop()

    with st.spinner("Translating..."):
        translation = translator(transcription)[0]["translation_text"]
    st.text_area("Translated English Text", translation)

    with st.spinner("Generating Story..."):
        # max_new_tokens bounds only the generated continuation, whereas
        # max_length also counts the prompt and leaves no room to generate
        # when the translation is long.
        story = text_gen(translation, max_new_tokens=100)[0]["generated_text"]
    st.text_area("Generated Story", story)

    with st.spinner("Generating Image..."):
        image = image_gen(prompt=translation).images[0]
    st.image(image, caption="Generated Image")
else:
    st.warning("Please upload or record an audio to proceed.")
|
|