Spaces:

Kishorekumar7
/

Voice_to_Text_and_Image

Sleeping

File size: 2,545 Bytes

4a997af
f182d83
4dffac9
f182d83
9c054fd
f182d83
 
9c054fd
f182d83
b0b5043
f182d83
9c054fd
 
f182d83
 
 
 
 
 
9c054fd
f182d83
b0b5043
f182d83
 
9c054fd
f182d83
9c054fd
f182d83
 
 
 
 
9c054fd
f182d83
 
 
9c054fd
f182d83
 
9c054fd
f182d83
 
 
9c054fd
f182d83
 
 
 
 
9c054fd
f182d83
 
 
60b0b0c
f182d83
 
 
9c054fd
f182d83
 
 
60b0b0c
f182d83
 
 
9c054fd
f182d83

import streamlit as st
import tempfile
import torch
from transformers import pipeline
from diffusers import StableDiffusionPipeline
from pydub import AudioSegment
import base64

st.set_page_config(page_title="Tamil Audio to Story & Image", layout="centered")

# Load lightweight models
@st.cache_resource
def load_models():
    """Instantiate every model the app chains together.

    The function is wrapped in ``st.cache_resource`` so the loaded objects
    can be reused by Streamlit instead of being rebuilt on each script run.

    Returns:
        A 4-tuple ``(whisper, translator, text_gen, image_gen)``:
        the speech-recognition pipeline, the translation pipeline, the
        text-generation pipeline, and the Stable Diffusion pipeline (already
        moved to CUDA when available, otherwise left on CPU).
    """
    # (task, checkpoint) pairs for the Hugging Face pipelines, in load order.
    hf_specs = (
        ("automatic-speech-recognition", "openai/whisper-tiny"),
        ("translation", "Helsinki-NLP/opus-mt-ta-en"),
        ("text-generation", "sshleifer/tiny-gpt2"),
    )
    loaded = []
    for task_name, checkpoint in hf_specs:
        loaded.append(pipeline(task_name, model=checkpoint))
    speech_to_text, ta_to_en, story_writer = loaded

    painter = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"
    painter.to(target_device)

    return speech_to_text, ta_to_en, story_writer, painter

# load_models() is wrapped in st.cache_resource, so this unpacks the shared
# model objects rather than reloading them on every script run.
whisper, translator, text_gen, image_gen = load_models()

st.title("🎙️ Tamil Audio to Story & Image")
st.write("Upload or record Tamil audio to generate English story and image.")

input_mode = st.radio("Choose Input Mode", ["Upload Audio", "Record Live Audio"])

# Raw bytes of the clip to process; stays None until the user supplies audio.
audio_bytes = None
if input_mode == "Upload Audio":
    uploaded_file = st.file_uploader("Upload Tamil Audio (.wav, .mp3)", type=["wav", "mp3"], key="upload")
    if uploaded_file is not None:
        # getvalue() returns the full buffer regardless of the read position.
        audio_bytes = uploaded_file.getvalue()
else:
    # Streamlit has no `st.audio_recorder`; its built-in microphone widget is
    # `st.audio_input`, which takes no `format` argument and returns an
    # UploadedFile-like object (or None until something is recorded).
    recorded_clip = st.audio_input("Record your audio", key="recorder")
    if recorded_clip is not None:
        audio_bytes = recorded_clip.getvalue()

if audio_bytes:
    # Identify WAV data by its RIFF/WAVE header instead of assuming it:
    # uploads may be MP3, and must not be labelled or saved as WAV.
    is_wav = audio_bytes[:4] == b"RIFF" and audio_bytes[8:12] == b"WAVE"
    st.audio(audio_bytes, format="audio/wav" if is_wav else "audio/mpeg")

    # Keep scratch files in a temporary directory so they are removed once
    # transcription is done instead of accumulating on disk.
    with tempfile.TemporaryDirectory() as tmp_dir:
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".wav" if is_wav else ".mp3", dir=tmp_dir
        ) as tmp:
            tmp.write(audio_bytes)
            tmp_path = tmp.name

        # Convert mp3 to wav if needed
        if tmp_path.endswith(".mp3"):
            sound = AudioSegment.from_mp3(tmp_path)
            # Swap only the trailing extension; str.replace would also rewrite
            # any ".mp3" that happens to appear earlier in the path.
            tmp_path = tmp_path[: -len(".mp3")] + ".wav"
            # export() hands back an open file object; close it right away so
            # the directory cleanup is not blocked by a dangling handle.
            sound.export(tmp_path, format="wav").close()

        with st.spinner("Transcribing..."):
            transcription = whisper(tmp_path)["text"]
            st.text_area("Transcribed Tamil Text", transcription)

    # Nothing recognisable was said: stop here rather than feeding an empty
    # prompt to the translation, story and image models.
    if not transcription.strip():
        st.warning("No speech could be transcribed from the audio.")
        st.stop()

    with st.spinner("Translating..."):
        translation = translator(transcription)[0]['translation_text']
        st.text_area("Translated English Text", translation)

    with st.spinner("Generating Story..."):
        # max_new_tokens budgets only the generated continuation; max_length
        # also counts the prompt, so a long translation left no room to write.
        story = text_gen(translation, max_new_tokens=100)[0]['generated_text']
        st.text_area("Generated Story", story)

    with st.spinner("Generating Image..."):
        image = image_gen(prompt=translation).images[0]
        st.image(image, caption="Generated Image")

else:
    st.warning("Please upload or record an audio to proceed.")