Spaces:

Kishorekumar7
/

Voice_to_Text_and_Image

Sleeping

App Files Files Community

Voice_to_Text_and_Image / app.py

Kishorekumar7

Update app.py

f182d83 verified 3 months ago

raw

history blame

2.55 kB

	import streamlit as st
	import tempfile
	import torch
	from transformers import pipeline
	from diffusers import StableDiffusionPipeline
	from pydub import AudioSegment
	import base64

	# Page-level settings for the app: the page title and a centered layout.
	st.set_page_config(page_title="Tamil Audio to Story & Image", layout="centered")

	# Load lightweight models
	@st.cache_resource
	def load_models():
	    """Build the four model pipelines used by the app.

	    The function is decorated with ``st.cache_resource`` so the returned
	    objects can be reused instead of being rebuilt on every call.

	    Returns:
	        A 4-tuple ``(whisper, translator, text_gen, image_gen)``:
	        a speech-recognition pipeline (``openai/whisper-tiny``), a
	        Tamil-to-English translation pipeline
	        (``Helsinki-NLP/opus-mt-ta-en``), a text-generation pipeline
	        (``sshleifer/tiny-gpt2``) and a Stable Diffusion pipeline
	        (``CompVis/stable-diffusion-v1-4``) moved to CUDA when available,
	        otherwise left on the CPU.
	    """
	    speech_to_text = pipeline(
	        "automatic-speech-recognition", model="openai/whisper-tiny"
	    )
	    ta_to_en = pipeline("translation", model="Helsinki-NLP/opus-mt-ta-en")
	    story_writer = pipeline("text-generation", model="sshleifer/tiny-gpt2")

	    diffusion = StableDiffusionPipeline.from_pretrained(
	        "CompVis/stable-diffusion-v1-4"
	    )
	    # Prefer the GPU for image generation; fall back to the CPU otherwise.
	    if torch.cuda.is_available():
	        target_device = "cuda"
	    else:
	        target_device = "cpu"
	    diffusion.to(target_device)

	    return speech_to_text, ta_to_en, story_writer, diffusion

	# ---------------------------------------------------------------------------
	# Main UI flow: pick an audio source, transcribe it, translate the transcript
	# to English, then generate a short story and an image from the translation.
	# ---------------------------------------------------------------------------
	whisper, translator, text_gen, image_gen = load_models()

	st.title("🎙️ Tamil Audio to Story & Image")
	st.write("Upload or record Tamil audio to generate English story and image.")

	input_mode = st.radio("Choose Input Mode", ["Upload Audio", "Record Live Audio"])

	# Raw bytes of the selected clip and the container format they are stored in.
	audio_bytes = None
	audio_ext = ".wav"
	if input_mode == "Upload Audio":
	    uploaded_file = st.file_uploader(
	        "Upload Tamil Audio (.wav, .mp3)", type=["wav", "mp3"], key="upload"
	    )
	    if uploaded_file:
	        # getvalue() returns the whole buffer regardless of the read position.
	        audio_bytes = uploaded_file.getvalue()
	        # Remember the real format so MP3 data is not mislabeled as WAV.
	        if uploaded_file.name.lower().endswith(".mp3"):
	            audio_ext = ".mp3"
	else:
	    # Streamlit has no `st.audio_recorder`; the built-in microphone widget is
	    # `st.audio_input` (needs a recent Streamlit release). It returns None
	    # until something is recorded, then a file-like object holding WAV data.
	    recording = st.audio_input("Record your audio", key="recorder")
	    if recording:
	        audio_bytes = recording.getvalue()

	if audio_bytes:
	    playback_mime = "audio/mpeg" if audio_ext == ".mp3" else "audio/wav"
	    st.audio(audio_bytes, format=playback_mime)

	    # Keep every intermediate file inside one temporary directory so that
	    # nothing is left behind on disk once transcription has finished.
	    with tempfile.TemporaryDirectory() as work_dir:
	        with tempfile.NamedTemporaryFile(
	            dir=work_dir, delete=False, suffix=audio_ext
	        ) as tmp:
	            tmp.write(audio_bytes)
	            tmp_path = tmp.name

	        # Convert mp3 to wav if needed
	        if audio_ext == ".mp3":
	            sound = AudioSegment.from_mp3(tmp_path)
	            # Swap only the trailing extension, never a match inside the path.
	            tmp_path = tmp_path[: -len(".mp3")] + ".wav"
	            sound.export(tmp_path, format="wav")

	        with st.spinner("Transcribing..."):
	            transcription = whisper(tmp_path)["text"]

	    st.text_area("Transcribed Tamil Text", transcription)

	    if not transcription.strip():
	        # Nothing to translate or illustrate; skip the remaining stages.
	        st.error(
	            "No speech could be transcribed from the audio. "
	            "Please try another recording."
	        )
	    else:
	        with st.spinner("Translating..."):
	            translation = translator(transcription)[0]['translation_text']
	        st.text_area("Translated English Text", translation)

	        with st.spinner("Generating Story..."):
	            # max_new_tokens bounds only the generated continuation, so a long
	            # translation cannot use up the whole length budget by itself.
	            story = text_gen(translation, max_new_tokens=100)[0]['generated_text']
	        st.text_area("Generated Story", story)

	        with st.spinner("Generating Image..."):
	            image = image_gen(prompt=translation).images[0]
	        st.image(image, caption="Generated Image")

	else:
	    st.warning("Please upload or record an audio to proceed.")