Update app.py
Browse files
app.py
CHANGED
@@ -1,94 +1,68 @@
|
|
1 |
import streamlit as st
|
2 |
-
import
|
3 |
import torch
|
4 |
-
from transformers import pipeline
|
5 |
from diffusers import StableDiffusionPipeline
|
6 |
-
from
|
7 |
-
import
|
8 |
-
import os
|
9 |
|
10 |
-
st.set_page_config(page_title="Tamil
|
11 |
-
st.title("π€ Tamil Voice to Story & Image Generator")
|
12 |
|
13 |
-
# Load models
|
14 |
@st.cache_resource
|
15 |
def load_models():
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
# 3. Tiny Story Generator
|
24 |
-
story_gen = pipeline("text-generation", model="sshleifer/tiny-gpt2", device=0 if torch.cuda.is_available() else -1)
|
25 |
-
|
26 |
-
# 4. Image Generator
|
27 |
-
image_pipe = StableDiffusionPipeline.from_pretrained(
|
28 |
-
"CompVis/stable-diffusion-v1-4",
|
29 |
-
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
|
30 |
-
)
|
31 |
-
if torch.cuda.is_available():
|
32 |
-
image_pipe.to("cuda")
|
33 |
|
34 |
-
|
35 |
|
36 |
-
|
|
|
37 |
|
38 |
-
|
39 |
-
def translate_ta_to_en(text):
|
40 |
-
inputs = tokenizer_trans(text, return_tensors="pt", padding=True)
|
41 |
-
translated = model_trans.generate(**inputs, forced_bos_token_id=tokenizer_trans.lang_code_to_id["eng_Latn"])
|
42 |
-
return tokenizer_trans.batch_decode(translated, skip_special_tokens=True)[0]
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
# Function: Generate image
|
50 |
-
def generate_image(prompt):
|
51 |
-
image = image_pipe(prompt).images[0]
|
52 |
-
return image
|
53 |
-
|
54 |
-
# Upload or Record
|
55 |
-
input_method = st.radio("Select Input Method", ["Upload Audio", "Record Live"])
|
56 |
-
|
57 |
-
if input_method == "Upload Audio":
|
58 |
-
audio_file = st.file_uploader("Upload Tamil Audio", type=["wav", "mp3", "m4a"])
|
59 |
else:
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
|
64 |
-
tmpfile.write(audio_bytes.read())
|
65 |
-
audio_file = tmpfile.name
|
66 |
|
67 |
-
|
68 |
-
|
69 |
-
with st.spinner("π Transcribing Tamil audio..."):
|
70 |
-
result = whisper_pipe(audio_file)
|
71 |
-
tamil_text = result['text']
|
72 |
|
73 |
-
|
74 |
-
|
|
|
75 |
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
78 |
|
79 |
-
st.
|
80 |
-
|
|
|
81 |
|
82 |
-
with st.spinner("
|
83 |
-
|
|
|
84 |
|
85 |
-
st.
|
86 |
-
|
|
|
87 |
|
88 |
-
with st.spinner("
|
89 |
-
image =
|
|
|
90 |
|
91 |
-
|
92 |
-
|
93 |
-
elif st.button("Generate from Audio") and not audio_file:
|
94 |
-
st.warning("Please upload or record an audio file.")
|
|
|
1 |
import streamlit as st
|
2 |
+
import tempfile
|
3 |
import torch
|
4 |
+
from transformers import pipeline
|
5 |
from diffusers import StableDiffusionPipeline
|
6 |
+
from pydub import AudioSegment
|
7 |
+
import base64
|
|
|
8 |
|
9 |
+
# Configure the browser tab title and use Streamlit's centered page layout.
st.set_page_config(page_title="Tamil Audio to Story & Image", layout="centered")
|
|
|
10 |
|
11 |
+
# Load lightweight models
|
12 |
@st.cache_resource
def load_models():
    """Load and cache every model pipeline used by the app.

    ``st.cache_resource`` ensures the heavy downloads/initialisation happen
    once per server process, not on every Streamlit rerun.

    Returns:
        tuple: ``(whisper, translator, text_gen, image_gen)`` where
            whisper    -- HF ASR pipeline (``openai/whisper-tiny``) for speech-to-text,
            translator -- HF translation pipeline (``Helsinki-NLP/opus-mt-ta-en``), Tamil -> English,
            text_gen   -- HF text-generation pipeline (``sshleifer/tiny-gpt2``) for the story,
            image_gen  -- ``StableDiffusionPipeline`` (``CompVis/stable-diffusion-v1-4``).
    """
    whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
    translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ta-en")
    text_gen = pipeline("text-generation", model="sshleifer/tiny-gpt2")
    # Half precision on GPU halves VRAM use; fp16 is not supported on CPU,
    # so fall back to float32 there. (The previous revision of this file did
    # the same dtype selection; it was dropped in this rewrite.)
    image_gen = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    image_gen.to("cuda" if torch.cuda.is_available() else "cpu")
    return whisper, translator, text_gen, image_gen
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
+
whisper, translator, text_gen, image_gen = load_models()

st.title("🎙️ Tamil Audio to Story & Image")
st.write("Upload or record Tamil audio to generate English story and image.")

input_mode = st.radio("Choose Input Mode", ["Upload Audio", "Record Live Audio"])

# Collect the raw audio bytes plus the real file extension. The extension is
# needed below: previously the temp file was always created with a ".wav"
# suffix, which made the mp3->wav conversion branch unreachable and fed
# mislabelled mp3 bytes straight to the ASR pipeline.
audio_bytes = None
audio_suffix = ".wav"
if input_mode == "Upload Audio":
    uploaded_file = st.file_uploader("Upload Tamil Audio (.wav, .mp3)", type=["wav", "mp3"], key="upload")
    if uploaded_file:
        audio_bytes = uploaded_file.read()
        dot = uploaded_file.name.rfind(".")
        if dot != -1:
            audio_suffix = uploaded_file.name[dot:].lower()
else:
    # NOTE(review): ``st.audio_recorder`` is not a documented Streamlit API
    # (recent Streamlit exposes ``st.audio_input`` instead) — confirm against
    # the deployed Streamlit version or the component that provides it.
    # Recorded audio is WAV, so audio_suffix keeps its ".wav" default.
    audio_recorder = st.audio_recorder("Record your audio", format="audio/wav", key="recorder")
    if audio_recorder:
        audio_bytes = audio_recorder

if audio_bytes:
    st.audio(audio_bytes, format="audio/wav")

    # Persist the bytes to disk because the ASR pipeline consumes a file path.
    # delete=False keeps the file alive after the with-block; Whisper reads it below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=audio_suffix) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name

    # Convert mp3 to wav if needed (this branch is now reachable because
    # tmp_path carries the upload's real extension).
    if tmp_path.endswith(".mp3"):
        sound = AudioSegment.from_mp3(tmp_path)
        wav_path = tmp_path[:-len(".mp3")] + ".wav"
        sound.export(wav_path, format="wav")
        tmp_path = wav_path

    with st.spinner("Transcribing..."):
        transcription = whisper(tmp_path)["text"]
        st.text_area("Transcribed Tamil Text", transcription)

    with st.spinner("Translating..."):
        translation = translator(transcription)[0]['translation_text']
        st.text_area("Translated English Text", translation)

    with st.spinner("Generating Story..."):
        story = text_gen(translation, max_length=100)[0]['generated_text']
        st.text_area("Generated Story", story)

    with st.spinner("Generating Image..."):
        image = image_gen(prompt=translation).images[0]
        st.image(image, caption="Generated Image")
else:
    st.warning("Please upload or record an audio to proceed.")
|
|
|
|