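# Tamil Speech to Image & Story Generator (Streamlit app)
# Pipeline: record or upload Tamil audio -> Whisper transcription (Groq)
#   -> English translation (Groq chat) -> short story (Groq chat)
#   -> illustration (SDXL-Turbo via diffusers)
#
# Assumed setup (dependency list and filename are illustrative):
#   pip install streamlit groq diffusers torch soundfile streamlit-webrtc numpy
#   export GROQ_API_KEY=...
#   streamlit run app.py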
import os

import numpy as np
import soundfile as sf
import streamlit as st
import torch
from diffusers import AutoPipelineForText2Image
from groq import Groq
from streamlit_webrtc import WebRtcMode, webrtc_streamer
# Load API keys
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HF_API_KEY = os.getenv("HF_API_KEY")
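# Note: HF_API_KEY is read but never used below; it would only be needed
# if the diffusers model were gated on the Hugging Face Hub.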
# Initialize Groq client
client = Groq(api_key=GROQ_API_KEY)
# Load image generation model (SDXL-Turbo) once; use the GPU when available
device = "cuda" if torch.cuda.is_available() else "cpu"

@st.cache_resource  # avoid reloading the pipeline on every Streamlit rerun
def load_image_pipeline():
    return AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo").to(device)

image_gen = load_image_pipeline()
# Transcribe Tamil audio with Groq's hosted Whisper model
def transcribe(audio_path):
    with open(audio_path, "rb") as file:
        transcription = client.audio.transcriptions.create(
            file=(audio_path, file.read()),
            model="whisper-large-v3",
            language="ta",
            response_format="verbose_json",
        )
    return transcription.text  # the SDK returns an object, not a dict
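# Note: with response_format="verbose_json" the response also carries
# segment-level timestamps that could be surfaced in the UI if needed.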
# Translate Tamil text to English via a Groq chat model
def translate_text(tamil_text):
    response = client.chat.completions.create(
        model="gemma-7b-it",  # substitute any chat model currently listed on Groq
        messages=[{"role": "user", "content": f"Translate this Tamil text to English: {tamil_text}"}]
    )
    return response.choices[0].message.content
# Generate a short story from the translated prompt
def generate_text(prompt):
    response = client.chat.completions.create(
        model="deepseek-r1-distill-llama-70b",  # substitute any chat model currently listed on Groq
        messages=[{"role": "user", "content": f"Write a short story about: {prompt}"}]
    )
    return response.choices[0].message.content
# Generate an illustration for the prompt
def generate_image(prompt):
    # SDXL-Turbo is tuned for very few denoising steps and no classifier-free guidance
    img = image_gen(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]
    return img
# Streamlit UI
st.title("Tamil Speech to Image & Story Generator")
# Choose input method
input_method = st.radio("Choose Input Method:", ("Record Audio", "Upload Audio"))
audio_path = None
if input_method == "Record Audio":
    st.subheader("Record your Tamil speech")
    recorder = webrtc_streamer(key="record_audio", mode=WebRtcMode.SENDONLY,
                               media_stream_constraints={"video": False, "audio": True})
    if recorder.audio_receiver:
        frames = recorder.audio_receiver.get_frames()  # list of av.AudioFrame
        if frames:
            channels = len(frames[0].layout.channels)
            # Packed PCM frames flatten to interleaved samples; reshape to (samples, channels)
            audio_data = np.concatenate([f.to_ndarray().reshape(-1, channels) for f in frames])
            audio_path = "recorded_audio.wav"
            sf.write(audio_path, audio_data, frames[0].sample_rate)
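# Caveat: get_frames() only drains the frames currently buffered by the
# receiver; a production recorder would accumulate frames across reruns.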
elif input_method == "Upload Audio":
    uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])
    if uploaded_file:
        # Keep the original extension so the transcription API can detect the format
        audio_path = f"uploaded_{uploaded_file.name}"
        with open(audio_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
if st.button("Generate"):
    if not audio_path:
        st.error("Please provide an audio file.")
        st.stop()
    # Run the full pipeline: transcribe -> translate -> story -> image
    tamil_text = transcribe(audio_path)
    english_text = translate_text(tamil_text)
    story = generate_text(english_text)
    image = generate_image(english_text)
    # Display results
    st.subheader("Tamil Transcription")
    st.write(tamil_text)
    st.subheader("English Translation")
    st.write(english_text)
    st.subheader("Generated Story")
    st.write(story)
    st.subheader("Generated Image")
    st.image(image, caption="Generated Image")