Kishorekumar7 committed on
Commit
8b5b601
·
verified ·
1 Parent(s): 899527c

Update app.py

Files changed (1)
  1. app.py +109 -77
app.py CHANGED
@@ -1,87 +1,119 @@
  import os
  import gradio as gr
- import torch
- import numpy as np
- import tempfile
- import soundfile as sf  # For saving NumPy array as WAV
  from groq import Groq
- from diffusers import AutoPipelineForText2Image

- # Load API keys
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
- HF_API_KEY = os.getenv("HF_API_KEY")

- # Initialize Groq client with API key
  client = Groq(api_key=GROQ_API_KEY)

- # Load lightweight Hugging Face image generation model
- image_gen = AutoPipelineForText2Image.from_pretrained(
-     "stabilityai/sdxl-turbo", use_auth_token=HF_API_KEY
- )
- image_gen.to("cuda" if torch.cuda.is_available() else "cpu")
-
- # Function to transcribe Tamil audio using Groq's Whisper
- def transcribe(audio):
-     if audio is None:
-         return "No audio provided"
-
-     sampling_rate, audio_data = audio  # Unpack tuple
-
-     # Save audio as a WAV file
-     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
-         sf.write(temp_audio.name, audio_data, sampling_rate)  # Convert NumPy array to WAV
-         temp_audio_path = temp_audio.name  # Get file path
-
-     # Open and send file to Groq API
-     with open(temp_audio_path, "rb") as file:
-         transcription = client.audio.transcriptions.create(
-             file=("audio.wav", file, "audio/wav"),
-             model="whisper-large-v3",
-             language="ta",
-             response_format="verbose_json"
          )

-     return transcription.text  # Fix: Use dot notation instead of ["text"]
-
- # Function to translate Tamil to English using Groq's Gemma
- def translate_text(tamil_text):
-     response = client.chat.completions.create(
-         model="gemma2-9b-it",
-         messages=[{"role": "user", "content": f"Translate this Tamil text to English: {tamil_text}"}]
-     )
-     return response.choices[0].message.content  # Fix extraction
-
- # Function to generate text using Groq's DeepSeek R1
- def generate_text(prompt):
-     response = client.chat.completions.create(
-         model="deepseek-r1-distill-llama-70b",
-         messages=[{"role": "user", "content": f"Write a short story about: {prompt}"}]
-     )
-     return response.choices[0].delta.content
-
- # Function to generate an image
- def generate_image(prompt):
-     img = image_gen(prompt=prompt).images[0]  # Generate image
-     return img
-
- # Gradio app
- def process(audio):
-     tamil_text = transcribe(audio)
-     english_text = translate_text(tamil_text)
-     story = generate_text(english_text)
      image = generate_image(english_text)
-     return tamil_text, english_text, story, image
-
- with gr.Blocks() as demo:
-     gr.Markdown("## Tamil Speech to Image & Story Generator")
-     audio_input = gr.Audio(label="Record your Tamil speech")
-     transcribed_text = gr.Textbox(label="Tamil Text Output")
-     translated_text = gr.Textbox(label="Translated English Text")
-     generated_text = gr.Textbox(label="Generated Story")
-     generated_image = gr.Image(label="Generated Image")
-
-     btn = gr.Button("Generate")
-     btn.click(process, inputs=[audio_input], outputs=[transcribed_text, translated_text, generated_text, generated_image])
-
- # Run Gradio app
- demo.launch()
  import os
+ import requests
+ import io
  import gradio as gr
+ from PIL import Image
  from groq import Groq

+ # Getting the Groq API key from the secret variable.
+ GROQ_API_KEY = os.getenv("groq_api")
+
+ # Initialize Groq API client
  client = Groq(api_key=GROQ_API_KEY)

+
+ # Function 1: Tamil Audio to Tamil Text (Transcription)
+ def transcribe_audio(audio_path):
+     if not audio_path:
+         return "Please upload an audio file."
+     try:
+         with open(audio_path, "rb") as file:
+             transcription = client.audio.transcriptions.create(
+                 file=(os.path.basename(audio_path), file.read()),
+                 model="whisper-large-v3",
+                 language="ta",  # Tamil
+                 response_format="verbose_json",
+             )
+         return transcription.text
+     except Exception as e:
+         return f"Error in transcription: {str(e)}"
+
+
+ # Function 2: Tamil Text to English Translation
+ def translate_tamil_to_english(tamil_text):
+     if not tamil_text:
+         return "Please enter Tamil text for translation."
+
+     prompt = f"""Translate the Tamil text below to English:\n
+     Tamil Text: {tamil_text}\n
+     Give only the translated part as the output without any extra words."""
+     try:
+         response = client.chat.completions.create(
+             model="gemma2-9b-it",
+             messages=[{"role": "user", "content": prompt}],
          )
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         return f"Error in translation: {str(e)}"
+
+
+ # Function 3: English Text to Image Generation
+ def generate_image(english_text):
+     if not english_text:
+         return "Please enter a description for image generation."
+     try:
+         payload = {"inputs": english_text}
+         response = requests.post("https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell", json=payload)
+         response.raise_for_status()
+         image_bytes = response.content
+         image = Image.open(io.BytesIO(image_bytes))
+         return image
+     except Exception as e:
+         return f"Error in image generation: {str(e)}"

+
+ # Function 4: English Text to Further Text Generation
+ def generate_text(english_text):
+     if not english_text:
+         return "Please enter a prompt."
+
+     try:
+         response = client.chat.completions.create(
+             model="deepseek-r1-distill-llama-70b",
+             messages=[{"role": "user", "content": english_text}],
+         )
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         return f"Error in text generation: {str(e)}"
+
+
+ # Combined Function to Process All Steps Sequentially
+ def process_audio(audio_path):
+     # Step 1: Tamil Audio → Tamil Text
+     tamil_text = transcribe_audio(audio_path)
+     if "Error" in tamil_text:
+         return tamil_text, None, None, None
+
+     # Step 2: Tamil Text → English Text
+     english_text = translate_tamil_to_english(tamil_text)
+     if "Error" in english_text:
+         return tamil_text, english_text, None, None
+
+     # Step 3: English Text → Image
      image = generate_image(english_text)
+     if "Error" in str(image):
+         return tamil_text, english_text, None, None
+
+     # Step 4: English Text → Generated Text
+     generated_text = generate_text(english_text)
+     return tamil_text, english_text, image, generated_text
+
+
+ # Create Gradio Interface
+ iface = gr.Interface(
+     fn=process_audio,
+     inputs=gr.Audio(type="filepath", label="Upload Tamil Audio"),
+     outputs=[
+         gr.Textbox(label="Transcribed Tamil Text"),
+         gr.Textbox(label="Translated English Text"),
+         gr.Image(label="Generated Image"),
+         gr.Textbox(label="Generated Text from English Prompt"),
+     ],
+     title="TransArt: A Multimodal Application for Vernacular Language Translation and Image Synthesis",
+     description="""Upload a Tamil audio file or record Tamil audio live and
+     get transcription, translation, image generation, and further text generation."""
+ )
+
+ # Launch the Gradio app
+ iface.launch()
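
Note on Function 3: the new generate_image posts to the serverless Inference API with no Authorization header, and this commit also drops the old HF_API_KEY lookup. Unauthenticated requests to that endpoint are heavily rate-limited and return HTTP 503 while the model is cold-loading. A minimal authenticated sketch with a cold-start retry, assuming the token is stored under the previous secret name HF_API_KEY:

import io
import os
import time

import requests
from PIL import Image

HF_API_KEY = os.getenv("HF_API_KEY")  # assumption: token kept under the old secret name
API_URL = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"

def generate_image_authed(english_text, retries=3):
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}
    for _ in range(retries):
        response = requests.post(API_URL, headers=headers, json={"inputs": english_text})
        if response.status_code == 503:  # model still loading on the serverless endpoint
            time.sleep(10)
            continue
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content))
    raise RuntimeError("image generation did not succeed within the retry budget")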
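
Note on Function 4: deepseek-r1-distill-llama-70b is a reasoning model, and its raw message.content typically opens with a <think>…</think> block, which generate_text passes straight into the output textbox. A small post-processing sketch (a hypothetical helper, not part of this commit) that strips the reasoning span before display:

import re

def strip_think(text):
    # Drop any <think>...</think> reasoning span, then trim surrounding whitespace.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

generate_text could then end with return strip_think(response.choices[0].message.content).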
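
Note on process_audio: each step signals failure by returning a string, and the pipeline detects it with a substring test on "Error", so a legitimate transcript or translation containing that word would be misread as a failure. A sketch of the more conventional Gradio pattern, assuming (a change to the committed code) that the step functions raise on failure instead of returning sentinel strings:

import gradio as gr

def process_audio(audio_path):
    try:
        tamil_text = transcribe_audio(audio_path)  # assumed to raise on failure
        english_text = translate_tamil_to_english(tamil_text)
        image = generate_image(english_text)
        generated_text = generate_text(english_text)
    except Exception as e:
        # gr.Error surfaces the message as an alert in the Gradio UI.
        raise gr.Error(f"Pipeline failed: {e}")
    return tamil_text, english_text, image, generated_text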