Update app.py
Browse files
app.py
CHANGED
@@ -1,87 +1,119 @@
|
|
1 |
import os
|
|
|
|
|
2 |
import gradio as gr
|
3 |
-
import
|
4 |
-
import numpy as np
|
5 |
-
import tempfile
|
6 |
-
import soundfile as sf # For saving NumPy array as WAV
|
7 |
from groq import Groq
|
8 |
-
from diffusers import AutoPipelineForText2Image
|
9 |
|
10 |
-
# Load API keys
|
11 |
-
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
|
12 |
-
HF_API_KEY = os.getenv("HF_API_KEY")
|
13 |
|
14 |
-
#
|
|
|
|
|
|
|
15 |
client = Groq(api_key=GROQ_API_KEY)
|
16 |
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
42 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
#
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
#
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
72 |
image = generate_image(english_text)
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import os
|
2 |
+
import requests
|
3 |
+
import io
|
4 |
import gradio as gr
|
5 |
+
from PIL import Image
|
|
|
|
|
|
|
6 |
from groq import Groq
|
|
|
7 |
|
|
|
|
|
|
|
8 |
|
9 |
+
# The Groq API key is supplied through the "groq_api" environment variable
# (configured as a secret for the deployment).
GROQ_API_KEY = os.environ.get("groq_api")

# Single Groq client shared by the transcription, translation and text
# generation functions in this file.
client = Groq(api_key=GROQ_API_KEY)
|
14 |
|
15 |
+
|
16 |
+
# Function 1: Tamil Audio to Tamil Text (Transcription)
|
17 |
+
def transcribe_audio(audio_path):
    """Transcribe a Tamil audio file to text using the module-level Groq client.

    Args:
        audio_path: Path of the audio file on disk. A falsy value (``None``
            or ``""``) means no file was provided.

    Returns:
        The ``text`` attribute of the transcription response on success,
        the message "Please upload an audio file." when ``audio_path`` is
        falsy, or a string starting with "Error in transcription: " when
        reading the file or calling the API raises.
    """
    if not audio_path:
        return "Please upload an audio file."

    try:
        with open(audio_path, "rb") as audio_file:
            create_transcription = client.audio.transcriptions.create
            # The API receives the upload as a (filename, raw bytes) pair.
            named_audio = (os.path.basename(audio_path), audio_file.read())
            result = create_transcription(
                file=named_audio,
                model="whisper-large-v3",
                language="ta",  # Tamil
                response_format="verbose_json",
            )
        return result.text
    except Exception as exc:
        # Failures are reported as text so the UI can display them.
        return f"Error in transcription: {exc}"
|
31 |
+
|
32 |
+
|
33 |
+
# Function 2: Tamil Text to English Translation
|
34 |
+
def translate_tamil_to_english(tamil_text):
    """Translate Tamil text to English with a Groq chat completion.

    Args:
        tamil_text: The Tamil text to translate.

    Returns:
        The model's reply with surrounding whitespace stripped, the message
        "Please enter Tamil text for translation." when ``tamil_text`` is
        empty or contains only whitespace, or a string starting with
        "Error in translation: " when the request raises.
    """
    # Whitespace-only input has nothing to translate; treat it like empty
    # input instead of sending a blank request to the model.
    if not tamil_text or not tamil_text.strip():
        return "Please enter Tamil text for translation."

    # Assembled line by line with explicit newlines so that the indentation
    # of this source file never ends up inside the prompt.
    prompt = (
        "Translate the below Tamil text to English:\n\n"
        f"Tamil Text: {tamil_text}\n\n"
        "Give only the translated part as the output without any extra words."
    )
    try:
        response = client.chat.completions.create(
            model="gemma2-9b-it",
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Failures are reported as text so the UI can display them.
        return f"Error in translation: {e}"
|
49 |
+
|
50 |
+
|
51 |
+
# Function 3: English Text to Image Generation
|
52 |
+
def generate_image(english_text, timeout=120):
    """Generate an image from an English description via an HTTP inference API.

    Args:
        english_text: The text prompt describing the desired image.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block the caller
            indefinitely.

    Returns:
        A PIL image opened from the response body on success. On any
        failure a *string* is returned instead: the message
        "Please enter a description for image generation." for a falsy
        prompt, or a string starting with "Error in image generation: "
        when the request or image decoding raises. Callers must therefore
        check for ``str`` before treating the result as an image.
    """
    if not english_text:
        return "Please enter a description for image generation."
    try:
        response = requests.post(
            "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell",
            json={"inputs": english_text},
            timeout=timeout,
        )
        # Turn HTTP 4xx/5xx responses into exceptions so they are reported
        # below instead of being fed to the image decoder.
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content))
    except Exception as e:
        # Failures are reported as text so the UI can display them.
        return f"Error in image generation: {e}"
|
64 |
|
65 |
+
|
66 |
+
# Function 4: English Text to Further Text Generation
|
67 |
+
def generate_text(english_text):
    """Generate further text from an English prompt with a Groq chat completion.

    Args:
        english_text: The prompt, sent as a single user message.

    Returns:
        The model's reply with surrounding whitespace stripped, the message
        "Please enter a prompt." when ``english_text`` is falsy, or a string
        starting with "Error in text generation: " when the request raises.
    """
    if not english_text:
        return "Please enter a prompt."

    conversation = [{"role": "user", "content": english_text}]
    try:
        completion = client.chat.completions.create(
            model="deepseek-r1-distill-llama-70b",
            messages=conversation,
        )
        reply = completion.choices[0].message.content
        return reply.strip()
    except Exception as exc:
        # Failures are reported as text so the UI can display them.
        return f"Error in text generation: {exc}"
|
79 |
+
|
80 |
+
|
81 |
+
# Combined Function to Process All Steps Sequentially
|
82 |
+
def process_audio(audio_path):
    """Run the whole pipeline on one audio file.

    The steps are: Tamil audio -> Tamil text -> English text -> image, and
    English text -> further generated text. Processing stops at the first
    step that fails.

    Args:
        audio_path: Path of the uploaded or recorded audio file; falsy when
            nothing was provided.

    Returns:
        A 4-tuple ``(tamil_text, english_text, image, generated_text)``.
        Entries for steps that did not run are ``None``. When a step fails,
        its message is returned in that step's text slot; an image failure
        has no text slot of its own, so its message is returned in the last
        position.
    """
    # Without this guard the "please upload" placeholder would be handled
    # as if it were a transcript and sent on for translation.
    if not audio_path:
        return "Please upload an audio file.", None, None, None

    # Step 1: Tamil audio -> Tamil text
    tamil_text = transcribe_audio(audio_path)
    # The step functions report failures as strings beginning with
    # "Error in "; matching the prefix avoids misreading genuine text that
    # merely contains the word "Error".
    if tamil_text.startswith("Error in "):
        return tamil_text, None, None, None

    # Step 2: Tamil text -> English text
    english_text = translate_tamil_to_english(tamil_text)
    if english_text.startswith("Error in "):
        return tamil_text, english_text, None, None

    # Step 3: English text -> image
    image = generate_image(english_text)
    if isinstance(image, str):
        # Any string here is a message, not an image. It cannot be shown in
        # the image output, so surface it in the final text slot.
        return tamil_text, english_text, None, image

    # Step 4: English text -> generated text
    generated_text = generate_text(english_text)
    return tamil_text, english_text, image, generated_text
|
101 |
+
|
102 |
+
|
103 |
+
# Create Gradio Interface
|
104 |
+
# Input: a single audio component that hands process_audio a file path.
audio_input = gr.Audio(type="filepath", label="Upload Tamil Audio")

# Outputs, in the same order as the tuple returned by process_audio.
pipeline_outputs = [
    gr.Textbox(label="Transcribed Tamil Text"),
    gr.Textbox(label="Translated English Text"),
    gr.Image(label="Generated Image"),
    gr.Textbox(label="Generated Text from English Prompt"),
]

# Assemble the Gradio interface around the combined pipeline function.
iface = gr.Interface(
    fn=process_audio,
    inputs=audio_input,
    outputs=pipeline_outputs,
    title="TransArt: A Multimodal Application for Vernacular Language Translation and Image Synthesis",
    description="""Upload a Tamil audio file or live voice record Tamil audio and
get transcription, translation, image generation, and further text generation.""",
)

# Start the Gradio app.
iface.launch()
|