Kishorekumar7 committed on
Commit
8b5b601
·
verified ·
1 Parent(s): 899527c

Update app.py

Files changed (1)
  1. app.py +109 -77
app.py CHANGED
@@ -1,87 +1,119 @@
  import os
  import gradio as gr
- import torch
- import numpy as np
- import tempfile
- import soundfile as sf  # For saving NumPy array as WAV
  from groq import Groq
- from diffusers import AutoPipelineForText2Image

- # Load API keys
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
- HF_API_KEY = os.getenv("HF_API_KEY")

- # Initialize Groq client with API key
  client = Groq(api_key=GROQ_API_KEY)

- # Load lightweight Hugging Face image generation model
- image_gen = AutoPipelineForText2Image.from_pretrained(
-     "stabilityai/sdxl-turbo", use_auth_token=HF_API_KEY
- )
- image_gen.to("cuda" if torch.cuda.is_available() else "cpu")
-
- # Function to transcribe Tamil audio using Groq's Whisper
- def transcribe(audio):
-     if audio is None:
-         return "No audio provided"
-
-     sampling_rate, audio_data = audio  # Unpack tuple
-
-     # Save audio as a WAV file
-     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
-         sf.write(temp_audio.name, audio_data, sampling_rate)  # Convert NumPy array to WAV
-         temp_audio_path = temp_audio.name  # Get file path
-
-     # Open and send file to Groq API
-     with open(temp_audio_path, "rb") as file:
-         transcription = client.audio.transcriptions.create(
-             file=("audio.wav", file, "audio/wav"),
-             model="whisper-large-v3",
-             language="ta",
-             response_format="verbose_json"
          )

-     return transcription.text  # Fix: Use dot notation instead of ["text"]
-
- # Function to translate Tamil to English using Groq's Gemma
- def translate_text(tamil_text):
-     response = client.chat.completions.create(
-         model="gemma2-9b-it",
-         messages=[{"role": "user", "content": f"Translate this Tamil text to English: {tamil_text}"}]
-     )
-     return response.choices[0].message.content  # Fix extraction
-
- # Function to generate text using Groq's DeepSeek R1
- def generate_text(prompt):
-     response = client.chat.completions.create(
-         model="deepseek-r1-distill-llama-70b",
-         messages=[{"role": "user", "content": f"Write a short story about: {prompt}"}]
-     )
-     return response.choices[0].delta.content
-
- # Function to generate an image
- def generate_image(prompt):
-     img = image_gen(prompt=prompt).images[0]  # Generate image
-     return img
-
- # Gradio app
- def process(audio):
-     tamil_text = transcribe(audio)
-     english_text = translate_text(tamil_text)
-     story = generate_text(english_text)
      image = generate_image(english_text)
-     return tamil_text, english_text, story, image
-
- with gr.Blocks() as demo:
-     gr.Markdown("## Tamil Speech to Image & Story Generator")
-     audio_input = gr.Audio(label="Record your Tamil speech")
-     transcribed_text = gr.Textbox(label="Tamil Text Output")
-     translated_text = gr.Textbox(label="Translated English Text")
-     generated_text = gr.Textbox(label="Generated Story")
-     generated_image = gr.Image(label="Generated Image")
-
-     btn = gr.Button("Generate")
-     btn.click(process, inputs=[audio_input], outputs=[transcribed_text, translated_text, generated_text, generated_image])
-
- # Run Gradio app
- demo.launch()
  import os
+ import requests
+ import io
  import gradio as gr
+ from PIL import Image
  from groq import Groq

+ # Getting the Groq API key from the secret variable.
+ GROQ_API_KEY = os.getenv("groq_api")
+
+ # Initialize Groq API client
  client = Groq(api_key=GROQ_API_KEY)

+
+ # Function 1: Tamil Audio to Tamil Text (Transcription)
+ def transcribe_audio(audio_path):
+     if not audio_path:
+         return "Please upload an audio file."
+     try:
+         with open(audio_path, "rb") as file:
+             transcription = client.audio.transcriptions.create(
+                 file=(os.path.basename(audio_path), file.read()),
+                 model="whisper-large-v3",
+                 language="ta",  # Tamil
+                 response_format="verbose_json",
+             )
+         return transcription.text
+     except Exception as e:
+         return f"Error in transcription: {str(e)}"
+
+
+ # Function 2: Tamil Text to English Translation
+ def translate_tamil_to_english(tamil_text):
+     if not tamil_text:
+         return "Please enter Tamil text for translation."
+
+     prompt = f"""Translate the Tamil text below to English:\n
+     Tamil Text: {tamil_text}\n
+     Give only the translated part as the output without any extra words."""
+     try:
+         response = client.chat.completions.create(
+             model="gemma2-9b-it",
+             messages=[{"role": "user", "content": prompt}],
          )
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         return f"Error in translation: {str(e)}"
+
+
+ # Function 3: English Text to Image Generation
+ def generate_image(english_text):
+     if not english_text:
+         return "Please enter a description for image generation."
+     try:
+         payload = {"inputs": english_text}
+         response = requests.post("https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell", json=payload)
+         response.raise_for_status()
+         image_bytes = response.content
+         image = Image.open(io.BytesIO(image_bytes))
+         return image
+     except Exception as e:
+         return f"Error in image generation: {str(e)}"

+
+ # Function 4: English Text to Further Text Generation
+ def generate_text(english_text):
+     if not english_text:
+         return "Please enter a prompt."
+
+     try:
+         response = client.chat.completions.create(
+             model="deepseek-r1-distill-llama-70b",
+             messages=[{"role": "user", "content": english_text}],
+         )
+         return response.choices[0].message.content.strip()
+     except Exception as e:
+         return f"Error in text generation: {str(e)}"
+
+
+ # Combined Function to Process All Steps Sequentially
+ def process_audio(audio_path):
+     # Step 1: Tamil Audio → Tamil Text
+     tamil_text = transcribe_audio(audio_path)
+     if "Error" in tamil_text:
+         return tamil_text, None, None, None
+
+     # Step 2: Tamil Text → English Text
+     english_text = translate_tamil_to_english(tamil_text)
+     if "Error" in english_text:
+         return tamil_text, english_text, None, None
+
+     # Step 3: English Text → Image
      image = generate_image(english_text)
+     if "Error" in str(image):
+         return tamil_text, english_text, None, None
+
+     # Step 4: English Text → Generated Text
+     generated_text = generate_text(english_text)
+     return tamil_text, english_text, image, generated_text
+
+
+ # Create Gradio Interface
+ iface = gr.Interface(
+     fn=process_audio,
+     inputs=gr.Audio(type="filepath", label="Upload Tamil Audio"),
+     outputs=[
+         gr.Textbox(label="Transcribed Tamil Text"),
+         gr.Textbox(label="Translated English Text"),
+         gr.Image(label="Generated Image"),
+         gr.Textbox(label="Generated Text from English Prompt"),
+     ],
+     title="TransArt: A Multimodal Application for Vernacular Language Translation and Image Synthesis",
+     description="""Upload a Tamil audio file or record Tamil audio live and
+     get transcription, translation, image generation, and further text generation."""
+ )
+
+ # Launch the Gradio app
+ iface.launch()
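
Note on Function 3: the new generate_image posts to the serverless Inference API with no Authorization header, and this commit also drops the old HF_API_KEY lookup. Unauthenticated requests to that endpoint are heavily rate-limited and return HTTP 503 while the model is cold-loading. A minimal authenticated sketch with a cold-start retry, assuming the token is stored under the previous secret name HF_API_KEY:

import io
import os
import time

import requests
from PIL import Image

HF_API_KEY = os.getenv("HF_API_KEY")  # assumption: token kept under the old secret name
API_URL = "https://api-inference.huggingface.co/models/black-forest-labs/FLUX.1-schnell"

def generate_image_authed(english_text, retries=3):
    headers = {"Authorization": f"Bearer {HF_API_KEY}"}
    for _ in range(retries):
        response = requests.post(API_URL, headers=headers, json={"inputs": english_text})
        if response.status_code == 503:  # model still loading on the serverless endpoint
            time.sleep(10)
            continue
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content))
    raise RuntimeError("image generation did not succeed within the retry budget")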
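
Note on Function 4: deepseek-r1-distill-llama-70b is a reasoning model, and its raw message.content typically opens with a <think>…</think> block, which generate_text passes straight into the output textbox. A small post-processing sketch (a hypothetical helper, not part of this commit) that strips the reasoning span before display:

import re

def strip_think(text):
    # Drop any <think>...</think> reasoning span, then trim surrounding whitespace.
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()

generate_text could then end with return strip_think(response.choices[0].message.content).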
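
Note on process_audio: each step signals failure by returning a string, and the pipeline detects it with a substring test on "Error", so a legitimate transcript or translation containing that word would be misread as a failure. A sketch of the more conventional Gradio pattern, assuming (a change to the committed code) that the step functions raise on failure instead of returning sentinel strings:

import gradio as gr

def process_audio(audio_path):
    try:
        tamil_text = transcribe_audio(audio_path)  # assumed to raise on failure
        english_text = translate_tamil_to_english(tamil_text)
        image = generate_image(english_text)
        generated_text = generate_text(english_text)
    except Exception as e:
        # gr.Error surfaces the message as an alert in the Gradio UI.
        raise gr.Error(f"Pipeline failed: {e}")
    return tamil_text, english_text, image, generated_text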