Update app.py
app.py CHANGED (old version shown first, removed lines prefixed with "-"; the new version follows, added lines prefixed with "+")
@@ -1,239 +1,102 @@
-# Import spaces first to avoid CUDA initialization issues
-import spaces
-
-# Then import other libraries
 import torch
 import librosa
-from transformers import
 from gtts import gTTS
 import gradio as gr
 from PIL import Image
 import os
-import base64
-from io import BytesIO
-
-import io
-import subprocess
 from langdetect import detect
-
-print("Using GPU for operations when available")

 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

-
-@spaces.GPU
-def load_pipeline(model_name, **kwargs):
-    try:
-        device = 0 if torch.cuda.is_available() else "cpu"
-        return pipeline(model=model_name, device=device, **kwargs)
-    except Exception as e:
-        print(f"Error loading {model_name} pipeline: {e}")
-        return None
-
-# Load Whisper model for speech recognition within a GPU-decorated function
-@spaces.GPU
-def load_whisper():
-    try:
-        device = 0 if torch.cuda.is_available() else "cpu"
-        processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
-        return processor, model
-    except Exception as e:
-        print(f"Error loading Whisper model: {e}")
-        return None, None
-
-# Load sarvam-2b for text generation within a GPU-decorated function
-@spaces.GPU
-def load_sarvam():
-    return load_pipeline('sarvamai/sarvam-2b-v0.5')
-
-# Load Phi-3.5-vision-instruct model
-@spaces.GPU
-def load_vision_model():
-    try:
-        model_id = "microsoft/Phi-3.5-vision-instruct"
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, torch_dtype=torch.float16, use_flash_attention_2=False
-        )
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-        return model, processor
-    except Exception as e:
-        print(f"Error loading vision model: {e}")
-        return None, None


-#
-
-
-

-#
-
-def load_vision_model():
-    try:
-        print("Starting to load vision model...")
-        model_id = "microsoft/Phi-3.5-vision-instruct"
-        print(f"Loading model from {model_id}")
-
-        # Check for CUDA availability
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        print(f"Using device: {device}")
-
-        # Load model with potential memory optimization
-        model = AutoModelForCausalLM.from_pretrained(
-            model_id,
-            trust_remote_code=True,
-            torch_dtype=torch.float16,
-            use_flash_attention_2=True,  # Enable if supported
-            device_map="auto",  # Automatically manage model placement
-            low_cpu_mem_usage=True
-        )
-        print("Model loaded successfully")
-
-        print("Loading processor...")
-        processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-        print("Processor loaded successfully")
-
-        return model, processor
-    except ImportError as e:
-        print(f"Error importing required modules: {str(e)}")
-        print("Please ensure all required dependencies are installed.")
-    except RuntimeError as e:
-        print(f"Runtime error (possibly CUDA out of memory): {str(e)}")
-        print("Consider using a smaller model or enabling GPU offloading.")
-    except Exception as e:
-        print(f"Unexpected error in loading vision model: {str(e)}")
-
-    return None, None


-# Process audio input within a GPU-decorated function
 @spaces.GPU
-def process_audio_input(audio, whisper_processor, whisper_model):
-    if whisper_processor is None or whisper_model is None:
-        return "Error: Speech recognition model is not available. Please type your message instead."
-
     try:
         audio, sr = librosa.load(audio, sr=16000)
-        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         return transcription
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."

-# Updated process_image_input function
-@spaces.GPU
 @spaces.GPU
-def process_image_input(image, text_prompt, vision_model, processor):
-    if vision_model is None or processor is None:
-        return "Error: Vision model is not available."
-
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-        # Process the formatted prompt
-        inputs = processor(text=formatted_prompt, return_tensors="pt").to(vision_model.device)
-
-        # Generate text
-        with torch.no_grad():
-            outputs = vision_model.generate(
-                **inputs,
-                max_new_tokens=100,
-                do_sample=True,
-                top_k=50,
-                top_p=0.95,
-                num_return_sequences=1
-            )
-
-        generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-        return generated_text
     except Exception as e:
         return f"Error processing image: {str(e)}"

-
-@spaces.GPU
-def generate_response(transcription, sarvam_pipe):
-    if sarvam_pipe is None:
-        return "Error: Text generation model is not available."
-
     try:
-        # Generate response using the sarvam-2b model
         response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
         return response
     except Exception as e:
         return f"Error generating response: {str(e)}"

-# Text-to-speech function
 def text_to_speech(text, lang='hi'):
     try:
-
-        if lang in ['hi', 'bn', 'gu', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']:
-            # You might want to use a different TTS library here
-            # For example, you could use the Google Cloud Text-to-Speech API
-            # or a specialized Indic language TTS library
-
-            # This is a placeholder for a better Indic TTS solution
-            tts = gTTS(text=text, lang=lang, tld='co.in')  # Use Indian TLD
-        else:
-            tts = gTTS(text=text, lang=lang)
-
         tts.save("response.mp3")
         return "response.mp3"
     except Exception as e:
         print(f"Error in text-to-speech: {str(e)}")
         return None

-# Improved language detection function
-def detect_language(text):
-    lang_codes = {
-        'bn': 'Bengali', 'gu': 'Gujarati', 'hi': 'Hindi', 'kn': 'Kannada',
-        'ml': 'Malayalam', 'mr': 'Marathi', 'or': 'Oriya', 'pa': 'Punjabi',
-        'ta': 'Tamil', 'te': 'Telugu', 'en': 'English'
-    }
-
-    try:
-        detected_lang = detect(text)
-        return detected_lang if detected_lang in lang_codes else 'en'
-    except:
-        # Fallback to simple script-based detection
-        for code, lang in lang_codes.items():
-            if any(ord(char) >= 0x0900 and ord(char) <= 0x097F for char in text):  # Devanagari script
-                return 'hi'
-        return 'en'  # Default to English if no Indic script is detected
-
 @spaces.GPU
 def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
-        whisper_processor, whisper_model = load_whisper()
-        sarvam_pipe = load_sarvam()
-        vision_model, processor = load_vision_model()
-
         if input_type == "audio" and audio_input is not None:
-            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
-            # Use a default prompt if no text input is provided
             text_prompt = text_input if text_input else "Describe this image in detail."
-            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

-        response = generate_response(transcription, sarvam_pipe)
-        lang = detect_language(response)
         audio_response = text_to_speech(response, lang)

         return transcription, response, audio_response
@@ -241,7 +104,6 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

-
 # Custom CSS
 custom_css = """
 body {
@@ -266,33 +128,7 @@ body {
 #custom-header h1 .pink {
     color: #f472b6;
 }
-#custom-header h2 {
-def indic_vision_assistant(input_type, audio_input, text_input, image_input):
-    try:
-        whisper_processor, whisper_model = load_whisper()
-        sarvam_pipe = load_sarvam()
-        vision_model, processor = load_vision_model()
-
-        if input_type == "audio" and audio_input is not None:
-            transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
-        elif input_type == "text" and text_input:
-            transcription = text_input
-        elif input_type == "image" and image_input is not None:
-            # Use a default prompt if no text input is provided
-            text_prompt = text_input if text_input else "Describe this image in detail."
-            transcription = process_image_input(image_input, text_prompt, vision_model, processor)
-        else:
-            return "Please provide either audio, text, or image input.", "No input provided.", None
-
-        response = generate_response(transcription, sarvam_pipe)
-        lang = detect_language(response)
-        audio_response = text_to_speech(response, lang)
-
-        return transcription, response, audio_response
-    except Exception as e:
-        error_message = f"An error occurred: {str(e)}"
-        return error_message, error_message, None
-
     font-size: 1.5rem;
     color: #94a3b8;
 }
@@ -371,7 +207,8 @@ custom_suggestions = """
 </div>
 </div>
 """
-
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
@@ -405,5 +242,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     outputs=[output_transcription, output_response, output_audio]
 )
 gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
 # Launch the app
 iface.launch()

app.py (new version):

 import torch
 import librosa
+from transformers import AutoModelForCausalLM, AutoProcessor, pipeline, WhisperProcessor, WhisperForConditionalGeneration
 from gtts import gTTS
 import gradio as gr
+import spaces
 from PIL import Image
 import os
 from langdetect import detect
+import subprocess

 # Install flash-attn
 subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

+print("Loading models...")

+# Vision model
+vision_model_id = "microsoft/Phi-3.5-vision-instruct"
+vision_model = AutoModelForCausalLM.from_pretrained(
+    vision_model_id,
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    use_flash_attention_2=False
+)
+vision_processor = AutoProcessor.from_pretrained(vision_model_id, trust_remote_code=True, num_crops=16)

+# Whisper model
+whisper_model_id = "openai/whisper-small"
+whisper_processor = WhisperProcessor.from_pretrained(whisper_model_id)
+whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_id)

+# Sarvam model
+sarvam_pipe = pipeline('sarvamai/sarvam-2b-v0.5')

+print("All models loaded successfully")

 @spaces.GPU
+def process_audio_input(audio):
     try:
+        whisper_model.to('cuda')
         audio, sr = librosa.load(audio, sr=16000)
+        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to('cuda')
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        whisper_model.to('cpu')
         return transcription
     except Exception as e:
         return f"Error processing audio: {str(e)}. Please type your message instead."

 @spaces.GPU
+def process_image_input(image, text_prompt):
     try:
+        vision_model.to('cuda')
+        messages = [
+            {"role": "user", "content": f"{text_prompt}\n<|image_1|>"},
+        ]
+        prompt = vision_processor.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda")
+        generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        vision_model.to('cpu')
+        return response
     except Exception as e:
         return f"Error processing image: {str(e)}"

+def generate_response(transcription):
     try:
         response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
         return response
     except Exception as e:
         return f"Error generating response: {str(e)}"

 def text_to_speech(text, lang='hi'):
     try:
+        tts = gTTS(text=text, lang=lang, tld='co.in')
         tts.save("response.mp3")
         return "response.mp3"
     except Exception as e:
         print(f"Error in text-to-speech: {str(e)}")
         return None

 @spaces.GPU
 def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
         if input_type == "audio" and audio_input is not None:
+            transcription = process_audio_input(audio_input)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
             text_prompt = text_input if text_input else "Describe this image in detail."
+            transcription = process_image_input(image_input, text_prompt)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None

+        response = generate_response(transcription)
+        lang = detect(response)
         audio_response = text_to_speech(response, lang)

         return transcription, response, audio_response
     except Exception as e:
         error_message = f"An error occurred: {str(e)}"
         return error_message, error_message, None

 # Custom CSS
 custom_css = """
 body {
[...]
 #custom-header h1 .pink {
     color: #f472b6;
 }
+#custom-header h2 {
     font-size: 1.5rem;
     color: #94a3b8;
 }
[...]
 </div>
 </div>
 """
+
+# Gradio interface
 with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
     body_background_fill="#0b0f19",
     body_text_color="#e2e8f0",
[...]
     outputs=[output_transcription, output_response, output_audio]
 )
 gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
+
 # Launch the app
 iface.launch()
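
Note on the detect() → gTTS handoff in the new version: langdetect can return codes that gTTS does not accept, and it can raise on empty or ambiguous text, which is what the removed detect_language() helper guarded against. A minimal sketch of such a guard is shown below; the safe_detect name and the SUPPORTED_TTS_LANGS set are illustrative assumptions, not code from this repo (the authoritative list can be read from gtts.lang.tts_langs()).

from langdetect import detect

# Assumed subset of language codes gTTS accepts; verify against gtts.lang.tts_langs().
SUPPORTED_TTS_LANGS = {'hi', 'bn', 'gu', 'kn', 'ml', 'mr', 'ta', 'te', 'en'}

def safe_detect(text, default='en'):
    # Fall back to `default` when detection fails or the detected code
    # is outside the assumed gTTS-supported set.
    try:
        code = detect(text)
    except Exception:
        return default
    return code if code in SUPPORTED_TTS_LANGS else default

# Usage with the functions above:
# lang = safe_detect(response)
# audio_response = text_to_speech(response, lang)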