sagar007 committed
Commit f073c65 · verified · 1 Parent(s): 7dbf49f

Update app.py

Files changed (1):
  1. app.py +48 -210
app.py CHANGED
@@ -1,239 +1,102 @@
- # Import spaces first to avoid CUDA initialization issues
- import spaces
-
- # Then import other libraries
  import torch
  import librosa
- from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoProcessor
+ from transformers import AutoModelForCausalLM, AutoProcessor, pipeline, WhisperProcessor, WhisperForConditionalGeneration
  from gtts import gTTS
  import gradio as gr
+ import spaces
  from PIL import Image
  import os
- import base64
- from io import BytesIO
-
- import io
- import subprocess
  from langdetect import detect
-
- print("Using GPU for operations when available")
+ import subprocess

  # Install flash-attn
  subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

- # Function to safely load pipeline within a GPU-decorated function
- @spaces.GPU
- def load_pipeline(model_name, **kwargs):
-     try:
-         device = 0 if torch.cuda.is_available() else "cpu"
-         return pipeline(model=model_name, device=device, **kwargs)
-     except Exception as e:
-         print(f"Error loading {model_name} pipeline: {e}")
-         return None
-
- # Load Whisper model for speech recognition within a GPU-decorated function
- @spaces.GPU
- def load_whisper():
-     try:
-         device = 0 if torch.cuda.is_available() else "cpu"
-         processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
-         return processor, model
-     except Exception as e:
-         print(f"Error loading Whisper model: {e}")
-         return None, None
-
- # Load sarvam-2b for text generation within a GPU-decorated function
- @spaces.GPU
- def load_sarvam():
-     return load_pipeline('sarvamai/sarvam-2b-v0.5')
-
- # Load Phi-3.5-vision-instruct model
- @spaces.GPU
- def load_vision_model():
-     try:
-         model_id = "microsoft/Phi-3.5-vision-instruct"
-         model = AutoModelForCausalLM.from_pretrained(
-             model_id, trust_remote_code=True, torch_dtype=torch.float16, use_flash_attention_2=False
-         )
-         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-         return model, processor
-     except Exception as e:
-         print(f"Error loading vision model: {e}")
-         return None, None
-
- # Load sarvam-2b for text generation within a GPU-decorated function
- @spaces.GPU
- def load_sarvam():
-     return load_pipeline('sarvamai/sarvam-2b-v0.5')
-
- # Load Phi-3.5-vision-instruct model
- @spaces.GPU
- def load_vision_model():
-     try:
-         print("Starting to load vision model...")
-         model_id = "microsoft/Phi-3.5-vision-instruct"
-         print(f"Loading model from {model_id}")
-
-         # Check for CUDA availability
-         device = "cuda" if torch.cuda.is_available() else "cpu"
-         print(f"Using device: {device}")
-
-         # Load model with potential memory optimization
-         model = AutoModelForCausalLM.from_pretrained(
-             model_id,
-             trust_remote_code=True,
-             torch_dtype=torch.float16,
-             use_flash_attention_2=True,  # Enable if supported
-             device_map="auto",  # Automatically manage model placement
-             low_cpu_mem_usage=True
-         )
-         print("Model loaded successfully")
-
-         print("Loading processor...")
-         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
-         print("Processor loaded successfully")
-
-         return model, processor
-     except ImportError as e:
-         print(f"Error importing required modules: {str(e)}")
-         print("Please ensure all required dependencies are installed.")
-     except RuntimeError as e:
-         print(f"Runtime error (possibly CUDA out of memory): {str(e)}")
-         print("Consider using a smaller model or enabling GPU offloading.")
-     except Exception as e:
-         print(f"Unexpected error in loading vision model: {str(e)}")
-
-     return None, None
+ print("Loading models...")
+
+ # Vision model
+ vision_model_id = "microsoft/Phi-3.5-vision-instruct"
+ vision_model = AutoModelForCausalLM.from_pretrained(
+     vision_model_id,
+     trust_remote_code=True,
+     torch_dtype=torch.float16,
+     use_flash_attention_2=False
+ )
+ vision_processor = AutoProcessor.from_pretrained(vision_model_id, trust_remote_code=True, num_crops=16)
+
+ # Whisper model
+ whisper_model_id = "openai/whisper-small"
+ whisper_processor = WhisperProcessor.from_pretrained(whisper_model_id)
+ whisper_model = WhisperForConditionalGeneration.from_pretrained(whisper_model_id)
+
+ # Sarvam model
+ sarvam_pipe = pipeline('sarvamai/sarvam-2b-v0.5')
+
+ print("All models loaded successfully")

- # Process audio input within a GPU-decorated function
  @spaces.GPU
- def process_audio_input(audio, whisper_processor, whisper_model):
-     if whisper_processor is None or whisper_model is None:
-         return "Error: Speech recognition model is not available. Please type your message instead."
-
+ def process_audio_input(audio):
      try:
+         whisper_model.to('cuda')
          audio, sr = librosa.load(audio, sr=16000)
-         input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(whisper_model.device)
+         input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to('cuda')
          predicted_ids = whisper_model.generate(input_features)
          transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+         whisper_model.to('cpu')
          return transcription
      except Exception as e:
          return f"Error processing audio: {str(e)}. Please type your message instead."

- # Updated process_image_input function
- @spaces.GPU
  @spaces.GPU
- def process_image_input(image, text_prompt, vision_model, processor):
-     if vision_model is None or processor is None:
-         return "Error: Vision model is not available."
-
+ def process_image_input(image, text_prompt):
      try:
-         # Convert image to base64
-         if isinstance(image, Image.Image):
-             buffered = BytesIO()
-             image.save(buffered, format="PNG")
-             img_str = base64.b64encode(buffered.getvalue()).decode()
-         else:
-             # If it's not a PIL Image, assume it's a file path
-             with open(image, "rb") as image_file:
-                 img_str = base64.b64encode(image_file.read()).decode()
-
-         # Format the input with image tag
-         formatted_prompt = f"{text_prompt}\n<image>data:image/png;base64,{img_str}</image>"
-
-         # Process the formatted prompt
-         inputs = processor(text=formatted_prompt, return_tensors="pt").to(vision_model.device)
-
-         # Generate text
-         with torch.no_grad():
-             outputs = vision_model.generate(
-                 **inputs,
-                 max_new_tokens=100,
-                 do_sample=True,
-                 top_k=50,
-                 top_p=0.95,
-                 num_return_sequences=1
-             )
-
-         generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-         return generated_text
+         vision_model.to('cuda')
+         messages = [
+             {"role": "user", "content": f"{text_prompt}\n<|image_1|>"},
+         ]
+         prompt = vision_processor.tokenizer.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda")
+         generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
+         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+         vision_model.to('cpu')
+         return response
      except Exception as e:
          return f"Error processing image: {str(e)}"

- # Generate response within a GPU-decorated function
- @spaces.GPU
- def generate_response(transcription, sarvam_pipe):
-     if sarvam_pipe is None:
-         return "Error: Text generation model is not available."
-
+ def generate_response(transcription):
      try:
-         # Generate response using the sarvam-2b model
          response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
          return response
      except Exception as e:
          return f"Error generating response: {str(e)}"

- # Text-to-speech function
  def text_to_speech(text, lang='hi'):
      try:
-         # Use a better TTS engine for Indic languages
-         if lang in ['hi', 'bn', 'gu', 'kn', 'ml', 'mr', 'or', 'pa', 'ta', 'te']:
-             # You might want to use a different TTS library here
-             # For example, you could use the Google Cloud Text-to-Speech API
-             # or a specialized Indic language TTS library
-
-             # This is a placeholder for a better Indic TTS solution
-             tts = gTTS(text=text, lang=lang, tld='co.in')  # Use Indian TLD
-         else:
-             tts = gTTS(text=text, lang=lang)
-
+         tts = gTTS(text=text, lang=lang, tld='co.in')
          tts.save("response.mp3")
          return "response.mp3"
      except Exception as e:
          print(f"Error in text-to-speech: {str(e)}")
          return None

- # Improved language detection function
- def detect_language(text):
-     lang_codes = {
-         'bn': 'Bengali', 'gu': 'Gujarati', 'hi': 'Hindi', 'kn': 'Kannada',
-         'ml': 'Malayalam', 'mr': 'Marathi', 'or': 'Oriya', 'pa': 'Punjabi',
-         'ta': 'Tamil', 'te': 'Telugu', 'en': 'English'
-     }
-
-     try:
-         detected_lang = detect(text)
-         return detected_lang if detected_lang in lang_codes else 'en'
-     except:
-         # Fallback to simple script-based detection
-         for code, lang in lang_codes.items():
-             if any(ord(char) >= 0x0900 and ord(char) <= 0x097F for char in text):  # Devanagari script
-                 return 'hi'
-         return 'en'  # Default to English if no Indic script is detected
-
  @spaces.GPU
  def indic_vision_assistant(input_type, audio_input, text_input, image_input):
      try:
-         whisper_processor, whisper_model = load_whisper()
-         sarvam_pipe = load_sarvam()
-         vision_model, processor = load_vision_model()
-
          if input_type == "audio" and audio_input is not None:
-             transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
+             transcription = process_audio_input(audio_input)
          elif input_type == "text" and text_input:
              transcription = text_input
          elif input_type == "image" and image_input is not None:
-             # Use a default prompt if no text input is provided
              text_prompt = text_input if text_input else "Describe this image in detail."
-             transcription = process_image_input(image_input, text_prompt, vision_model, processor)
+             transcription = process_image_input(image_input, text_prompt)
          else:
              return "Please provide either audio, text, or image input.", "No input provided.", None

-         response = generate_response(transcription, sarvam_pipe)
-         lang = detect_language(response)
+         response = generate_response(transcription)
+         lang = detect(response)
          audio_response = text_to_speech(response, lang)

          return transcription, response, audio_response
@@ -241,7 +104,6 @@ def indic_vision_assistant(input_type, audio_input, text_input, image_input):
          error_message = f"An error occurred: {str(e)}"
          return error_message, error_message, None

-
  # Custom CSS
  custom_css = """
  body {
@@ -266,33 +128,7 @@ body {
  #custom-header h1 .pink {
      color: #f472b6;
  }
- #custom-header h2 {@spaces.GPU
- def indic_vision_assistant(input_type, audio_input, text_input, image_input):
-     try:
-         whisper_processor, whisper_model = load_whisper()
-         sarvam_pipe = load_sarvam()
-         vision_model, processor = load_vision_model()
-
-         if input_type == "audio" and audio_input is not None:
-             transcription = process_audio_input(audio_input, whisper_processor, whisper_model)
-         elif input_type == "text" and text_input:
-             transcription = text_input
-         elif input_type == "image" and image_input is not None:
-             # Use a default prompt if no text input is provided
-             text_prompt = text_input if text_input else "Describe this image in detail."
-             transcription = process_image_input(image_input, text_prompt, vision_model, processor)
-         else:
-             return "Please provide either audio, text, or image input.", "No input provided.", None
-
-         response = generate_response(transcription, sarvam_pipe)
-         lang = detect_language(response)
-         audio_response = text_to_speech(response, lang)
-
-         return transcription, response, audio_response
-     except Exception as e:
-         error_message = f"An error occurred: {str(e)}"
-         return error_message, error_message, None
-
+ #custom-header h2 {
      font-size: 1.5rem;
      color: #94a3b8;
  }
@@ -371,7 +207,8 @@ custom_suggestions = """
      </div>
  </div>
  """
- # Update the Gradio interface to allow text input for image processing
+
+ # Gradio interface
  with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
      body_background_fill="#0b0f19",
      body_text_color="#e2e8f0",
@@ -405,5 +242,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
          outputs=[output_transcription, output_response, output_audio]
      )
      gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
+
  # Launch the app
  iface.launch()
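
The core of this commit: the old file wrapped every model loader in its own @spaces.GPU function (and defined load_sarvam and load_vision_model twice), while the new file loads all three models once at import time, on CPU, and reserves @spaces.GPU for the request handlers that actually need a device. On ZeroGPU Spaces, @spaces.GPU attaches a GPU only for the duration of the decorated call, so the usual shape is roughly the following (a minimal sketch of the pattern, not additional app code; the model id and names are stand-ins):

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Import time: runs on CPU, before any GPU has been attached.
tokenizer = AutoTokenizer.from_pretrained("gpt2")   # stand-in model id
model = AutoModelForCausalLM.from_pretrained("gpt2")

@spaces.GPU  # a GPU is attached only while this function runs
def handle_request(prompt: str) -> str:
    model.to('cuda')  # move weights onto the just-attached device
    inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
    with torch.no_grad():
        ids = model.generate(**inputs, max_new_tokens=50)
    model.to('cpu')   # hand the device back clean
    return tokenizer.decode(ids[0], skip_special_tokens=True)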
 
 
 
 
 
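One caveat in the new startup block: transformers.pipeline() takes the task name as its first positional argument, so pipeline('sarvamai/sarvam-2b-v0.5') will be parsed as an unknown task rather than a model id. The removed load_pipeline helper had this right by passing model= and letting the task be inferred from the model's config. A minimal corrected sketch ('text-generation' is an assumption based on how sarvam_pipe is used later in the file):

from transformers import pipeline

# Keyword form: the task is inferred from the model's config.
sarvam_pipe = pipeline(model='sarvamai/sarvam-2b-v0.5')

# Or name the task explicitly (assumed here to be text generation).
sarvam_pipe = pipeline('text-generation', model='sarvamai/sarvam-2b-v0.5')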
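The updated process_audio_input moves Whisper to the GPU at the top of the call and back to the CPU at the end, but the .to('cpu') line is skipped whenever generation raises, leaving the weights stranded on the device. A try/finally variant of the same function (a sketch, assuming the module-level whisper_model and whisper_processor from the diff) keeps the placement consistent either way:

import librosa
import spaces

@spaces.GPU
def process_audio_input(audio):
    try:
        whisper_model.to('cuda')
        audio_array, sr = librosa.load(audio, sr=16000)
        input_features = whisper_processor(
            audio_array, sampling_rate=sr, return_tensors="pt"
        ).input_features.to('cuda')
        predicted_ids = whisper_model.generate(input_features)
        return whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    except Exception as e:
        return f"Error processing audio: {str(e)}. Please type your message instead."
    finally:
        whisper_model.to('cpu')  # runs on success and on error alike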
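In process_image_input, the processor is called with a single image object. The Phi-3.5-vision processor also accepts a list of images, with <|image_1|>, <|image_2|>, … placeholders indexing into that list (the pattern shown in the model card's usage examples), which generalizes if the app ever grows multi-image prompts. A hedged sketch reusing the diff's vision_model and vision_processor; the file names are hypothetical:

from PIL import Image

img1 = Image.open("first.png")    # hypothetical input files
img2 = Image.open("second.png")

messages = [{"role": "user", "content": "Compare these images.\n<|image_1|>\n<|image_2|>"}]
prompt = vision_processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# The images list lines up with the numbered <|image_N|> placeholders.
inputs = vision_processor(prompt, [img1, img2], return_tensors="pt").to("cuda")
generate_ids = vision_model.generate(**inputs, max_new_tokens=200, do_sample=False)
generate_ids = generate_ids[:, inputs["input_ids"].shape[1]:]  # strip the prompt tokens
print(vision_processor.batch_decode(generate_ids, skip_special_tokens=True)[0])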
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  @spaces.GPU
86
  def indic_vision_assistant(input_type, audio_input, text_input, image_input):
87
  try:
 
 
 
 
88
  if input_type == "audio" and audio_input is not None:
89
+ transcription = process_audio_input(audio_input)
90
  elif input_type == "text" and text_input:
91
  transcription = text_input
92
  elif input_type == "image" and image_input is not None:
 
93
  text_prompt = text_input if text_input else "Describe this image in detail."
94
+ transcription = process_image_input(image_input, text_prompt)
95
  else:
96
  return "Please provide either audio, text, or image input.", "No input provided.", None
97
 
98
+ response = generate_response(transcription)
99
+ lang = detect(response)
100
  audio_response = text_to_speech(response, lang)
101
 
102
  return transcription, response, audio_response
 
104
  error_message = f"An error occurred: {str(e)}"
105
  return error_message, error_message, None
106
 
 
107
  # Custom CSS
108
  custom_css = """
109
  body {
 
128
  #custom-header h1 .pink {
129
  color: #f472b6;
130
  }
131
+ #custom-header h2 {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  font-size: 1.5rem;
133
  color: #94a3b8;
134
  }
 
207
  </div>
208
  </div>
209
  """
210
+
211
+ # Gradio interface
212
  with gr.Blocks(css=custom_css, theme=gr.themes.Base().set(
213
  body_background_fill="#0b0f19",
214
  body_text_color="#e2e8f0",
 
242
  outputs=[output_transcription, output_response, output_audio]
243
  )
244
  gr.HTML("<footer>Powered by Indic Language AI with Vision Capabilities</footer>")
245
+
246
  # Launch the app
247
  iface.launch()
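Finally, the diff shows the Gradio layer only in fragments: the gr.Blocks(css=custom_css, theme=...) context manager, an event binding whose outputs are [output_transcription, output_response, output_audio], and the closing iface.launch(). A minimal sketch of how those fragments plausibly fit together; the input components and labels are assumptions, while the function name and the three output names come from the diff:

import gradio as gr

with gr.Blocks(css=custom_css) as iface:
    # Inputs mirror the indic_vision_assistant(input_type, audio_input, text_input, image_input) signature.
    input_type = gr.Radio(["audio", "text", "image"], value="text", label="Input type")
    audio_input = gr.Audio(type="filepath", label="Speak")
    text_input = gr.Textbox(label="Type a message or an image prompt")
    image_input = gr.Image(type="pil", label="Upload an image")
    submit = gr.Button("Submit")

    # These three output names appear verbatim in the diff.
    output_transcription = gr.Textbox(label="Transcription")
    output_response = gr.Textbox(label="Response")
    output_audio = gr.Audio(label="Spoken response")

    submit.click(
        fn=indic_vision_assistant,
        inputs=[input_type, audio_input, text_input, image_input],
        outputs=[output_transcription, output_response, output_audio],
    )

# Launch the app
iface.launch()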