sagar007 committed
Commit 62a592a · verified · 1 Parent(s): 0f965de

Update app.py

Files changed (1):
  1. app.py (+31, -58)
app.py CHANGED
@@ -1,51 +1,33 @@
-# Import spaces first to avoid CUDA initialization issues
 import spaces
+import gradio as gr
-
-# Then import other libraries
 import torch
+import os
-import librosa
 from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoProcessor
 from gtts import gTTS
-import gradio as gr
-from PIL import Image
-import os
 from langdetect import detect
-import subprocess
 
-print("Using GPU for operations when available")
+# Disable CUDA initialization at import
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+torch.set_grad_enabled(False)
 
-# Install flash-attn
-subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
-
-# Function to safely load pipeline within a GPU-decorated function
-@spaces.GPU
-def load_pipeline(model_name, **kwargs):
-    try:
-        device = 0 if torch.cuda.is_available() else "cpu"
-        return pipeline(model=model_name, device=device, **kwargs)
-    except Exception as e:
-        print(f"Error loading {model_name} pipeline: {e}")
-        return None
-
-# Load Whisper model for speech recognition within a GPU-decorated function
+print("CUDA initialization disabled at import")
+
 @spaces.GPU
 def load_whisper():
     try:
-        device = 0 if torch.cuda.is_available() else "cpu"
         processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
         return processor, model
     except Exception as e:
         print(f"Error loading Whisper model: {e}")
         return None, None
 
-# Load vision model within a GPU-decorated function
 @spaces.GPU
 def load_vision_model():
     try:
         model_id = "microsoft/Phi-3.5-vision-instruct"
         model = AutoModelForCausalLM.from_pretrained(
-            model_id, trust_remote_code=True, torch_dtype=torch.float16, use_flash_attention_2=False
+            model_id, trust_remote_code=True, torch_dtype=torch.float16
         )
         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=16)
         return model, processor
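Note: the new version keeps import-time code CPU-only and defers all CUDA work to @spaces.GPU-decorated functions, the pattern ZeroGPU Spaces expect. A minimal, self-contained sketch of that pattern (a hypothetical standalone example, not part of this commit):

import spaces
import torch

@spaces.GPU
def describe_device():
    # On a ZeroGPU Space a GPU is attached only while this call runs;
    # at import time torch must not touch CUDA at all.
    return torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu only"

print(describe_device())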
@@ -53,43 +35,32 @@ def load_vision_model():
         print(f"Error loading vision model: {e}")
         return None, None
 
-# Load sarvam-2b for text generation within a GPU-decorated function
 @spaces.GPU
 def load_sarvam():
-    return load_pipeline('sarvamai/sarvam-2b-v0.5')
-
-# Load all models
-whisper_processor, whisper_model = load_whisper()
-vision_model, vision_processor = load_vision_model()
-sarvam_pipe = load_sarvam()
+    try:
+        return pipeline(model='sarvamai/sarvam-2b-v0.5')
+    except Exception as e:
+        print(f"Error loading Sarvam model: {e}")
+        return None
 
 @spaces.GPU
-def process_audio_input(audio):
-    if whisper_processor is None or whisper_model is None:
-        return "Error: Speech recognition model is not available. Please type your message instead."
-
+def process_audio(audio_path, whisper_processor, whisper_model):
+    import librosa
     try:
-        audio, sr = librosa.load(audio, sr=16000)
-        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(whisper_model.device)
+        audio, sr = librosa.load(audio_path, sr=16000)
+        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         return transcription
     except Exception as e:
-        return f"Error processing audio: {str(e)}. Please type your message instead."
+        return f"Error processing audio: {str(e)}"
 
 @spaces.GPU
-def process_image_input(image, text_prompt):
-    if vision_model is None or vision_processor is None:
-        return "Error: Vision model is not available."
-
+def process_image(image, text_prompt, vision_model, vision_processor):
     try:
-        messages = [
-            {"role": "user", "content": f"{text_prompt}\n<|image_1|>"},
-        ]
-        prompt = vision_processor.tokenizer.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-        inputs = vision_processor(prompt, image, return_tensors="pt").to(vision_model.device)
+        messages = [{"role": "user", "content": f"{text_prompt}\n<|image_1|>"}]
+        prompt = vision_processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = vision_processor(prompt, image, return_tensors="pt")
         generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, temperature=0.2, do_sample=True)
         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
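Note: the refactored process_audio takes the Whisper processor and model as arguments instead of reading module-level globals. A hedged local check of the same call sequence, assuming a short audio file at "sample.wav" (a hypothetical path):

import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

# librosa resamples to 16 kHz, the rate Whisper was trained on
audio, sr = librosa.load("sample.wav", sr=16000)
features = processor(audio, sampling_rate=sr, return_tensors="pt").input_features
ids = model.generate(features)
print(processor.batch_decode(ids, skip_special_tokens=True)[0])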
@@ -97,10 +68,8 @@ def process_image_input(image, text_prompt):
     except Exception as e:
         return f"Error processing image: {str(e)}"
 
-def generate_response(transcription):
-    if sarvam_pipe is None:
-        return "Error: Text generation model is not available."
-
+@spaces.GPU
+def generate_response(transcription, sarvam_pipe):
     try:
         response = sarvam_pipe(transcription, max_length=100, num_return_sequences=1)[0]['generated_text']
         return response
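Note: generate_response now receives sarvam_pipe explicitly and is GPU-decorated. A sketch of the underlying pipeline call, assuming pipeline() can infer the text-generation task from the sarvam-2b checkpoint (an untested assumption; the prompt string is illustrative):

from transformers import pipeline

# Assumption: the task is inferred from the checkpoint's config.
sarvam_pipe = pipeline(model='sarvamai/sarvam-2b-v0.5')
out = sarvam_pipe("भारत एक विशाल देश है", max_length=100, num_return_sequences=1)
print(out[0]['generated_text'])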
@@ -119,17 +88,21 @@ def text_to_speech(text, lang='hi'):
 @spaces.GPU
 def indic_vision_assistant(input_type, audio_input, text_input, image_input):
     try:
+        whisper_processor, whisper_model = load_whisper()
+        vision_model, vision_processor = load_vision_model()
+        sarvam_pipe = load_sarvam()
+
         if input_type == "audio" and audio_input is not None:
-            transcription = process_audio_input(audio_input)
+            transcription = process_audio(audio_input, whisper_processor, whisper_model)
         elif input_type == "text" and text_input:
             transcription = text_input
         elif input_type == "image" and image_input is not None:
             text_prompt = text_input if text_input else "Describe this image in detail."
-            transcription = process_image_input(image_input, text_prompt)
+            transcription = process_image(image_input, text_prompt, vision_model, vision_processor)
         else:
             return "Please provide either audio, text, or image input.", "No input provided.", None
 
-        response = generate_response(transcription)
+        response = generate_response(transcription, sarvam_pipe)
         lang = detect(response)
         audio_response = text_to_speech(response, lang)