sagar007 committed
Commit 1626444 · verified · 1 Parent(s): 106d95c

Update app.py

Files changed (1)
  1. app.py +8 -16
app.py CHANGED
@@ -6,46 +6,42 @@ import gradio as gr
 from PIL import Image
 import logging
 import os
-import spaces
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-# Initialize ZeroGPU
-spaces.init()
+# Check for GPU availability
+device = "cuda" if torch.cuda.is_available() else "cpu"
+logger.info(f"Using device: {device}")
 
 # Function to safely load pipeline
-@spaces.GPU
 def load_pipeline(model_name, **kwargs):
     try:
-        return pipeline(model=model_name, device=0, **kwargs)
+        return pipeline(model=model_name, device=device, **kwargs)
     except Exception as e:
         logger.error(f"Error loading {model_name} pipeline: {e}")
         return None
 
 # Load Whisper model for speech recognition
-@spaces.GPU
 def load_whisper():
     try:
         processor = WhisperProcessor.from_pretrained("openai/whisper-small")
-        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").cuda()
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small").to(device)
         return processor, model
     except Exception as e:
         logger.error(f"Error loading Whisper model: {e}")
         return None, None
 
 # Load sarvam-2b for text generation
-@spaces.GPU
 def load_sarvam():
     return load_pipeline('sarvamai/sarvam-2b-v0.5')
 
 # Load vision model
-@spaces.GPU
 def load_vision_model():
     try:
         model_id = "microsoft/Phi-3.5-vision-instruct"
-        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto", attn_implementation="flash_attention_2").cuda().eval()
+        model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype="auto", attn_implementation="flash_attention_2").to(device).eval()
         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
         return model, processor
     except Exception as e:
@@ -53,14 +49,13 @@ def load_vision_model():
         return None, None
 
 # Process audio input
-@spaces.GPU
 def process_audio_input(audio, whisper_processor, whisper_model):
     if whisper_processor is None or whisper_model is None:
         return "Error: Speech recognition model is not available. Please type your message instead."
 
     try:
         audio, sr = librosa.load(audio, sr=16000)
-        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.cuda()
+        input_features = whisper_processor(audio, sampling_rate=sr, return_tensors="pt").input_features.to(device)
         predicted_ids = whisper_model.generate(input_features)
         transcription = whisper_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         return transcription
@@ -89,7 +84,6 @@ def detect_language(text):
     # Implement language detection logic here
     return 'en'  # Default to English for now
 
-@spaces.GPU
 def generate_response(transcription, sarvam_pipe):
     if sarvam_pipe is None:
         return "Error: Text generation model is not available."
@@ -102,7 +96,6 @@ def generate_response(transcription, sarvam_pipe):
         logger.error(f"Error generating response: {e}")
         return f"Error generating response. Please try again."
 
-@spaces.GPU
 def process_image(image, text_input, vision_model, vision_processor):
     if vision_model is None or vision_processor is None:
         return "Error: Vision model is not available."
@@ -110,7 +103,7 @@ def process_image(image, text_input, vision_model, vision_processor):
     try:
         prompt = f"<|user|>\n<|image_1|>\n{text_input}<|end|>\n<|assistant|>\n"
         image = Image.fromarray(image).convert("RGB")
-        inputs = vision_processor(prompt, image, return_tensors="pt").to("cuda")
+        inputs = vision_processor(prompt, image, return_tensors="pt").to(device)
         generate_ids = vision_model.generate(**inputs, max_new_tokens=1000, eos_token_id=vision_processor.tokenizer.eos_token_id)
         generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
         response = vision_processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -119,7 +112,6 @@ def process_image(image, text_input, vision_model, vision_processor):
         logger.error(f"Error processing image: {e}")
         return f"Error processing image. Please try again."
 
-@spaces.GPU
 def multimodal_assistant(input_type, audio_input, text_input, image_input):
     try:
         # Load models
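The commit replaces the Hugging Face ZeroGPU hooks (`spaces.init()` and the `@spaces.GPU` decorators) with explicit device placement via torch, so the app can fall back to CPU when no GPU is attached. For reference, a minimal standalone sketch of the pattern as adopted here (assumes `torch` and `transformers` are installed; note that `torch.cuda.is_available()` needs `import torch`, which the diff does not add, so app.py presumably imports it above the first hunk):

import logging

import torch
from transformers import pipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Pick CUDA when available, otherwise fall back to CPU (same logic as app.py).
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")

def load_pipeline(model_name, **kwargs):
    """Load a transformers pipeline on the selected device; return None on failure."""
    try:
        # Recent transformers releases accept a device string ("cuda"/"cpu") here;
        # older releases expect an int index (0 for the first GPU, -1 for CPU).
        return pipeline(model=model_name, device=device, **kwargs)
    except Exception as e:
        logger.error(f"Error loading {model_name} pipeline: {e}")
        return None

if __name__ == "__main__":
    # Illustrative usage; sarvam-2b-v0.5 resolves to a text-generation pipeline.
    pipe = load_pipeline("sarvamai/sarvam-2b-v0.5")
    if pipe is not None:
        print(pipe("Hello", max_new_tokens=20)[0]["generated_text"])

One caveat on the CPU path: `attn_implementation="flash_attention_2"` in `load_vision_model` requires a CUDA GPU and the flash-attn package, so on a CPU-only machine that load will still fail (caught by the except clause). Selecting the implementation per device, e.g. `"flash_attention_2" if device == "cuda" else "sdpa"`, would make the fallback complete.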