sagar007 committed
Commit 8ec9ef4 · verified · 1 Parent(s): 2144e66

Update app.py

Files changed (1)
  1. app.py +53 -65
app.py CHANGED
@@ -3,71 +3,69 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModel
 from PIL import Image
 import logging
+import spaces
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 
 class LLaVAPhiModel:
     def __init__(self, model_id="sagar007/Lava_phi"):
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        logging.info(f"Using device: {self.device}")
-
-        try:
-            # Load model with appropriate settings based on available hardware
-            logging.info(f"Loading model from {model_id}...")
-
-            # Determine model loading configuration
-            model_kwargs = {
-                "device_map": "auto",
-                "trust_remote_code": True
-            }
-
-            # Add quantization only if CUDA is available
-            if torch.cuda.is_available():
-                from transformers import BitsAndBytesConfig
-                quantization_config = BitsAndBytesConfig(
-                    load_in_4bit=True,
-                    bnb_4bit_compute_dtype=torch.float16,
-                    bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type="nf4"
-                )
-                model_kwargs["quantization_config"] = quantization_config
-                model_kwargs["torch_dtype"] = torch.bfloat16
-            else:
-                # For CPU, use lighter configuration
-                model_kwargs["torch_dtype"] = torch.float32
+        self.device = "cuda"  # Always use cuda with ZeroGPU
+        self.model_id = model_id
+        logging.info("Initializing LLaVA-Phi model...")
+
+        # Initialize tokenizer (can be done outside GPU context)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Initialize processor (can be done outside GPU context)
+        self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        # Store conversation history
+        self.history = []
+
+        # Lazy loading of models - will be initialized in GPU context
+        self.model = None
+        self.clip = None
+
+    @spaces.GPU
+    def ensure_models_loaded(self):
+        """Ensure models are loaded in GPU context"""
+        if self.model is None:
+            # Load main model
+            from transformers import BitsAndBytesConfig
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4"
+            )
 
             self.model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-                **model_kwargs
+                self.model_id,
+                quantization_config=quantization_config,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True
             )
-
-            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-            # Set up padding token
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-                self.model.config.pad_token_id = self.tokenizer.eos_token_id
-
-            # Load CLIP model and processor
-            logging.info("Loading CLIP model and processor...")
-            self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+            self.model.config.pad_token_id = self.tokenizer.eos_token_id
+
+        if self.clip is None:
+            # Load CLIP model
             self.clip = AutoModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
-
-            # Store conversation history
-            self.history = []
-
-        except Exception as e:
-            logging.error(f"Error initializing model: {str(e)}")
-            raise
 
+    @spaces.GPU
     def process_image(self, image):
         """Process image through CLIP"""
         try:
-            # Ensure image is in correct format
-            if isinstance(image, str):  # If image path is provided
+            # Ensure models are loaded
+            self.ensure_models_loaded()
+
+            # Convert image to correct format
+            if isinstance(image, str):
                 image = Image.open(image)
-            elif isinstance(image, numpy.ndarray):  # If numpy array (from gradio)
+            elif isinstance(image, numpy.ndarray):
                 image = Image.fromarray(image)
 
             with torch.no_grad():
@@ -79,12 +77,15 @@
         except Exception as e:
            logging.error(f"Error processing image: {str(e)}")
            raise
-
+
+    @spaces.GPU(duration=120)  # Set longer duration for generation
     def generate_response(self, message, image=None):
         try:
+            # Ensure models are loaded
+            self.ensure_models_loaded()
+
             if image is not None:
                 try:
-                    # Get image features
                     image_features = self.process_image(image)
                     has_image = True
                 except Exception as e:
@@ -93,17 +94,12 @@
                     has_image = False
                     message = f"Note: Failed to process image. Continuing with text only. Error: {str(e)}\n{message}"
 
-            # Format prompt
             prompt = f"human: {'<image>' if has_image else ''}\n{message}\ngpt:"
-
-            # Add context from history
             context = ""
             for turn in self.history[-3:]:
                 context += f"human: {turn[0]}\ngpt: {turn[1]}\n"
 
             full_prompt = context + prompt
-
-            # Prepare text inputs
             inputs = self.tokenizer(
                 full_prompt,
                 return_tensors="pt",
@@ -113,11 +109,9 @@
             )
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
-            # Add image features to inputs if available
             if has_image:
                 inputs["image_features"] = image_features
 
-            # Generate response
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
@@ -134,7 +128,6 @@
                     eos_token_id=self.tokenizer.eos_token_id
                 )
             else:
-                # Text-only response
                 prompt = f"human: {message}\ngpt:"
                 context = ""
                 for turn in self.history[-3:]:
@@ -166,10 +159,8 @@
                         eos_token_id=self.tokenizer.eos_token_id
                     )
 
-            # Decode response
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-            # Clean up response
             if "gpt:" in response:
                 response = response.split("gpt:")[-1].strip()
             if "human:" in response:
@@ -177,9 +168,7 @@
             if "<image>" in response:
                 response = response.replace("<image>", "").strip()
 
-            # Update history
             self.history.append((message, response))
-
             return response
 
         except Exception as e:
@@ -193,13 +182,12 @@
 
 def create_demo():
     try:
-        # Initialize model
         model = LLaVAPhiModel()
 
        with gr.Blocks(css="footer {visibility: hidden}") as demo:
            gr.Markdown(
                """
-                # LLaVA-Phi Demo
+                # LLaVA-Phi Demo (ZeroGPU)
                Chat with a vision-language model that can understand both text and images.
                """
            )
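
Aside: the pattern this commit adopts, keeping CPU-safe setup (tokenizer, processor, history) in __init__ and deferring all CUDA work to methods decorated with @spaces.GPU so that ZeroGPU attaches a GPU only while those calls run, can be sketched on its own. The snippet below is a minimal, hypothetical example rather than code from this repo: the class name, the gpt2 model id, and the generation settings are placeholders, and it assumes the spaces package that Hugging Face provides on ZeroGPU Spaces.

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

class LazyZeroGPUModel:
    """Sketch of ZeroGPU lazy loading: cheap setup eagerly, GPU work deferred."""

    def __init__(self, model_id="gpt2"):  # hypothetical small model, for illustration only
        self.model_id = model_id
        # Tokenizer loading is CPU-only, so it is safe outside the GPU context.
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = None  # created lazily on the first GPU-decorated call

    @spaces.GPU(duration=120)  # a GPU is attached only while this method runs
    def generate(self, prompt: str) -> str:
        if self.model is None:
            # First call inside the GPU context: load the weights onto the GPU.
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_id,
                torch_dtype=torch.float16,
                device_map="auto",
            )
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=64)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

As with ensure_models_loaded in the diff above, the heavy from_pretrained call sits inside the decorated method because ZeroGPU only grants a GPU for the duration of such a call; anything that needs CUDA has to run there.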