Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,36 +1,34 @@
 import gradio as gr
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer,
+from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPProcessor, CLIPModel
 from PIL import Image
 import logging
 import spaces
+import numpy
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 
 class LLaVAPhiModel:
     def __init__(self, model_id="sagar007/Lava_phi"):
-        self.device = "cuda"
+        self.device = "cuda"
         self.model_id = model_id
         logging.info("Initializing LLaVA-Phi model...")
 
-        # Initialize tokenizer
+        # Initialize tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(model_id)
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         try:
-            #
-            self.processor =
+            # Use CLIPProcessor directly instead of AutoProcessor
+            self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+            logging.info("Successfully loaded CLIP processor")
         except Exception as e:
-            logging.
-            # Fallback to basic tokenizer if needed
+            logging.error(f"Failed to load CLIP processor: {str(e)}")
             self.processor = None
 
-        # Store conversation history
         self.history = []
-
-        # Lazy loading of models - will be initialized in GPU context
         self.model = None
         self.clip = None
 
@@ -38,7 +36,7 @@ class LLaVAPhiModel:
     def ensure_models_loaded(self):
         """Ensure models are loaded in GPU context"""
         if self.model is None:
-            # Load main model
+            # Load main model with updated quantization config
             from transformers import BitsAndBytesConfig
             quantization_config = BitsAndBytesConfig(
                 load_in_4bit=True,
@@ -47,34 +45,37 @@
                 bnb_4bit_quant_type="nf4"
             )
 
-
-            self.
-
-
-
-
-
-
+            try:
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    self.model_id,
+                    quantization_config=quantization_config,
+                    device_map="auto",
+                    torch_dtype=torch.bfloat16,
+                    trust_remote_code=True
+                )
+                self.model.config.pad_token_id = self.tokenizer.eos_token_id
+                logging.info("Successfully loaded main model")
+            except Exception as e:
+                logging.error(f"Failed to load main model: {str(e)}")
+                raise
 
         if self.clip is None:
-
-
-
-
-
-
-
+            try:
+                # Use CLIPModel directly instead of AutoModel
+                self.clip = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
+                logging.info("Successfully loaded CLIP model")
+            except Exception as e:
+                logging.error(f"Failed to load CLIP model: {str(e)}")
+                self.clip = None
 
     @spaces.GPU
     def process_image(self, image):
-        """Process image through CLIP if available
+        """Process image through CLIP if available"""
         try:
-            # Ensure models are loaded
             self.ensure_models_loaded()
 
-            # If CLIP isn't available, return None
             if self.clip is None or self.processor is None:
-                logging.warning("CLIP model or processor not available
+                logging.warning("CLIP model or processor not available")
                 return None
 
             # Convert image to correct format
@@ -83,12 +84,18 @@ class LLaVAPhiModel:
             elif isinstance(image, numpy.ndarray):
                 image = Image.fromarray(image)
 
+            # Ensure image is in RGB mode
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
+
             with torch.no_grad():
                 try:
+                    # Process image with error handling
                     image_inputs = self.processor(images=image, return_tensors="pt")
                     image_features = self.clip.get_image_features(
                         pixel_values=image_inputs.pixel_values.to(self.device)
                    )
+                    logging.info("Successfully processed image through CLIP")
                     return image_features
                 except Exception as e:
                     logging.error(f"Error during image processing: {str(e)}")
@@ -97,10 +104,9 @@ class LLaVAPhiModel:
             logging.error(f"Error in process_image: {str(e)}")
             return None
 
-    @spaces.GPU(duration=120)
+    @spaces.GPU(duration=120)
     def generate_response(self, message, image=None):
         try:
-            # Ensure models are loaded
             self.ensure_models_loaded()
 
             if image is not None:
@@ -176,6 +182,7 @@ class LLaVAPhiModel:
 
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
+            # Clean up response
            if "gpt:" in response:
                 response = response.split("gpt:")[-1].strip()
             if "human:" in response:
@@ -190,7 +197,7 @@ class LLaVAPhiModel:
             logging.error(f"Error generating response: {str(e)}")
             logging.error(f"Full traceback:", exc_info=True)
             return f"Error: {str(e)}"
-
+
     def clear_history(self):
         self.history = []
         return None
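
The hunks above only cover the LLaVAPhiModel class; the Gradio UI wiring elsewhere in app.py is outside the changed region and is not shown in this commit. For orientation, here is a minimal sketch of how a class like this is typically hooked into a ZeroGPU Space. It is an assumption, not code from this diff: the respond function, component labels, and title are illustrative, and it presumes the LLaVAPhiModel class above is defined in the same file.

# Illustrative only -- not part of this commit; assumes LLaVAPhiModel from app.py above.
import gradio as gr

model = LLaVAPhiModel()  # tokenizer/processor load here; heavy models load lazily in GPU context

def respond(message, image):
    # generate_response is decorated with @spaces.GPU, so ZeroGPU
    # allocates a GPU for the duration of each call.
    return model.generate_response(message, image=image)

demo = gr.Interface(
    fn=respond,
    inputs=[gr.Textbox(label="Message"), gr.Image(type="pil", label="Image (optional)")],
    outputs=gr.Textbox(label="Response"),
    title="LLaVA-Phi",
)

if __name__ == "__main__":
    demo.launch()

On ZeroGPU, the @spaces.GPU decorators seen in the diff are what request a GPU for process_image and generate_response; duration=120 extends the allocation window beyond the default for the longer generation call, which is why model loading is deferred to ensure_models_loaded inside those GPU-decorated methods.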