Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -3,71 +3,69 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModel
 from PIL import Image
 import logging
+import spaces
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 
 class LLaVAPhiModel:
     def __init__(self, model_id="sagar007/Lava_phi"):
-        self.device =
-
+        self.device = "cuda"  # Always use cuda with ZeroGPU
+        self.model_id = model_id
+        logging.info("Initializing LLaVA-Phi model...")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Initialize tokenizer (can be done outside GPU context)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
+        if self.tokenizer.pad_token is None:
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        # Initialize processor (can be done outside GPU context)
+        self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+        # Store conversation history
+        self.history = []
+
+        # Lazy loading of models - will be initialized in GPU context
+        self.model = None
+        self.clip = None
+
+    @spaces.GPU
+    def ensure_models_loaded(self):
+        """Ensure models are loaded in GPU context"""
+        if self.model is None:
+            # Load main model
+            from transformers import BitsAndBytesConfig
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4"
+            )
 
             self.model = AutoModelForCausalLM.from_pretrained(
-                model_id,
-
+                self.model_id,
+                quantization_config=quantization_config,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True
             )
-
-
-
-            #
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
-            self.model.config.pad_token_id = self.tokenizer.eos_token_id
-
-            # Load CLIP model and processor
-            logging.info("Loading CLIP model and processor...")
-            self.processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
+            self.model.config.pad_token_id = self.tokenizer.eos_token_id
+
+        if self.clip is None:
+            # Load CLIP model
             self.clip = AutoModel.from_pretrained("openai/clip-vit-base-patch32").to(self.device)
-
-            # Store conversation history
-            self.history = []
-
-        except Exception as e:
-            logging.error(f"Error initializing model: {str(e)}")
-            raise
 
+    @spaces.GPU
     def process_image(self, image):
         """Process image through CLIP"""
         try:
-            # Ensure
-
+            # Ensure models are loaded
+            self.ensure_models_loaded()
+
+            # Convert image to correct format
+            if isinstance(image, str):
                 image = Image.open(image)
-            elif isinstance(image, numpy.ndarray):
+            elif isinstance(image, numpy.ndarray):
                 image = Image.fromarray(image)
 
             with torch.no_grad():
@@ -79,12 +77,15 @@ class LLaVAPhiModel:
         except Exception as e:
             logging.error(f"Error processing image: {str(e)}")
             raise
-
+
+    @spaces.GPU(duration=120)  # Set longer duration for generation
     def generate_response(self, message, image=None):
         try:
+            # Ensure models are loaded
+            self.ensure_models_loaded()
+
             if image is not None:
                 try:
-                    # Get image features
                     image_features = self.process_image(image)
                     has_image = True
                 except Exception as e:
@@ -93,17 +94,12 @@ class LLaVAPhiModel:
                     has_image = False
                     message = f"Note: Failed to process image. Continuing with text only. Error: {str(e)}\n{message}"
 
-            # Format prompt
             prompt = f"human: {'<image>' if has_image else ''}\n{message}\ngpt:"
-
-            # Add context from history
             context = ""
             for turn in self.history[-3:]:
                 context += f"human: {turn[0]}\ngpt: {turn[1]}\n"
 
             full_prompt = context + prompt
-
-            # Prepare text inputs
             inputs = self.tokenizer(
                 full_prompt,
                 return_tensors="pt",
@@ -113,11 +109,9 @@ class LLaVAPhiModel:
             )
             inputs = {k: v.to(self.device) for k, v in inputs.items()}
 
-            # Add image features to inputs if available
             if has_image:
                 inputs["image_features"] = image_features
 
-            # Generate response
             with torch.no_grad():
                 outputs = self.model.generate(
                     **inputs,
@@ -134,7 +128,6 @@ class LLaVAPhiModel:
                     eos_token_id=self.tokenizer.eos_token_id
                 )
             else:
-                # Text-only response
                 prompt = f"human: {message}\ngpt:"
                 context = ""
                 for turn in self.history[-3:]:
@@ -166,10 +159,8 @@ class LLaVAPhiModel:
                     eos_token_id=self.tokenizer.eos_token_id
                 )
 
-            # Decode response
             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-            # Clean up response
             if "gpt:" in response:
                 response = response.split("gpt:")[-1].strip()
             if "human:" in response:
@@ -177,9 +168,7 @@ class LLaVAPhiModel:
             if "<image>" in response:
                 response = response.replace("<image>", "").strip()
 
-            # Update history
             self.history.append((message, response))
-
             return response
 
         except Exception as e:
@@ -193,13 +182,12 @@ class LLaVAPhiModel:
 
 def create_demo():
     try:
-        # Initialize model
         model = LLaVAPhiModel()
 
         with gr.Blocks(css="footer {visibility: hidden}") as demo:
             gr.Markdown(
                 """
-                # LLaVA-Phi Demo
+                # LLaVA-Phi Demo (ZeroGPU)
                 Chat with a vision-language model that can understand both text and images.
                 """
             )
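
The pattern behind this change: on ZeroGPU hardware a GPU is attached only while a function decorated with @spaces.GPU is executing, so anything that needs CUDA (loading weights, moving tensors, generation) has to happen inside such a call, while tokenizers and processors can safely load at startup. Below is a minimal sketch of that pattern, not the Space's actual code; it assumes the spaces package available on ZeroGPU Spaces and uses distilgpt2 as a stand-in model rather than the sagar007/Lava_phi checkpoint.

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "distilgpt2"  # stand-in model for illustration, not the Space's checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)  # CPU-safe: load outside any GPU context
model = None  # heavy weights are loaded lazily, inside a GPU context

@spaces.GPU(duration=120)  # a GPU is attached only while this call runs
def generate(prompt: str) -> str:
    global model
    if model is None:
        # First decorated call: load weights while the GPU is available
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID, torch_dtype=torch.float16
        ).to("cuda")
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(out[0], skip_special_tokens=True)

demo = gr.Interface(fn=generate, inputs="text", outputs="text")
demo.launch()

Lazy loading inside the decorated call keeps Space startup fast and avoids requesting a GPU before the first user interaction; the duration argument extends the default allocation window for longer generation runs, which is what the commit's @spaces.GPU(duration=120) on generate_response does.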