spoorthibhat committed (verified)
Commit 462afd3 · 1 Parent(s): 3fddf2c

Update app.py

Files changed (1)
  1. app.py +92 -72
app.py CHANGED
@@ -1,112 +1,132 @@
- import os
-
- os.system('pip install -q -e .')
- os.system('pip uninstall bitsandbytes')
- os.system('pip uninstall bitsandbytes-windows')
- os.system('pip install bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl')
- import torch
- print(torch.cuda.is_available())
-
- print(os.system('python -m bitsandbytes'))
-
  import os
  import torch
  import warnings
- from accelerate import Accelerator
+ import gradio as gr
  import io
  from contextlib import redirect_stdout
- import gradio as gr
+ from accelerate import Accelerator
  from transformers import AutoTokenizer
+
+ # Set memory-related environment variables
+ os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+
+ # Suppress warnings and optimize CUDA
+ warnings.filterwarnings('ignore')
+ torch.backends.cudnn.benchmark = True
+ torch.backends.cuda.matmul.allow_tf32 = True
+
+ # Suppress specific pip install warnings
+ os.system('pip install -q -e .')
+ os.system('pip uninstall -y bitsandbytes')
+ os.system('pip install bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl')
+
+ # Import LLaVA specific modules
  from llava.model.builder import load_pretrained_model
  from llava.mm_utils import get_model_name_from_path
  from llava.eval.run_llava import eval_model

- warnings.filterwarnings('ignore')
-
- # Initialize Accelerator
- accelerator = Accelerator(mixed_precision="fp16") # Use "fp16" for half-precision or "bf16" for bfloat16
+ # Initialize Accelerator with lower precision
+ accelerator = Accelerator(mixed_precision="fp16")

- # Check GPU availability and define the device
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # Device setup with more robust checking
+ def get_optimal_device():
+     if torch.cuda.is_available():
+         # Find GPU with most free memory
+         total_memory = [torch.cuda.get_device_properties(i).total_memory for i in range(torch.cuda.device_count())]
+         free_memory = [torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())]
+         best_gpu = free_memory.index(max(free_memory))
+         return torch.device(f'cuda:{best_gpu}')
+     return torch.device('cpu')
+
+ device = get_optimal_device()
  print(f"Using device: {device}")

- # Clear GPU cache before loading the model
- print("Clearing GPU cache before model loading...")
- torch.cuda.empty_cache()
+ # Model loading with memory optimizations
+ def load_model_safely(model_path):
+     try:
+         # Clear GPU cache
+         torch.cuda.empty_cache()
+         if torch.cuda.is_available():
+             torch.cuda.synchronize()
+
+         # Load model with device mapping
+         tokenizer, model, image_processor, context_len = load_pretrained_model(
+             model_path=model_path,
+             model_base=None,
+             model_name=get_model_name_from_path(model_path),
+             device_map="auto"  # Automatic device distribution
+         )
+
+         # Enable memory-efficient techniques
+         model.gradient_checkpointing_enable()
+
+         # Move to device and prepare with accelerator
+         model.to(device)
+
+         # Optional: Compile with memory-aware mode
+         try:
+             model = torch.compile(model, mode="reduce-overhead")
+         except Exception as compile_error:
+             print(f"Model compilation failed: {compile_error}. Proceeding without compilation.")
+
+         model = accelerator.prepare(model)
+
+         return tokenizer, model, image_processor, context_len
+
+     except Exception as e:
+         print(f"Error loading model: {e}")
+         return None, None, None, None

  # Define the model path
  model_path = "Veda0718/llava-med-v1.5-mistral-7b-finetuned"

- # Load the model
- try:
-     tokenizer, model, image_processor, context_len = load_pretrained_model(
-         model_path=model_path,
-         model_base=None,
-         model_name=get_model_name_from_path(model_path)
-     )
-     # Enable Gradient Checkpointing
-     model.gradient_checkpointing_enable()
-
-     # Move the model to the correct device
-     model.to(device)
-
-     # Compile Model with PyTorch 2.0+
-     print("Compiling the model with torch.compile()...")
-     model = torch.compile(model, mode="max-autotune")  # Optimized for both speed and memory
-
-     # Prepare Model with Accelerator
-     model = accelerator.prepare(model)
-
-     print("Model successfully loaded and compiled!")
- except Exception as e:
-     print(f"Error loading model: {e}")
-     tokenizer, model, image_processor, context_len = None, None, None, None
-
- # Define the inference function
+ # Load the model with safety checks
+ tokenizer, model, image_processor, context_len = load_model_safely(model_path)
+
+ # Inference function with error handling
  def run_inference(image, question):
      if model is None:
          return "Model failed to load. Please check the logs."
-
-     args = type('Args', (), {
-         "model_path": model_path,
-         "model_base": None,
-         "image_file": image,
-         "query": question,
-         "conv_mode": None,
-         "sep": ",",
-         "temperature": 0,
-         "top_p": None,
-         "num_beams": 1,
-         "max_new_tokens": 512
-     })()
-
-     # Capture the printed output of eval_model
-     f = io.StringIO()
-     with redirect_stdout(f):
-         eval_model(args)
-     output = f.getvalue()
-     return output
+
+     try:
+         args = type('Args', (), {
+             "model_path": model_path,
+             "model_base": None,
+             "image_file": image,
+             "query": question,
+             "conv_mode": None,
+             "sep": ",",
+             "temperature": 0,
+             "top_p": None,
+             "num_beams": 1,
+             "max_new_tokens": 512
+         })()
+
+         # Capture the printed output of eval_model
+         f = io.StringIO()
+         with redirect_stdout(f):
+             eval_model(args)
+         output = f.getvalue()
+         return output
+
+     except Exception as e:
+         return f"Inference error: {str(e)}"

  # Create the Gradio interface
  with gr.Blocks(theme=gr.themes.Monochrome()) as app:
      with gr.Column(scale=1):
          gr.Markdown("<center><h1>LLaVA-Med</h1></center>")
-
          with gr.Row():
              image = gr.Image(type="filepath", scale=2)
              question = gr.Textbox(placeholder="Enter a question", scale=3)
-
          with gr.Row():
              answer = gr.Textbox(placeholder="Answer pops up here", scale=1)
-
          with gr.Row():
              btn = gr.Button("Run Inference", scale=1)
-
-         btn.click(fn=run_inference, inputs=[image, question], outputs=answer)
+             btn.click(fn=run_inference, inputs=[image, question], outputs=answer)

  # Launch the app
  if __name__ == "__main__":
      print("Clearing GPU cache before app launch...")
      torch.cuda.empty_cache()
-
      app.queue().launch(debug=True)
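
Note on the args construction: both the old and the new run_inference build the options object for eval_model with the type('Args', (), {...})() idiom, which creates a throwaway class whose attributes are then read by eval_model. A minimal sketch of the same idea using the standard-library types.SimpleNamespace is shown below; it is illustrative only, not part of this commit, and assumes eval_model only reads these attributes (which is what the existing idiom already relies on). The variables model_path, image, and question are the same ones already in scope inside run_inference.

    from types import SimpleNamespace

    # Drop-in equivalent for the type('Args', (), {...})() pattern:
    # SimpleNamespace gives plain attribute access (args.model_path, args.query, ...).
    args = SimpleNamespace(
        model_path=model_path,
        model_base=None,
        image_file=image,        # filepath from the gr.Image component
        query=question,          # text from the gr.Textbox component
        conv_mode=None,
        sep=",",
        temperature=0,
        top_p=None,
        num_beams=1,
        max_new_tokens=512,
    )
    # eval_model(args) works the same with either construction.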