Spaces:

spoorthibhat
/

Llava-Med

Paused

App Files Files Community

spoorthibhat committed on Dec 7, 2024

Commit

f8ba981

verified ·

1 Parent(s): 462afd3

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -96

app.py CHANGED Viewed

@@ -1,132 +1,90 @@
 import os
 import torch
-import warnings
-import gradio as gr
-import io
-from contextlib import redirect_stdout
-from accelerate import Accelerator
-from transformers import AutoTokenizer
-# Set memory-related environment variables
-os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
-# Suppress warnings and optimize CUDA
 warnings.filterwarnings('ignore')
-torch.backends.cudnn.benchmark = True
-torch.backends.cuda.matmul.allow_tf32 = True
-# Suppress specific pip install warnings
-os.system('pip install -q -e .')
-os.system('pip uninstall -y bitsandbytes')
-os.system('pip install bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl')
-# Import LLaVA specific modules
 from llava.model.builder import load_pretrained_model
 from llava.mm_utils import get_model_name_from_path
 from llava.eval.run_llava import eval_model
-# Initialize Accelerator with lower precision
-accelerator = Accelerator(mixed_precision="fp16")
-# Device setup with more robust checking
-def get_optimal_device():
-    if torch.cuda.is_available():
-        # Find GPU with most free memory
-        total_memory = [torch.cuda.get_device_properties(i).total_memory for i in range(torch.cuda.device_count())]
-        free_memory = [torch.cuda.mem_get_info(i)[0] for i in range(torch.cuda.device_count())]
-        best_gpu = free_memory.index(max(free_memory))
-        return torch.device(f'cuda:{best_gpu}')
-    return torch.device('cpu')
-device = get_optimal_device()
 print(f"Using device: {device}")
-# Model loading with memory optimizations
-def load_model_safely(model_path):
-    try:
-        # Clear GPU cache
-        torch.cuda.empty_cache()
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-        # Load model with device mapping
-        tokenizer, model, image_processor, context_len = load_pretrained_model(
-            model_path=model_path,
-            model_base=None,
-            model_name=get_model_name_from_path(model_path),
-            device_map="auto"  # Automatic device distribution
-        )
-        # Enable memory-efficient techniques
-        model.gradient_checkpointing_enable()
-        # Move to device and prepare with accelerator
-        model.to(device)
-        # Optional: Compile with memory-aware mode
-        try:
-            model = torch.compile(model, mode="reduce-overhead")
-        except Exception as compile_error:
-            print(f"Model compilation failed: {compile_error}. Proceeding without compilation.")
-        model = accelerator.prepare(model)
-        return tokenizer, model, image_processor, context_len
-    except Exception as e:
-        print(f"Error loading model: {e}")
-        return None, None, None, None
 # Define the model path
 model_path = "Veda0718/llava-med-v1.5-mistral-7b-finetuned"
-# Load the model with safety checks
-tokenizer, model, image_processor, context_len = load_model_safely(model_path)
-# Inference function with error handling
 def run_inference(image, question):
     if model is None:
         return "Model failed to load. Please check the logs."
-    try:
-        args = type('Args', (), {
-            "model_path": model_path,
-            "model_base": None,
-            "image_file": image,
-            "query": question,
-            "conv_mode": None,
-            "sep": ",",
-            "temperature": 0,
-            "top_p": None,
-            "num_beams": 1,
-            "max_new_tokens": 512
-        })()
-        # Capture the printed output of eval_model
-        f = io.StringIO()
-        with redirect_stdout(f):
-            eval_model(args)
-        output = f.getvalue()
-        return output
-    except Exception as e:
-        return f"Inference error: {str(e)}"
 # Create the Gradio interface
 with gr.Blocks(theme=gr.themes.Monochrome()) as app:
     with gr.Column(scale=1):
         gr.Markdown("<center><h1>LLaVA-Med</h1></center>")
         with gr.Row():
             image = gr.Image(type="filepath", scale=2)
             question = gr.Textbox(placeholder="Enter a question", scale=3)
         with gr.Row():
             answer = gr.Textbox(placeholder="Answer pops up here", scale=1)
         with gr.Row():
             btn = gr.Button("Run Inference", scale=1)
-            btn.click(fn=run_inference, inputs=[image, question], outputs=answer)
 # Launch the app
 if __name__ == "__main__":
-    print("Clearing GPU cache before app launch...")
-    torch.cuda.empty_cache()
     app.queue().launch(debug=True)

 import os
+os.system('pip install -q -e .')
+os.system('pip uninstall -y bitsandbytes')  # -y answers pip's confirmation prompt; without it the uninstall cannot proceed non-interactively
+os.system('pip install bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl')
 import torch
+print(torch.cuda.is_available())
+print(os.system('python -m bitsandbytes'))
+import os
+import torch
+import warnings
 warnings.filterwarnings('ignore')
+import io
+from contextlib import redirect_stdout
+import gradio as gr
+from transformers import AutoTokenizer
 from llava.model.builder import load_pretrained_model
 from llava.mm_utils import get_model_name_from_path
 from llava.eval.run_llava import eval_model
+# Check CUDA availability with error handling
+device = "cuda" if torch.cuda.is_available() else "cpu"
 print(f"Using device: {device}")
 # Define the model path
 model_path = "Veda0718/llava-med-v1.5-mistral-7b-finetuned"
+# Load the model
+try:
+    tokenizer, model, image_processor, context_len = load_pretrained_model(
+        model_path=model_path,
+        model_base=None,
+        model_name=get_model_name_from_path(model_path)
+    )
+    # Move model to appropriate device
+    model = model.to(device)
+except Exception as e:
+    print(f"Error loading model: {e}")
+    tokenizer, model, image_processor, context_len = None, None, None, None
+# Define the inference function
 def run_inference(image, question):
     if model is None:
         return "Model failed to load. Please check the logs."
+    args = type('Args', (), {
+        "model_path": model_path,
+        "model_base": None,
+        "image_file": image,
+        "query": question,
+        "conv_mode": None,
+        "sep": ",",
+        "temperature": 0,
+        "top_p": None,
+        "num_beams": 1,
+        "max_new_tokens": 512
+    })()
+    # Capture the printed output of eval_model
+    f = io.StringIO()
+    with redirect_stdout(f):
+        eval_model(args)
+    output = f.getvalue()
+    return output
 # Create the Gradio interface
 with gr.Blocks(theme=gr.themes.Monochrome()) as app:
     with gr.Column(scale=1):
         gr.Markdown("<center><h1>LLaVA-Med</h1></center>")
         with gr.Row():
             image = gr.Image(type="filepath", scale=2)
             question = gr.Textbox(placeholder="Enter a question", scale=3)
         with gr.Row():
             answer = gr.Textbox(placeholder="Answer pops up here", scale=1)
         with gr.Row():
             btn = gr.Button("Run Inference", scale=1)
+        btn.click(fn=run_inference, inputs=[image, question], outputs=answer)
 # Launch the app
 if __name__ == "__main__":
     app.queue().launch(debug=True)