sounar committed
Commit b37e8c8 · verified · 1 Parent(s): e212182

Update app.py

Files changed (1)
  1. app.py +28 -7
app.py CHANGED
@@ -3,7 +3,15 @@ import torch
 from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
 import gradio as gr
 from PIL import Image
-from torchvision.transforms import ToTensor
+
+# First, let's check if flash-attn is installed
+try:
+    import flash_attn
+    FLASH_ATTN_AVAILABLE = True
+except ImportError:
+    FLASH_ATTN_AVAILABLE = False
+    print("Flash Attention is not installed. Using default attention mechanism.")
+    print("To install Flash Attention, run: pip install flash-attn --no-build-isolation")
 
 # Get API token from environment variable
 api_token = os.getenv("HF_TOKEN").strip()
@@ -16,15 +24,23 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.float16
 )
 
+# Initialize model with conditional Flash Attention
+model_args = {
+    "quantization_config": bnb_config,
+    "device_map": "auto",
+    "torch_dtype": torch.float16,
+    "trust_remote_code": True,
+    "token": api_token
+}
+
+# Only add flash attention if available
+if FLASH_ATTN_AVAILABLE:
+    model_args["attn_implementation"] = "flash_attention_2"
+
 # Initialize model and tokenizer
 model = AutoModel.from_pretrained(
     "ContactDoctor/Bio-Medical-MultiModal-Llama-3-8B-V1",
-    quantization_config=bnb_config,
-    device_map="auto",
-    torch_dtype=torch.float16,
-    trust_remote_code=True,
-    attn_implementation="flash_attention_2",
-    token=api_token
+    **model_args
 )
 
 tokenizer = AutoTokenizer.from_pretrained(
@@ -84,6 +100,11 @@ demo = gr.Interface(
 
 # Launch the Gradio app
 if __name__ == "__main__":
+    # Print installation instructions if Flash Attention is not available
+    if not FLASH_ATTN_AVAILABLE:
+        print("\nTo enable Flash Attention 2 for better performance, please install it using:")
+        print("pip install flash-attn --no-build-isolation")
+
     demo.launch(
         share=True,
         server_name="0.0.0.0",
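
The change boils down to one pattern: probe for an optional dependency, collect the from_pretrained keyword arguments in a dict, and add attn_implementation only when flash-attn is usable. A minimal standalone sketch of that pattern — illustrative names only, no GPU or model download; importlib.util.find_spec is a stdlib alternative to the commit's try/except probe:

import importlib.util

# Probe without importing: a broken flash-attn build cannot crash this check,
# though unlike the try/except in app.py it does not prove the import succeeds.
FLASH_ATTN_AVAILABLE = importlib.util.find_spec("flash_attn") is not None

# Mirror the conditional model_args construction from the commit.
model_args = {"trust_remote_code": True}
if FLASH_ATTN_AVAILABLE:
    model_args["attn_implementation"] = "flash_attention_2"

# **model_args expands the dict into keyword arguments, which is how
# AutoModel.from_pretrained(..., **model_args) receives them in app.py.
# fake_from_pretrained is a hypothetical stand-in for demonstration.
def fake_from_pretrained(model_id, **kwargs):
    print(model_id, kwargs)

fake_from_pretrained("ContactDoctor/Bio-Medical-MultiModal-Llama-3-8B-V1", **model_args)

Building the dict up front keeps a single from_pretrained call site instead of duplicating the whole call in an if/else branch, which is why the diff can replace six keyword lines with one **model_args line.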
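
One unchanged context line is worth flagging while touching startup behavior: os.getenv returns None when HF_TOKEN is unset, so api_token = os.getenv("HF_TOKEN").strip() raises AttributeError in that case. A defensive variant (a sketch, not part of this commit):

import os

# Fail fast with a clear message instead of AttributeError on None.
api_token = os.getenv("HF_TOKEN")
if api_token is None:
    raise RuntimeError("HF_TOKEN environment variable is not set")
api_token = api_token.strip()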