LAWSA07 committed
Commit 738974d · verified · 1 Parent(s): 73dc516

Update app.py

Files changed (1)
  1. app.py +18 -48
app.py CHANGED
@@ -1,72 +1,42 @@
 from fastapi import FastAPI, HTTPException
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel
 import torch
 
 app = FastAPI()
 
-# Load model once at startup
 @app.on_event("startup")
 async def load_model():
     try:
-        # Configuration
-        model_name = "unsloth/deepseek-r1-distill-llama-8b-unsloth-bnb-4bit"
-        adapter_name = "LAWSA07/medical_fine_tuned_deepseekR1"
-
-        # Load base model with 4-bit quantization
-        app.state.base_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
+        # 4-bit config
+        bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
-            torch_dtype=torch.float16,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+        )
+
+        # Load base model
+        app.state.base_model = AutoModelForCausalLM.from_pretrained(
+            "unsloth/deepseek-r1-distill-llama-8b-unsloth-bnb-4bit",
+            quantization_config=bnb_config,
             device_map="auto",
-            trust_remote_code=True,
+            trust_remote_code=True
         )
-
+
         # Attach PEFT adapter
         app.state.model = PeftModel.from_pretrained(
             app.state.base_model,
-            adapter_name,
-            adapter_weight_name="adapter_model.safetensors"
+            "LAWSA07/medical_fine_tuned_deepseekR1"
         )
 
         # Load tokenizer
-        app.state.tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-    except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail=f"Model loading failed: {str(e)}"
+        app.state.tokenizer = AutoTokenizer.from_pretrained(
+            "unsloth/deepseek-r1-distill-llama-8b-unsloth-bnb-4bit"
         )
 
-@app.get("/")
-def health_check():
-    return {"status": "OK"}
-
-@app.post("/generate")
-async def generate_text(prompt: str, max_length: int = 200):
-    try:
-        inputs = app.state.tokenizer(
-            prompt,
-            return_tensors="pt",
-            padding=True
-        ).to("cuda")
-
-        outputs = app.state.model.generate(
-            **inputs,
-            max_length=max_length,
-            temperature=0.7,
-            do_sample=True
-        )
-
-        decoded = app.state.tokenizer.decode(
-            outputs[0],
-            skip_special_tokens=True
-        )
-
-        return {"response": decoded}
-
     except Exception as e:
         raise HTTPException(
             status_code=500,
-            detail=f"Generation failed: {str(e)}"
+            detail=f"Model loading failed: {str(e)}"
         )
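
Note that after this commit app.py only loads the model: the "/" health check and the "/generate" route that existed before the change are removed along with the rest of the old file, so the app exposes no endpoints. A minimal sketch of how they could be reattached on top of the new startup loader, carried over from the code removed above; the one assumption is replacing the hardcoded .to("cuda") with app.state.model.device, so input tensors follow wherever device_map="auto" placed the weights:

@app.get("/")
def health_check():
    # Simple liveness probe.
    return {"status": "OK"}

@app.post("/generate")
async def generate_text(prompt: str, max_length: int = 200):
    try:
        # Tokenize the prompt and move the tensors to the model's device
        # (assumption: the removed code hardcoded .to("cuda") here).
        inputs = app.state.tokenizer(
            prompt,
            return_tensors="pt",
            padding=True
        ).to(app.state.model.device)

        # Sampled decoding; parameters carried over from the removed endpoint.
        outputs = app.state.model.generate(
            **inputs,
            max_length=max_length,
            temperature=0.7,
            do_sample=True
        )

        decoded = app.state.tokenizer.decode(
            outputs[0],
            skip_special_tokens=True
        )
        return {"response": decoded}

    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Generation failed: {str(e)}"
        )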
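Because generate_text declares prompt and max_length as plain scalar parameters, FastAPI reads them from the query string rather than a JSON body, so a quick smoke test against a local run (host and port are assumptions) would look like:

curl -X POST "http://localhost:8000/generate?prompt=hello&max_length=120"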