khushi1234455687 committed
Commit 6e7ba05 · verified · 1 Parent(s): 8e67518

Upload app.py

Files changed (1)
app.py +23 -18
app.py CHANGED
@@ -1,43 +1,48 @@
+import os
+
+# ✅ Set a writable cache directory inside `/app`
+os.environ["HF_HOME"] = "/app/huggingface"
+os.environ["TRANSFORMERS_CACHE"] = "/app/huggingface"
+os.environ["HF_HUB_CACHE"] = "/app/huggingface"
+
 from fastapi import FastAPI
 from pydantic import BaseModel
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel
-import os
 from huggingface_hub import login
 
 # ✅ Read token from Hugging Face Secrets
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-# ✅ Login only if token exists
+# ✅ Login only if token exists (Prevent writing to protected directories)
 if HF_TOKEN:
-    login(token=HF_TOKEN)
+    login(token=HF_TOKEN, cache_dir="/app/huggingface")
 
 # ✅ Initialize FastAPI
 app = FastAPI()
 
-# ✅ Define Base Model & LoRA Adapter Repository (Smaller Model for Hugging Face Spaces)
-base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # 🔹 Using a smaller model
+# ✅ Define Base Model & LoRA Adapter Repository (Use a Smaller Model)
+base_model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # 🔹 Switched to a smaller model
 lora_repo_id = "khushi1234455687/fine-tuned-medical-qa-V8"
 
-# ✅ Automatically Select CPU (Hugging Face Spaces Does NOT Support GPU)
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# ✅ Force CPU Usage (Hugging Face Spaces Does NOT Support GPUs)
+device = "cpu"
 
 # ✅ Load Tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+tokenizer = AutoTokenizer.from_pretrained(base_model_name, cache_dir="/app/huggingface")
 
-# ✅ Configure 4-bit Quantization
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True
-)
+# ✅ Configure 4-bit Quantization (Optimized for Spaces)
+quantization_config = BitsAndBytesConfig(load_in_4bit=True)
 
-# ✅ Load Base Model (Optimized for CPU)
+# ✅ Load Base Model
 try:
     base_model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
         quantization_config=quantization_config,
-        device_map="auto",  # ✅ Automatically assigns layers to CPU
-        torch_dtype=torch.float16
+        device_map="cpu",
+        torch_dtype=torch.float16,
+        cache_dir="/app/huggingface"
     )
 except Exception as e:
     print(f"❌ Error loading base model: {e}")
@@ -45,8 +50,8 @@ except Exception as e:
 
 # ✅ Load LoRA Adapter
 try:
-    model = PeftModel.from_pretrained(base_model, lora_repo_id)
-    model.to(device)  # ✅ Ensure model is on the correct device
+    model = PeftModel.from_pretrained(base_model, lora_repo_id, cache_dir="/app/huggingface")
+    model.to(device)
     model.eval()
 except Exception as e:
     print(f"❌ Error loading LoRA adapter: {e}")
@@ -62,7 +67,7 @@ class QueryRequest(BaseModel):
 async def generate_answer(request: QueryRequest):
     """Generate an answer for a given medical question."""
     try:
-        inputs = tokenizer(request.question, return_tensors="pt").to(device)  # ✅ Move to device
+        inputs = tokenizer(request.question, return_tensors="pt").to(device)
         with torch.no_grad():
             output = model.generate(**inputs, max_length=256)
         answer = tokenizer.decode(output[0], skip_special_tokens=True)
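Once the updated Space is running, the endpoint can be exercised with a minimal client sketch like the one below. The route path and port are assumptions for illustration, since the @app.post(...) decorator and the server launch line fall outside the hunks above; only the `question` field of QueryRequest is confirmed by the diff.

import requests

# Assumed endpoint URL: the route decorator is not visible in this diff, so the
# "/generate" path and port 7860 are placeholders for illustration only.
SPACE_URL = "http://localhost:7860/generate"

# "question" is the QueryRequest field referenced in generate_answer().
payload = {"question": "What are the common symptoms of iron-deficiency anemia?"}

# Send the question to the FastAPI endpoint and print the generated answer.
response = requests.post(SPACE_URL, json=payload, timeout=120)
response.raise_for_status()
print(response.json())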