from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os
from huggingface_hub import login

# ✅ Read token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")

# ✅ Login only if token exists
if HF_TOKEN:
    login(token=HF_TOKEN)

# ✅ Initialize FastAPI
app = FastAPI()

# ✅ Define Base Model & LoRA Adapter Repository
base_model_name = "mistralai/Mistral-7B-v0.1"
lora_repo_id = "khushi1234455687/fine-tuned-medical-qa-V8"

# ✅ Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# ✅ Configure 4-bit Quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
    offload_buffers=True
)

# ✅ Load Base Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16
)

# ✅ Load LoRA Adapter
model = PeftModel.from_pretrained(base_model, lora_repo_id)
model.eval()

print("✅ Model is loaded and API is ready!")

# ✅ Define Request Body Format
class QueryRequest(BaseModel):
    question: str

@app.post("/generate")
async def generate_answer(request: QueryRequest):
    """Generate an answer for a given medical question."""
    inputs = tokenizer(request.question, return_tensors="pt").to("cuda")
    with torch.no_grad():
        output = model.generate(**inputs, max_length=256)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"question": request.question, "answer": answer}
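

# --- Usage sketch (not part of the original file) ---
# Assumptions: the app is served locally on port 7860 (the port Hugging Face
# Spaces conventionally expects); the question text below is a hypothetical
# example. Adjust host/port to your setup.
#
# Start the server:
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
# Example request:
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What are the common symptoms of anemia?"}'
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)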