from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os
from huggingface_hub import login
# ✅ Read token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")
# ✅ Login only if token exists
if HF_TOKEN:
    login(token=HF_TOKEN)
# ✅ Initialize FastAPI
app = FastAPI()
# ✅ Define Base Model & LoRA Adapter Repository
base_model_name = "mistralai/Mistral-7B-v0.1"
lora_repo_id = "khushi1234455687/fine-tuned-medical-qa-V8"
# ✅ Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# ✅ Configure 4-bit Quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
    offload_buffers=True
)
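# Note (assumption, not in the original config): bnb_4bit_compute_dtype defaults
# to float32; setting it to torch.float16 is a common tweak to speed up 4-bit
# inference on GPU.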
# ✅ Load Base Model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16
)
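# Note: with device_map="auto", accelerate places layers across the available
# GPU(s) and CPU; llm_int8_enable_fp32_cpu_offload above lets CPU-offloaded
# modules stay in fp32 instead of being quantized.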
# ✅ Load LoRA Adapter
model = PeftModel.from_pretrained(base_model, lora_repo_id)
model.eval()  # inference mode: disables dropout
print("✅ Model is loaded and API is ready!")
# ✅ Define Request Body Format
class QueryRequest(BaseModel):
    question: str
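# Example request body (hypothetical question, for illustration only):
#   {"question": "What are the common symptoms of iron-deficiency anemia?"}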
@app.post("/generate")
async def generate_answer(request: QueryRequest):
    """Generate an answer for a given medical question."""
    # Send inputs to the model's device rather than hardcoding "cuda",
    # so the endpoint still works when layers are offloaded to CPU
    inputs = tokenizer(request.question, return_tensors="pt").to(model.device)
    with torch.no_grad():
        # max_new_tokens bounds the generated continuation, not prompt + output
        output = model.generate(**inputs, max_new_tokens=256)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"question": request.question, "answer": answer}
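# ✅ Example usage (a sketch: assumes this file is saved as app.py; port 7860
# follows the Hugging Face Spaces convention):
#   uvicorn app:app --host 0.0.0.0 --port 7860
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"question": "What are the common symptoms of anemia?"}'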