from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os
from huggingface_hub import login
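
# Dependencies implied by the imports and loading options above (a sketch,
# not the Space's actual requirements file; pin versions as needed):
# fastapi, uvicorn, pydantic, torch, transformers, peft, bitsandbytes,
# accelerate, huggingface_hub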
# Read the access token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")

# Log in only if a token is set
if HF_TOKEN:
    login(token=HF_TOKEN)
# Initialize FastAPI
app = FastAPI()
# Base model and LoRA adapter repository
base_model_name = "mistralai/Mistral-7B-v0.1"
lora_repo_id = "khushi1234455687/fine-tuned-medical-qa-V8"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# Configure 4-bit quantization; fp32 CPU offload lets layers that do not
# fit in GPU memory fall back to the CPU
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the base model (offload_buffers is a from_pretrained argument,
# not a BitsAndBytesConfig field, so it belongs here)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
    offload_buffers=True,
)
# Attach the LoRA adapter to the quantized base model
model = PeftModel.from_pretrained(base_model, lora_repo_id)
model.eval()

print("Model is loaded and the API is ready!")
# Request body format
class QueryRequest(BaseModel):
    question: str
@app.post("/generate")
async def generate_answer(request: QueryRequest):
    """Generate an answer for a given medical question."""
    # Move inputs to whichever device the model was dispatched to,
    # rather than hard-coding "cuda"
    inputs = tokenizer(request.question, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=256)
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    return {"question": request.question, "answer": answer}
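
# --- Usage sketch ---
# A minimal way to run and query the API. The module name `app`, the port,
# the "/generate" route, and the example question are assumptions for
# illustration, not part of the original Space configuration.
#
# Serve the app:
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
# Query it from another process:
#   import requests
#   resp = requests.post(
#       "http://localhost:8000/generate",
#       json={"question": "What are the symptoms of iron-deficiency anemia?"},
#       timeout=300,
#   )
#   print(resp.json()["answer"])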