khushi1234455687 committed
Commit 6e7ba05 · verified · 1 Parent(s): 8e67518

Upload app.py

Files changed (1)
app.py +23 -18
app.py CHANGED
@@ -1,43 +1,48 @@
+import os
+
+# ✅ Set a writable cache directory inside `/app`
+os.environ["HF_HOME"] = "/app/huggingface"
+os.environ["TRANSFORMERS_CACHE"] = "/app/huggingface"
+os.environ["HF_HUB_CACHE"] = "/app/huggingface"
+
 from fastapi import FastAPI
 from pydantic import BaseModel
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from peft import PeftModel
-import os
 from huggingface_hub import login
 
 # ✅ Read token from Hugging Face Secrets
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-# ✅ Login only if token exists
+# ✅ Login only if token exists (Prevent writing to protected directories)
 if HF_TOKEN:
-    login(token=HF_TOKEN)
+    login(token=HF_TOKEN, cache_dir="/app/huggingface")
 
 # ✅ Initialize FastAPI
 app = FastAPI()
 
-# ✅ Define Base Model & LoRA Adapter Repository (Smaller Model for Hugging Face Spaces)
-base_model_name = "mistralai/Mistral-7B-Instruct-v0.2"  # 🔹 Using a smaller model
+# ✅ Define Base Model & LoRA Adapter Repository (Use a Smaller Model)
+base_model_name = "mistralai/Mistral-7B-Instruct-v0.1"  # 🔹 Switched to a smaller model
 lora_repo_id = "khushi1234455687/fine-tuned-medical-qa-V8"
 
-# ✅ Automatically Select CPU (Hugging Face Spaces Does NOT Support GPU)
-device = "cuda" if torch.cuda.is_available() else "cpu"
+# ✅ Force CPU Usage (Hugging Face Spaces Does NOT Support GPUs)
+device = "cpu"
 
 # ✅ Load Tokenizer
-tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+tokenizer = AutoTokenizer.from_pretrained(base_model_name, cache_dir="/app/huggingface")
 
-# ✅ Configure 4-bit Quantization
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True
-)
+# ✅ Configure 4-bit Quantization (Optimized for Spaces)
+quantization_config = BitsAndBytesConfig(load_in_4bit=True)
 
-# ✅ Load Base Model (Optimized for CPU)
+# ✅ Load Base Model
 try:
     base_model = AutoModelForCausalLM.from_pretrained(
         base_model_name,
         quantization_config=quantization_config,
-        device_map="auto",  # ✅ Automatically assigns layers to CPU
-        torch_dtype=torch.float16
+        device_map="cpu",
+        torch_dtype=torch.float16,
+        cache_dir="/app/huggingface"
     )
 except Exception as e:
     print(f"❌ Error loading base model: {e}")
@@ -45,8 +50,8 @@ except Exception as e:
 
 # ✅ Load LoRA Adapter
 try:
-    model = PeftModel.from_pretrained(base_model, lora_repo_id)
-    model.to(device)  # ✅ Ensure model is on the correct device
+    model = PeftModel.from_pretrained(base_model, lora_repo_id, cache_dir="/app/huggingface")
+    model.to(device)
     model.eval()
 except Exception as e:
     print(f"❌ Error loading LoRA adapter: {e}")
@@ -62,7 +67,7 @@ class QueryRequest(BaseModel):
 async def generate_answer(request: QueryRequest):
     """Generate an answer for a given medical question."""
     try:
-        inputs = tokenizer(request.question, return_tensors="pt").to(device)  # ✅ Move to device
+        inputs = tokenizer(request.question, return_tensors="pt").to(device)
         with torch.no_grad():
             output = model.generate(**inputs, max_length=256)
         answer = tokenizer.decode(output[0], skip_special_tokens=True)
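Once the updated Space is running, the endpoint can be exercised with a minimal client sketch like the one below. The route path and port are assumptions for illustration, since the @app.post(...) decorator and the server launch line fall outside the hunks above; only the `question` field of QueryRequest is confirmed by the diff.

import requests

# Assumed endpoint URL: the route decorator is not visible in this diff, so the
# "/generate" path and port 7860 are placeholders for illustration only.
SPACE_URL = "http://localhost:7860/generate"

# "question" is the QueryRequest field referenced in generate_answer().
payload = {"question": "What are the common symptoms of iron-deficiency anemia?"}

# Send the question to the FastAPI endpoint and print the generated answer.
response = requests.post(SPACE_URL, json=payload, timeout=120)
response.raise_for_status()
print(response.json())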