Backup-bdg committed on
Commit 0b73bfa · verified · 1 parent: 8203986

Update app.py

Files changed (1): app.py +14 -6
app.py CHANGED
@@ -3,6 +3,7 @@ import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from fastapi import FastAPI, HTTPException
+from transformers import BitsAndBytesConfig
 import uvicorn
 import json
 
@@ -13,17 +14,25 @@ app = FastAPI()
 CHECKPOINT = "bigcode/starcoder2-15b"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load model and tokenizer with ZeroGPU
+# Load model and tokenizer with 4-bit quantization
 @spaces.GPU(duration=120)
 def load_model_and_generate(prompt, max_length=256, temperature=0.2, top_p=0.95):
     try:
         # Initialize tokenizer
         tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
 
-        # Initialize model
+        # Configure 4-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True
+        )
+
+        # Initialize model with quantization
         model = AutoModelForCausalLM.from_pretrained(
             CHECKPOINT,
-            torch_dtype=torch.bfloat16,
+            quantization_config=quantization_config,
             device_map="auto"
         )
 
@@ -32,8 +41,7 @@ def load_model_and_generate(prompt, max_length=256, temperature=0.2, top_p=0.95)
             "text-generation",
             model=model,
             tokenizer=tokenizer,
-            device_map="auto",
-            torch_dtype=torch.bfloat16
+            device_map="auto"
         )
 
         # Format prompt for chat-like interaction
@@ -80,7 +88,7 @@ async def backdoor_chat(request: dict):
 
 # Gradio interface setup
 with gr.Blocks() as demo:
-    gr.Markdown("# StarCoder2-15B Chat Interface")
+    gr.Markdown("# StarCoder2-15B Chat Interface (4-bit Quantization)")
     gr.Markdown("Enter a prompt to generate code or simulate a chat. Use the API endpoint `/backdoor-chat` for programmatic access.")
 
     # Input components
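
The substance of this commit is the switch from a full bfloat16 load to 4-bit NF4 quantization, which cuts the roughly 30 GB of bfloat16 weights for a 15B-parameter model to under 10 GB. A minimal standalone sketch of the same loading path, assuming `transformers`, `accelerate`, and `bitsandbytes` are installed (the prompt string is illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

CHECKPOINT = "bigcode/starcoder2-15b"

# Same settings as the commit: NF4 4-bit weights, double quantization,
# bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT,
    quantization_config=bnb_config,
    device_map="auto",
)

# Illustrative prompt; sampling settings mirror the function defaults above.
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
output = model.generate(
    **inputs,
    max_new_tokens=64,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

This also explains why the diff drops `torch_dtype` from the `pipeline(...)` call: the compute dtype is now dictated by `bnb_4bit_compute_dtype` on the already-quantized model, so passing it again would be redundant.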
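For completeness, a hypothetical client call against the `/backdoor-chat` endpoint mentioned in the Gradio description. The `prompt` field name and port 7860 (the Hugging Face Spaces default) are assumptions, since the handler takes a plain `request: dict` and its schema isn't visible in this diff:

```python
import requests

# Hypothetical payload: field names are assumed; adjust to the
# actual schema in app.py.
resp = requests.post(
    "http://localhost:7860/backdoor-chat",
    json={"prompt": "Write a Python function that reverses a string."},
    timeout=120,
)
resp.raise_for_status()
print(resp.json())
```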