Spaces:

Backup-bdg
/

main-model

Runtime error

App Files Files Community

Backup-bdg committed on May 24

Commit

0b73bfa

verified ·

1 Parent(s): 8203986

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -6

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from fastapi import FastAPI, HTTPException
 import uvicorn
 import json
@@ -13,17 +14,25 @@ app = FastAPI()
 CHECKPOINT = "bigcode/starcoder2-15b"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-# Load model and tokenizer with ZeroGPU
 @spaces.GPU(duration=120)
 def load_model_and_generate(prompt, max_length=256, temperature=0.2, top_p=0.95):
     try:
         # Initialize tokenizer
         tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
-        # Initialize model
         model = AutoModelForCausalLM.from_pretrained(
             CHECKPOINT,
-            torch_dtype=torch.bfloat16,
             device_map="auto"
         )
@@ -32,8 +41,7 @@ def load_model_and_generate(prompt, max_length=256, temperature=0.2, top_p=0.95)
             "text-generation",
             model=model,
             tokenizer=tokenizer,
-            device_map="auto",
-            torch_dtype=torch.bfloat16
         )
         # Format prompt for chat-like interaction
@@ -80,7 +88,7 @@ async def backdoor_chat(request: dict):
 # Gradio interface setup
 with gr.Blocks() as demo:
-    gr.Markdown("# StarCoder2-15B Chat Interface")
     gr.Markdown("Enter a prompt to generate code or simulate a chat. Use the API endpoint `/backdoor-chat` for programmatic access.")
     # Input components

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 from fastapi import FastAPI, HTTPException
+from transformers import BitsAndBytesConfig
 import uvicorn
 import json
 CHECKPOINT = "bigcode/starcoder2-15b"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Load model and tokenizer with 4-bit quantization
 @spaces.GPU(duration=120)
 def load_model_and_generate(prompt, max_length=256, temperature=0.2, top_p=0.95):
     try:
         # Initialize tokenizer
         tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
+        # Configure 4-bit quantization
+        quantization_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_use_double_quant=True
+        )
+        # Initialize model with the 4-bit quantization config
         model = AutoModelForCausalLM.from_pretrained(
             CHECKPOINT,
+            quantization_config=quantization_config,
             device_map="auto"
         )
             "text-generation",
             model=model,
             tokenizer=tokenizer,
+            device_map="auto"
         )
         # Format prompt for chat-like interaction
 # Gradio interface setup
 with gr.Blocks() as demo:
+    gr.Markdown("# StarCoder2-15B Chat Interface (4-bit Quantization)")
     gr.Markdown("Enter a prompt to generate code or simulate a chat. Use the API endpoint `/backdoor-chat` for programmatic access.")
     # Input components