Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -68,14 +68,14 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
 # Load model
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 quantization_config = BitsAndBytesConfig(
-
-    bnb_4bit_compute_dtype=torch.bfloat16
+    load_in_8bit=True,
+    # bnb_4bit_compute_dtype=torch.bfloat16
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_ID,
     device_map="auto",
-
+    quantization_config=quantization_config,
     attn_implementation="flash_attention_2",
 )
 
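For readability, here is a minimal, self-contained sketch of the model-loading block as it looks after this commit. The imports and the MODEL_ID value are not part of the hunk above, so they are placeholders added only to make the snippet runnable on its own.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "your-org/your-model"  # placeholder; the real value is defined elsewhere in app.py

# Load model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 8-bit quantization via bitsandbytes; the 4-bit compute dtype stays commented out,
# matching the diff above.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    # bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=quantization_config,   # added in this commit
    attn_implementation="flash_attention_2",   # requires the flash-attn package and a supported GPU
)

With load_in_8bit=True and the config passed through quantization_config, the weights are loaded in 8-bit via bitsandbytes. Leaving bnb_4bit_compute_dtype commented out is consistent with that choice, since that option only affects 4-bit loading.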