Update app.py

app.py CHANGED
@@ -2,7 +2,7 @@ import subprocess
 import sys
 
 # Install the libraries if not already present
-subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers", "streamlit", "torch", "peft"])
+subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers", "streamlit", "torch", "bitsandbytes", "peft"])
 
 import streamlit as st
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
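Note on this hunk: the base checkpoint is a bitsandbytes 4-bit quantization ("bnb-4bit"), so transformers cannot deserialize it unless bitsandbytes is installed, which is what the added package fixes. The comment still overpromises, though: check_call shells out to pip on every app start regardless of what is installed. A minimal sketch of a genuinely conditional install (assuming, as holds for these five packages, that the pip name matches the importable module name):

import importlib.util
import subprocess
import sys

# Only call pip for packages whose import spec cannot be found;
# the diff's one-liner reinstalls everything on every run.
for pkg in ("transformers", "streamlit", "torch", "bitsandbytes", "peft"):
    if importlib.util.find_spec(pkg) is None:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])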
@@ -13,20 +13,15 @@ import torch
 BASE_MODEL_NAME = "unsloth/deepseek-r1-distill-llama-8b-unsloth-bnb-4bit"
 ADAPTER_MODEL_PATH = "lora_model"
 
-# Load the base model
-base_model = AutoModelForCausalLM.from_pretrained(
-    BASE_MODEL_NAME,
-    torch_dtype=torch.float32,  # use float32 on CPU
-    device_map="cpu"            # force CPU execution
-)
+# Load the base model
+base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME, torch_dtype=torch.float16, device_map="auto")
 # Apply the LoRA adapter
 model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL_PATH)
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
 
 def generate_response(prompt):
     """Generate a response from the model."""
-
-    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
+    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
     streamer = TextStreamer(tokenizer)
     with torch.no_grad():
         model.generate(**inputs, streamer=streamer, max_length=512)
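Note on this hunk: device_map="auto" lets accelerate decide where the 4-bit weights land, but the hard-coded .to("cuda") in generate_response still assumes a GPU is present. Two further quirks the diff leaves in place: Streamlit re-executes app.py top to bottom on every interaction, so the model is reloaded for every message, and generate_response returns None while max_length=512 counts the prompt against the token budget. A sketch addressing those points; the st.cache_resource decorator, model.device lookup, max_new_tokens swap, and return value are suggestions layered on top of the commit, not part of it:

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel

BASE_MODEL_NAME = "unsloth/deepseek-r1-distill-llama-8b-unsloth-bnb-4bit"
ADAPTER_MODEL_PATH = "lora_model"

@st.cache_resource  # load the weights once per process, not on every Streamlit rerun
def load_model():
    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_NAME, torch_dtype=torch.float16, device_map="auto"
    )
    model = PeftModel.from_pretrained(base, ADAPTER_MODEL_PATH)
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
    return model, tokenizer

model, tokenizer = load_model()

def generate_response(prompt):
    """Generate a response and return the decoded text."""
    # model.device follows wherever accelerate placed the weights,
    # instead of hard-coding "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextStreamer(tokenizer)
    with torch.no_grad():
        # max_new_tokens bounds only the generated continuation;
        # max_length=512 would count the prompt tokens too.
        output = model.generate(**inputs, streamer=streamer, max_new_tokens=512)
    # Decode only the new tokens, skipping the echoed prompt.
    return tokenizer.decode(output[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)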
@@ -59,4 +54,4 @@ if user_input:
     st.markdown(response)
 
     # Append assistant response
-    st.session_state.messages.append({"role": "assistant", "content": response})
+    st.session_state.messages.append({"role": "assistant", "content": response})
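Note on the last hunk: the append into st.session_state.messages only persists across reruns if the list is initialized defensively, and the history has to be replayed each run because Streamlit re-executes the whole script per interaction. The lines around this hunk are not shown in the diff, so the following is a hypothetical reconstruction using Streamlit's standard chat widgets (st.chat_input, st.chat_message) and the generate_response sketch above, not the file's actual content:

import streamlit as st

# Create the history once; later reruns reuse the same session state.
if "messages" not in st.session_state:
    st.session_state.messages = []

# Replay prior turns so the conversation survives Streamlit's rerun model.
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

user_input = st.chat_input("Ask the model something")
if user_input:
    st.session_state.messages.append({"role": "user", "content": user_input})
    with st.chat_message("assistant"):
        response = generate_response(user_input)  # sketched above
        st.markdown(response)
    # Append assistant response
    st.session_state.messages.append({"role": "assistant", "content": response})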