Update app.py

app.py CHANGED
@@ -1,4 +1,3 @@
-from unsloth import FastLanguageModel
 import torch
 import gradio as gr
 import os
@@ -7,8 +6,12 @@ from typing import Iterator
 from transformers import (
     AutoModelForCausalLM,
     BitsAndBytesConfig,
+    GenerationConfig,
+    AutoTokenizer,
     TextIteratorStreamer,
 )
+from peft import AutoPeftModelForCausalLM
+
 
 
 #deklarasi
@@ -23,21 +26,23 @@ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False
 ### Response:
 #{}"""
 
-if True:
-    from unsloth import FastLanguageModel
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name = "abdfajar707/llama3_8B_lora_model_rkp_pn2025_v3", # YOUR MODEL YOU USED FOR TRAINING
-        max_seq_length = max_seq_length,
-        dtype = dtype,
-        load_in_4bit = load_in_4bit,
-    )
-    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
-
-
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+model_id = "abdfajar707/llama3_8B_lora_model_rkp_pn2025_v3"
+#tokenizer = LlamaTokenizer.from_pretrained(model_id)
+#model, tokenizer = AutoModelForCausalLM.from_pretrained(
+#    model_id,
+#    device_map="auto",
+#    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+#)
+model = AutoPeftModelForCausalLM.from_pretrained(
+    model_id, # YOUR MODEL YOU USED FOR TRAINING
+    load_in_4bit = load_in_4bit,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 model.config.sliding_window = 4096
 model.eval()
@@ -128,7 +133,7 @@ chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Interlinked Sytem')
 
 chat_interface = gr.ChatInterface(
     fn=generate,
-
+    chatbot=chatbot,
     additional_inputs=[
         gr.Slider(
             label="Max new tokens",
@@ -183,5 +188,4 @@ with gr.Blocks(css=css, fill_height=True) as demo:
     chat_interface.render()
 
 if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
-
+    demo.queue(max_size=20).launch()
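
Note on the new loading path: the commit drops Unsloth's FastLanguageModel in favor of PEFT's AutoPeftModelForCausalLM plus a plain AutoTokenizer. A minimal standalone sketch of that path, assuming the adapter repo's adapter_config.json names its base model and that bitsandbytes is installed (load_in_4bit mirrors the diff; everything else here is just the diff's own calls):

    # Sketch only: AutoPeftModelForCausalLM reads the adapter config to find
    # the base model, loads it, then attaches the LoRA adapter on top.
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer

    model_id = "abdfajar707/llama3_8B_lora_model_rkp_pn2025_v3"
    model = AutoPeftModelForCausalLM.from_pretrained(
        model_id,
        load_in_4bit=True,  # 4-bit quantization via bitsandbytes, as in the diff
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model.eval()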
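The imports retain TextIteratorStreamer, and MAX_INPUT_TOKEN_LENGTH caps the prompt, which points at a streaming generate function. Its body is not part of this diff; the following is only a hedged sketch of the usual pattern, with every name beyond model and tokenizer assumed:

    # Sketch only: a typical TextIteratorStreamer-based generator; the real
    # generate body in app.py is not shown in this diff.
    from threading import Thread
    from transformers import TextIteratorStreamer

    def generate(message, history, max_new_tokens=1024):
        input_ids = tokenizer(message, return_tensors="pt").input_ids
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]  # truncate long prompts
        input_ids = input_ids.to(model.device)
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, skip_special_tokens=True
        )
        kwargs = dict(input_ids=input_ids, streamer=streamer,
                      max_new_tokens=max_new_tokens)
        Thread(target=model.generate, kwargs=kwargs).start()  # run in background
        partial = ""
        for text in streamer:
            partial += text
            yield partial  # Gradio re-renders the growing reply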
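The last two hunks pass the pre-configured Chatbot into ChatInterface (the keyword was previously absent, so the customized component with height and label went unused) and touch the final launch line. A sketch of the resulting wiring, with placeholder slider bounds assumed where the diff elides them:

    # Sketch only: generate's body is elided; the signature
    # (message, history, *additional_inputs) follows gr.ChatInterface conventions.
    import gradio as gr

    def generate(message, history, max_new_tokens=1024):
        yield "..."  # streamed tokens would come from the model

    chatbot = gr.Chatbot(height=450, label='Interlinked Sytem')  # label as in the diff

    chat_interface = gr.ChatInterface(
        fn=generate,
        chatbot=chatbot,  # reuse the customized Chatbot instead of the default
        additional_inputs=[
            gr.Slider(label="Max new tokens", minimum=1,
                      maximum=2048, value=1024),
        ],
    )

    with gr.Blocks(fill_height=True) as demo:
        chat_interface.render()

    if __name__ == "__main__":
        demo.queue(max_size=20).launch()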