abdfajar707 committed (verified)
Commit 9de2232 · Parent(s): 1b8df0e

Update app.py

Files changed (1): app.py (+21 −17)
app.py CHANGED
@@ -1,4 +1,3 @@
-from unsloth import FastLanguageModel
 import torch
 import gradio as gr
 import os
@@ -7,8 +6,12 @@ from typing import Iterator
 from transformers import (
     AutoModelForCausalLM,
     BitsAndBytesConfig,
+    GenerationConfig,
+    AutoTokenizer,
     TextIteratorStreamer,
 )
+from peft import AutoPeftModelForCausalLM
+


 #deklarasi
@@ -23,21 +26,23 @@ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False
 ### Response:
 #{}"""

-if True:
-    from unsloth import FastLanguageModel
-    model, tokenizer = FastLanguageModel.from_pretrained(
-        model_name = "abdfajar707/llama3_8B_lora_model_rkp_pn2025_v3", # YOUR MODEL YOU USED FOR TRAINING
-        max_seq_length = max_seq_length,
-        dtype = dtype,
-        load_in_4bit = load_in_4bit,
-    )
-    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
-
-
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = 4096 #int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+model_id = "abdfajar707/llama3_8B_lora_model_rkp_pn2025_v3"
+#tokenizer = LlamaTokenizer.from_pretrained(model_id)
+#model, tokenizer = AutoModelForCausalLM.from_pretrained(
+#    model_id,
+#    device_map="auto",
+#    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+#)
+model = AutoPeftModelForCausalLM.from_pretrained(
+    model_id, # YOUR MODEL YOU USED FOR TRAINING
+    load_in_4bit = load_in_4bit,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)

 model.config.sliding_window = 4096
 model.eval()
@@ -128,7 +133,7 @@ chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Interlinked Sytem

 chat_interface = gr.ChatInterface(
     fn=generate,
-    #chatbot=chatbot,
+    chatbot=chatbot,
     additional_inputs=[
         gr.Slider(
             label="Max new tokens",
@@ -183,5 +188,4 @@ with gr.Blocks(css=css, fill_height=True) as demo:
     chat_interface.render()

 if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
-
+    demo.queue(max_size=20).launch()
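Note: the substance of this commit is swapping the unsloth FastLanguageModel loader for peft's AutoPeftModelForCausalLM, which resolves the adapter repo's adapter_config.json, loads the 4-bit base model, and attaches the LoRA weights. A minimal standalone sketch of the new loading path, with the bare load_in_4bit kwarg spelled out as an explicit BitsAndBytesConfig; the compute dtype and device_map below are assumptions, not part of the commit:

import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, BitsAndBytesConfig

model_id = "abdfajar707/llama3_8B_lora_model_rkp_pn2025_v3"

# Roughly what load_in_4bit=True expands to internally; the compute dtype
# is an assumption here (the committed call leaves the library default).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# AutoPeftModelForCausalLM reads adapter_config.json from the adapter repo,
# loads the quantized base model, then attaches the LoRA adapter on top.
model = AutoPeftModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",  # assumption: not set in the committed call
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.eval()

One caveat on the committed version: AutoTokenizer.from_pretrained(model_id) only works because the adapter repo also carries the tokenizer files; if it did not, the tokenizer would have to be loaded from the base model repo instead.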
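The interface change wires fn=generate into gr.ChatInterface, and the new imports add TextIteratorStreamer, but the generate body sits outside the changed hunks. A sketch of what such a streaming handler typically looks like, assuming the module-level model, tokenizer, and token-budget constants from the diff; the prompt handling and parameter names are illustrative, not the committed code:

from threading import Thread
from typing import Iterator

from transformers import TextIteratorStreamer

def generate(message: str, chat_history: list, max_new_tokens: int = 1024) -> Iterator[str]:
    # Assumption: the real app formats `message` into the Alpaca-style
    # "### Response:" template visible in the hunk context.
    input_ids = tokenizer(message, return_tensors="pt").input_ids.to(model.device)

    # Trim the prompt to the configured budget (MAX_INPUT_TOKEN_LENGTH).
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # model.generate blocks, so run it on a worker thread and stream here.
    Thread(
        target=model.generate,
        kwargs=dict(input_ids=input_ids, streamer=streamer, max_new_tokens=max_new_tokens),
    ).start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial  # gr.ChatInterface expects the cumulative reply per yield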