desert committed · Commit 21886ee · 1 Parent(s): f84cd21
del
app.py CHANGED
@@ -2,20 +2,17 @@ import gradio as gr
 from unsloth import FastLanguageModel
 import torch
 
+max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
+dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
 
-# Load your model and tokenizer (make sure to adjust the path to where your model is stored)
-max_seq_length = 2048 # Adjust as necessary
-load_in_4bit = True # Enable 4-bit quantization for reduced memory usage
-model_path = "llama_lora_model_1" # Path to your custom model
-
-# Load the model and tokenizer
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name=model_path,
-    max_seq_length=max_seq_length,
-    load_in_4bit=load_in_4bit,
+    model_name = "llama_lora_model_1",
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
 )
 
-
 # Respond function
 def respond(
     message,
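Note: the diff cuts off at the start of respond(). For orientation, below is a minimal sketch of how the model and tokenizer loaded above are typically used inside a Gradio chat callback. The signature, the (user, assistant) history format, the chat-template prompting, and the generation settings are assumptions for illustration only, not part of this commit.

# Sketch only -- the real respond() body is not shown in this diff.
FastLanguageModel.for_inference(model)  # Unsloth's fast-inference mode

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Assumed gradio.ChatInterface-style history: list of (user, assistant) pairs
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    # Build the prompt with the tokenizer's chat template
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    output_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
    )
    # Decode only the newly generated tokens, not the prompt
    return tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True)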