speedup
Browse files
app.py
CHANGED
@@ -1,10 +1,15 @@
|
|
1 |
import gradio as gr
|
2 |
import spaces
|
|
|
3 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
4 |
|
5 |
-
# Load the model and tokenizer
|
6 |
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Chronoboros-33B-GPTQ")
|
7 |
model = AutoModelForCausalLM.from_pretrained("TheBloke/Chronoboros-33B-GPTQ", device_map="auto")
|
|
|
|
|
|
|
|
|
8 |
|
9 |
@spaces.GPU
|
10 |
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
|
@@ -17,21 +22,26 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
|
|
17 |
prompt += f"Assistant: {assistant_text}\n"
|
18 |
prompt += f"User: {message}\nAssistant: "
|
19 |
|
20 |
-
# Tokenize the prompt
|
21 |
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
30 |
new_tokens = output_ids[0][input_ids.shape[1]:]
|
31 |
-
|
32 |
-
#
|
33 |
-
|
34 |
-
|
|
|
35 |
yield current_response
|
36 |
|
37 |
# Configure the ChatInterface with additional inputs
|
|
|
1 |
import gradio as gr
|
2 |
import spaces
|
3 |
+
import torch
|
4 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
5 |
|
6 |
+
# Load the model and tokenizer
|
7 |
tokenizer = AutoTokenizer.from_pretrained("TheBloke/Chronoboros-33B-GPTQ")
|
8 |
model = AutoModelForCausalLM.from_pretrained("TheBloke/Chronoboros-33B-GPTQ", device_map="auto")
|
9 |
+
model.eval()  # explicit safeguard only: from_pretrained() already returns the model in evaluation mode
|
10 |
+
|
11 |
+
# Optional: Use torch.compile() if you're on PyTorch 2.0+ for further speed-up
|
12 |
+
# model = torch.compile(model)
|
13 |
|
14 |
@spaces.GPU
|
15 |
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
|
|
|
22 |
prompt += f"Assistant: {assistant_text}\n"
|
23 |
prompt += f"User: {message}\nAssistant: "
|
24 |
|
25 |
+
# Tokenize the prompt
|
26 |
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
|
27 |
+
|
28 |
+
# Generate the response with no gradients
|
29 |
+
with torch.no_grad():
|
30 |
+
output_ids = model.generate(
|
31 |
+
input_ids,
|
32 |
+
max_new_tokens=max_tokens,
|
33 |
+
temperature=temperature,
|
34 |
+
top_p=top_p,
|
35 |
+
do_sample=True,
|
36 |
+
)
|
37 |
+
|
38 |
+
# Extract the new tokens
|
39 |
new_tokens = output_ids[0][input_ids.shape[1]:]
|
40 |
+
|
41 |
+
# Simulated streaming: generate() above has already finished, so this only re-yields the completed text as growing prefixes (5 more tokens each step)
|
42 |
+
chunk_size = 5
|
43 |
+
for i in range(0, new_tokens.shape[0], chunk_size):
|
44 |
+
current_response = tokenizer.decode(new_tokens[: i + chunk_size], skip_special_tokens=True)
|
45 |
yield current_response
|
46 |
|
47 |
# Configure the ChatInterface with additional inputs
|