Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,13 +6,13 @@ import torch
|
|
6 |
|
7 |
print(torch.__version__)
|
8 |
|
9 |
-
import torch
|
10 |
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
11 |
import gradio as gr
|
12 |
from threading import Thread
|
|
|
13 |
|
14 |
-
MODEL_BIG = "HuggingFaceTB/SmolLM-
|
15 |
-
MODEL_SMALL = "HuggingFaceTB/SmolLM-
|
16 |
|
17 |
TITLE = "<h1><center>Auto-Guidance Playground</center></h1>"
|
18 |
SUB_TITLE = """<center>Auto-guidance was a technique made by NVIDIA for text-conditioned image models. This is a test of the concept with SmolLM.</center>"""
|
@@ -34,20 +34,21 @@ END_MESSAGE = """
|
|
34 |
**The conversation has reached to its end, please press "Clear" to restart a new conversation**
|
35 |
"""
|
36 |
|
37 |
-
device = "cpu" # for GPU usage or "cpu" for CPU usage
|
38 |
-
|
39 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_SMALL)
|
40 |
model_big = AutoModelForCausalLM.from_pretrained(
|
41 |
MODEL_BIG,
|
42 |
-
|
43 |
-
)
|
44 |
model_small = AutoModelForCausalLM.from_pretrained(
|
45 |
MODEL_SMALL,
|
46 |
-
|
47 |
-
)
|
|
|
|
|
|
|
48 |
|
49 |
-
if device == "cuda":
|
50 |
-
|
51 |
|
52 |
@spaces.GPU
|
53 |
def stream_chat(
|
@@ -84,7 +85,6 @@ def stream_chat(
|
|
84 |
logits_big = model_big(current_input).logits[:, -1, :]
|
85 |
|
86 |
probs_small = torch.softmax(logits_small / temperature, dim=-1)
|
87 |
-
probs_big = torch.softmax(logits_big / temperature, dim=-1)
|
88 |
|
89 |
interpolated_logits = logits_big + (guidance_scale - 1) * (logits_big - logits_small) * probs_small
|
90 |
|
|
|
6 |
|
7 |
print(torch.__version__)
|
8 |
|
|
|
9 |
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
|
10 |
import gradio as gr
|
11 |
from threading import Thread
|
12 |
+
import bitsandbytes as bnb
|
13 |
|
14 |
+
MODEL_BIG = "HuggingFaceTB/SmolLM-360M-Instruct"
|
15 |
+
MODEL_SMALL = "HuggingFaceTB/SmolLM-135M-Instruct"
|
16 |
|
17 |
TITLE = "<h1><center>Auto-Guidance Playground</center></h1>"
|
18 |
SUB_TITLE = """<center>Auto-guidance was a technique made by NVIDIA for text-conditioned image models. This is a test of the concept with SmolLM.</center>"""
|
|
|
34 |
**The conversation has reached to its end, please press "Clear" to restart a new conversation**
|
35 |
"""
|
36 |
|
|
|
|
|
37 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_SMALL)
|
38 |
model_big = AutoModelForCausalLM.from_pretrained(
|
39 |
MODEL_BIG,
|
40 |
+
load_in_8bit=True,
|
41 |
+
device_map="auto")
|
42 |
model_small = AutoModelForCausalLM.from_pretrained(
|
43 |
MODEL_SMALL,
|
44 |
+
load_in_8bit=True,
|
45 |
+
device_map="auto")
|
46 |
+
|
47 |
+
if model_big.device == "cuda":
|
48 |
+
model_big = torch.compile(model_big)
|
49 |
|
50 |
+
if model_small.device == "cuda":
|
51 |
+
model_small = torch.compile(model_small)
|
52 |
|
53 |
@spaces.GPU
|
54 |
def stream_chat(
|
|
|
85 |
logits_big = model_big(current_input).logits[:, -1, :]
|
86 |
|
87 |
probs_small = torch.softmax(logits_small / temperature, dim=-1)
|
|
|
88 |
|
89 |
interpolated_logits = logits_big + (guidance_scale - 1) * (logits_big - logits_small) * probs_small
|
90 |
|