Spaces:
Runtime error
Runtime error
JoFrost
committed on
Commit
·
3e1ba39
1
Parent(s):
02e0932
feat: remote logic
Browse files
- app.py +20 -71
- requirements.txt +6 -0
app.py
CHANGED
@@ -1,12 +1,14 @@
|
|
1 |
import gradio as gr
|
2 |
-
import os
|
3 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
4 |
-
from huggingface_hub import hf_hub_download
|
5 |
-
import numpy as np
|
6 |
import torch
|
7 |
-
import pickle
|
8 |
import numpy as np
|
9 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
from sklearn.linear_model import LogisticRegression
|
11 |
|
12 |
torch.set_grad_enabled(False) # avoid blowing up mem
|
@@ -22,6 +24,8 @@ params = {
|
|
22 |
"filename" : "layer_31/width_16k/average_l0_76/params.npz"
|
23 |
}
|
24 |
|
|
|
|
|
25 |
model_name = params["model_name"]
|
26 |
width = params["width"]
|
27 |
layer = params["layer"]
|
@@ -29,23 +33,17 @@ l0 = params["l0"]
|
|
29 |
sae_repo_id = params["sae_repo_id"]
|
30 |
filename = params["filename"]
|
31 |
|
32 |
-
C = 0.01
|
33 |
-
|
34 |
-
model = AutoModelForCausalLM.from_pretrained(
|
35 |
-
model_name,
|
36 |
-
device_map='auto',
|
37 |
-
torch_dtype=torch.bfloat16,
|
38 |
-
)
|
39 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
40 |
-
|
41 |
path_to_params = hf_hub_download(
|
42 |
repo_id=sae_repo_id,
|
43 |
filename=filename,
|
44 |
force_download=False,
|
|
|
45 |
)
|
46 |
|
|
|
|
|
47 |
params = np.load(path_to_params)
|
48 |
-
pt_params = {k: torch.from_numpy(v)
|
49 |
|
50 |
clf_name = f"linear_classifier_C_{C}_ "+ model_name + "_" + filename.split(".npz")[0]
|
51 |
clf_name = clf_name.replace(os.sep, "_")
|
@@ -53,49 +51,6 @@ clf_name = clf_name.replace(os.sep, "_")
|
|
53 |
with open(f"{clf_name}.pkl", 'rb') as model_file:
|
54 |
clf: LogisticRegression = pickle.load(model_file)
|
55 |
|
56 |
-
import torch.nn as nn
|
57 |
-
class JumpReLUSAE(nn.Module):
|
58 |
-
def __init__(self, d_model, d_sae):
|
59 |
-
# Note that we initialise these to zeros because we're loading in pre-trained weights.
|
60 |
-
# If you want to train your own SAEs then we recommend using blah
|
61 |
-
super().__init__()
|
62 |
-
self.W_enc = nn.Parameter(torch.zeros(d_model, d_sae))
|
63 |
-
self.W_dec = nn.Parameter(torch.zeros(d_sae, d_model))
|
64 |
-
self.threshold = nn.Parameter(torch.zeros(d_sae))
|
65 |
-
self.b_enc = nn.Parameter(torch.zeros(d_sae))
|
66 |
-
self.b_dec = nn.Parameter(torch.zeros(d_model))
|
67 |
-
|
68 |
-
def encode(self, input_acts):
|
69 |
-
pre_acts = input_acts @ self.W_enc + self.b_enc
|
70 |
-
mask = (pre_acts > self.threshold)
|
71 |
-
acts = mask * torch.nn.functional.relu(pre_acts)
|
72 |
-
return acts
|
73 |
-
|
74 |
-
def decode(self, acts):
|
75 |
-
return acts @ self.W_dec + self.b_dec
|
76 |
-
|
77 |
-
def forward(self, acts):
|
78 |
-
acts = self.encode(acts)
|
79 |
-
recon = self.decode(acts)
|
80 |
-
return recon
|
81 |
-
|
82 |
-
sae = JumpReLUSAE(params['W_enc'].shape[0], params['W_enc'].shape[1])
|
83 |
-
sae.load_state_dict(pt_params)
|
84 |
-
sae.to(dtype=torch.bfloat16).cuda()
|
85 |
-
|
86 |
-
@torch.no_grad()
|
87 |
-
def gather_residual_activations(model, target_layer, inputs):
|
88 |
-
target_act = None
|
89 |
-
def gather_target_act_hook(mod, inputs, outputs):
|
90 |
-
nonlocal target_act # make sure we can modify the target_act from the outer scope
|
91 |
-
target_act = outputs[0]
|
92 |
-
return outputs
|
93 |
-
handle = model.model.layers[target_layer].register_forward_hook(gather_target_act_hook)
|
94 |
-
_ = model.forward(inputs)
|
95 |
-
handle.remove()
|
96 |
-
return target_act
|
97 |
-
|
98 |
-
import requests
|
99 |
|
100 |
def get_feature_descriptions(feature):
|
101 |
layer_name = f"{layer}-gemmascope-res-{width}"
|
@@ -127,15 +82,14 @@ topk = 5
|
|
127 |
|
128 |
# Function to wrap in a FastAPI in case of
|
129 |
def get_activations(text):
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
sae_act = sae.encode(target_act)
|
134 |
return sae_act
|
135 |
|
136 |
def get_features(text):
|
137 |
sae_act = get_activations(text)
|
138 |
-
sae_act_aggregated = ((sae_act[:,:,:] > 0).sum(1) > 0).
|
139 |
|
140 |
X = pd.DataFrame(sae_act_aggregated)
|
141 |
|
@@ -159,8 +113,6 @@ def get_features(text):
|
|
159 |
descriptions.append(description)
|
160 |
contrib_df["description"] = descriptions
|
161 |
|
162 |
-
import plotly.graph_objs as go
|
163 |
-
|
164 |
fig = go.Figure(go.Bar(
|
165 |
x=contrib_df['contribution'],
|
166 |
y=contrib_df['description'],
|
@@ -190,17 +142,14 @@ def get_features(text):
|
|
190 |
return classes, fig, dropdown
|
191 |
|
192 |
def get_highlighted_text(text, feature):
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
target_act = gather_residual_activations(model, layer, inputs)
|
197 |
-
sae_act = sae.encode(target_act)
|
198 |
|
199 |
activated_tokens = sae_act[0:,:,feature]
|
200 |
max_activation = activated_tokens.max().item()
|
201 |
activated_tokens /= max_activation
|
202 |
|
203 |
-
activated_tokens = activated_tokens.
|
204 |
|
205 |
output = []
|
206 |
|
|
|
1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
2 |
import torch
|
|
|
3 |
import numpy as np
|
4 |
import pandas as pd
|
5 |
+
import pickle
|
6 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
+
import requests
|
9 |
+
import os
|
10 |
+
import msgpack_numpy as m
|
11 |
+
import plotly.graph_objs as go
|
12 |
from sklearn.linear_model import LogisticRegression
|
13 |
|
14 |
torch.set_grad_enabled(False) # avoid blowing up mem
|
|
|
24 |
"filename" : "layer_31/width_16k/average_l0_76/params.npz"
|
25 |
}
|
26 |
|
27 |
+
C = 0.01
|
28 |
+
|
29 |
model_name = params["model_name"]
|
30 |
width = params["width"]
|
31 |
layer = params["layer"]
|
|
|
33 |
sae_repo_id = params["sae_repo_id"]
|
34 |
filename = params["filename"]
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
path_to_params = hf_hub_download(
|
37 |
repo_id=sae_repo_id,
|
38 |
filename=filename,
|
39 |
force_download=False,
|
40 |
+
token=os.environ['TOKEN'],
|
41 |
)
|
42 |
|
43 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
44 |
+
|
45 |
params = np.load(path_to_params)
|
46 |
+
pt_params = {k: torch.from_numpy(v) for k, v in params.items()}
|
47 |
|
48 |
clf_name = f"linear_classifier_C_{C}_ "+ model_name + "_" + filename.split(".npz")[0]
|
49 |
clf_name = clf_name.replace(os.sep, "_")
|
|
|
51 |
with open(f"{clf_name}.pkl", 'rb') as model_file:
|
52 |
clf: LogisticRegression = pickle.load(model_file)
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
55 |
def get_feature_descriptions(feature):
|
56 |
layer_name = f"{layer}-gemmascope-res-{width}"
|
|
|
82 |
|
83 |
# Function to wrap in a FastAPI in case of
|
84 |
def get_activations(text):
|
85 |
+
response = requests.post("http://34.71.249.22:3000/execute_req", json={"query": text})
|
86 |
+
pack = m.unpackb(response.content)
|
87 |
+
sae_act = torch.from_numpy(pack["sae_act"]).to(dtype=torch.bfloat16)
|
|
|
88 |
return sae_act
|
89 |
|
90 |
def get_features(text):
|
91 |
sae_act = get_activations(text)
|
92 |
+
sae_act_aggregated = ((sae_act[:,:,:] > 0).sum(1) > 0).numpy()
|
93 |
|
94 |
X = pd.DataFrame(sae_act_aggregated)
|
95 |
|
|
|
113 |
descriptions.append(description)
|
114 |
contrib_df["description"] = descriptions
|
115 |
|
|
|
|
|
116 |
fig = go.Figure(go.Bar(
|
117 |
x=contrib_df['contribution'],
|
118 |
y=contrib_df['description'],
|
|
|
142 |
return classes, fig, dropdown
|
143 |
|
144 |
def get_highlighted_text(text, feature):
|
145 |
+
inputs = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True)
|
146 |
+
sae_act = get_activations(text)
|
|
|
|
|
|
|
147 |
|
148 |
activated_tokens = sae_act[0:,:,feature]
|
149 |
max_activation = activated_tokens.max().item()
|
150 |
activated_tokens /= max_activation
|
151 |
|
152 |
+
activated_tokens = activated_tokens.float().numpy()
|
153 |
|
154 |
output = []
|
155 |
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
transformers
|
2 |
+
numpy
|
3 |
+
torch
|
4 |
+
pandas
|
5 |
+
scikit-learn
|
6 |
+
msgpack-numpy
|