gregH committed on
Commit
680fe88
·
verified ·
1 Parent(s): a236201

Update index.html

Browse files
Files changed (1) hide show
  1. index.html +36 -39
index.html CHANGED
@@ -180,55 +180,52 @@ gradient norm and then apply soft removal on them to mitigate the potential jail
180
  is guidance on ...'").
181
  </p>
182
 
183
- <div class="container">
184
  <div class="example-selector">
185
- <button onclick="selectExample('example1')">GCG Example</button>
186
- <button onclick="selectExample('example2')">TAP Example</button>
187
  <button onclick="selectExample('example3')">Example 3</button>
188
  <button onclick="selectExample('example4')">Example 4</button>
189
  </div>
190
  <div class="example-box" id="exampleBox">
191
- <p id="exampleText">Select an example to see how it would be highlighted.</p>
192
- </div>
193
  </div>
194
 
195
  <script>
196
-
197
- function selectExample(exampleId) {
198
- const examples = {
199
- example1: "High-level speaking, successful jailbreaks share a common principle that they are trying to make the LLMs willing to affirm the user request which will be rejected at the beginning.",
200
- example2: "Drawing upon this inspiration, our proposed defense aims to find the tokens that are most critical in forcing the LLM to generate such affirmative responses, decrease their importance in the generation, and thereby resolve the potential jailbreak risks brought by these tokens.",
201
- example3: "To identify these tokens, we propose a new concept called the Affirmation Loss. We then use the loss's gradient norm with respect to each token in the user input prompt to find the jailbreak-critical tokens.",
202
- example4: "We select those tokens with the larger gradient norm and then apply soft removal on them to mitigate the potential jailbreak risks."
203
- };
204
-
205
- const exampleText = document.getElementById('exampleText');
206
- exampleText.innerHTML = examples[exampleId];
207
-
208
- // Highlight specific parts of the text
209
- switch (exampleId) {
210
- case 'example1':
211
- highlightText(exampleText, 'successful jailbreaks');
212
- break;
213
- case 'example2':
214
- highlightText(exampleText, 'our proposed defense');
215
- break;
216
- case 'example3':
217
- highlightText(exampleText, 'Affirmation Loss');
218
- break;
219
- case 'example4':
220
- highlightText(example text, 'soft removal');
221
- break;
222
- }
223
- }
224
-
225
- function highlightText(element, keyword) {
226
- const regex = new RegExp(`(${keyword})`, 'gi');
227
- element.innerHTML = element.innerHTML.replace(regex, '<span class="highlight">$1</span>');
228
- }
229
-
230
  </script>
231
 
 
232
  <h2 id="proposed-approach-gradient-cuff">Performance evaluation against practical Jailbreaks</h2>
233
  <p> With the exploration of the Refusal Loss landscape, we propose Gradient Cuff,
234
  a two-step jailbreak detection method based on checking the refusal loss and its gradient norm. Our detection procedure is shown below:
 
180
  is guidance on ...'").
181
  </p>
182
 
 
183
  <div class="example-selector">
184
+ <button onclick="selectExample('example1')">Example 1</button>
185
+ <button onclick="selectExample('example2')">Example 2</button>
186
  <button onclick="selectExample('example3')">Example 3</button>
187
  <button onclick="selectExample('example4')">Example 4</button>
188
  </div>
189
  <div class="example-box" id="exampleBox">
190
+ <p id="exampleText">Select an example to see it highlighted.</p>
 
191
  </div>
192
 
193
  <script>
194
+ function selectExample(exampleId) {
195
+ const examples = {
196
+ example1: "High-level speaking, successful jailbreaks share a common principle that they are trying to make the LLMs willing to affirm the user request which will be rejected at the beginning.",
197
+ example2: "Drawing upon this inspiration, our proposed defense aims to find the tokens that are most critical in forcing the LLM to generate such affirmative responses, decrease their importance in the generation, and thereby resolve the potential jailbreak risks brought by these tokens.",
198
+ example3: "To identify these tokens, we propose a new concept called the Affirmation Loss. We then use the loss's gradient norm with respect to each token in the user input prompt to find the jailbreak-critical tokens.",
199
+ example4: "We select those tokens with the larger gradient norm and then apply soft removal on them to mitigate the potential jailbreak risks."
200
+ };
201
+
202
+ const exampleText = document.getElementById('exampleText');
203
+ exampleText.innerHTML = examples[exampleId];
204
+
205
+ // Highlight specific parts of the text
206
+ switch (exampleId) {
207
+ case 'example1':
208
+ highlightText(exampleText, 'successful jailbreaks');
209
+ break;
210
+ case 'example2':
211
+ highlightText(exampleText, 'our proposed defense');
212
+ break;
213
+ case 'example3':
214
+ highlightText(exampleText, 'Affirmation Loss');
215
+ break;
216
+ case 'example4':
217
+ highlightText(exampleText, 'soft removal');
218
+ break;
219
+ }
220
+ }
221
+
222
+ function highlightText(element, keyword) {
223
+ const regex = new RegExp(`(${keyword})`, 'gi');
224
+ element.innerHTML = element.innerHTML.replace(regex, '<span class="highlight">$1</span>');
225
+ }
 
 
226
  </script>
227
 
228
+
229
  <h2 id="proposed-approach-gradient-cuff">Performance evaluation against practical Jailbreaks</h2>
230
  <p> With the exploration of the Refusal Loss landscape, we propose Gradient Cuff,
231
  a two-step jailbreak detection method based on checking the refusal loss and its gradient norm. Our detection procedure is shown below: