Spaces:
Runtime error
pseudotensor
committed
Commit · 83d9f95
1 Parent(s): 8910711
Update with h2oGPT hash d2fec0293c2259c210f6d808282cb70b2466130b
app.py
CHANGED
@@ -34,6 +34,7 @@ admin_pass = os.getenv("ADMIN_PASS")
 # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
 raise_generate_gpu_exceptions = True
 
+eval_extra_columns = ['prompt', 'response', 'score']
 
 def main(
         load_8bit: bool = False,
@@ -144,12 +145,12 @@ def main(
     if not gradio:
         if eval_sharegpt_prompts_only > 0:
             # override default examples with shareGPT ones for human-level eval purposes only
-
-            if not os.path.isfile(
+            eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
+            if not os.path.isfile(eval_filename):
                 os.system(
-                    'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' %
+                    'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
                 import json
-            data = json.load(open(
+            data = json.load(open(eval_filename, 'rt'))
             # focus on data that starts with human, else likely chopped from other data
             turn_start = 0  # odd in general
             data = [x for x in data if len(x['conversations']) > turn_start + 1 and
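Note: the downloaded ShareGPT file is a JSON list of conversation records; the from == 'human' / from == 'gpt' filtering in this hunk and the next assumes each record looks roughly like the following sketch (field values are illustrative, not taken from the dataset):

    # illustrative shape of one ShareGPT record
    example_record = {
        "id": "abc123",
        "conversations": [
            {"from": "human", "value": "What is the capital of France?"},
            {"from": "gpt", "value": "The capital of France is Paris."},
        ],
    }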
@@ -165,12 +166,29 @@ def main(
                 assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
                 output = data[i]['conversations'][turn_start + 1]['value']
                 examplenew = example1.copy()
-
-                examplenew[
-                examplenew[
+                assert not chat, "No gradio must use chat=False, uses nochat isntruct"
+                examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
+                examplenew[eval_func_param_names.index('iinput_nochat')] = ''  # no input
+                examplenew[eval_func_param_names.index('context')] = ''  # no context
                 examples.append(examplenew)
                 responses.append(output)
 
+            num_examples = len(examples)
+            scoring_path = 'scoring'
+            os.makedirs(scoring_path, exist_ok=True)
+            if eval_sharegpt_as_output:
+                used_base_model = 'gpt35'
+                used_lora_weights = ''
+            else:
+                used_base_model = str(base_model.split('/')[-1])
+                used_lora_weights = str(lora_weights.split('/')[-1])
+            eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
+                                                                     eval_sharegpt_prompts_only_seed,
+                                                                     eval_sharegpt_as_output,
+                                                                     used_base_model,
+                                                                     used_lora_weights)
+            eval_filename = os.path.join(scoring_path, eval_filename)
+
         with torch.device("cuda"):
             # ensure was set right above before examples generated
             assert not stream_output, "stream_output=True does not make sense with example loop"
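Note: with this hunk the eval path decides the output location up front: scores go to scoring/df_scores_<num_examples>_<prompts_only>_<seed>_<as_output>_<base_model>_<lora>.parquet, one column per entry of eval_func_param_names plus the new eval_extra_columns. A minimal sketch for inspecting a finished run (the filename below is illustrative):

    import pandas as pd

    # assumed example path following the pattern built above
    df = pd.read_parquet("scoring/df_scores_100_100_1234_False_gpt35_.parquet")
    print(df[["prompt", "response", "score"]].head())
    print("mean score:", df["score"].mean())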
@@ -183,7 +201,7 @@ def main(
             if not eval_sharegpt_as_output:
                 model, tokenizer, device = get_model(**locals())
                 model_state = [model, tokenizer, device, base_model]
-                fun = partial(evaluate, model_state, debug=debug,
+                fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir)
             else:
                 assert eval_sharegpt_prompts_only > 0
 
@@ -194,15 +212,17 @@ def main(
                 fun = get_response
             t0 = time.time()
             score_dump = []
-            num_examples = len(examples)
 
             import matplotlib.pyplot as plt
 
             for exi, ex in enumerate(examples):
+                instruction = ex[eval_func_param_names.index('instruction_nochat')]
+                iinput = ex[eval_func_param_names.index('iinput_nochat')]
+                context = ex[eval_func_param_names.index('context')]
                 clear_torch_cache()
                 print("")
                 print("START" + "=" * 100)
-                print("Question: %s %s" % (
+                print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
                 print("-" * 105)
                 # fun yields as generator, so have to iterate over it
                 # Also means likely do NOT want --stream_output=True, else would show all generations
@@ -211,14 +231,14 @@ def main(
                 if smodel:
                     score_with_prompt = False
                     if score_with_prompt:
-                        data_point = dict(instruction=
+                        data_point = dict(instruction=instruction, input=iinput, context=context)
                         prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
                         prompt = prompter.generate_prompt(data_point)
                     else:
                         # just raw input and output
-                        assert
-                        assert
-                        prompt =
+                        assert iinput in [None, '']  # should be no iinput
+                        assert context in [None, '']  # should be no context
+                        prompt = instruction
                     cutoff_len = 768 if is_low_mem else 2048
                     inputs = stokenizer(prompt, res,
                                         return_tensors="pt",
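Note: in the score_with_prompt=False branch the scorer sees only the raw question and answer. Assuming smodel/stokenizer are a sequence-classification reward model and its tokenizer (the model name below is an assumption for illustration, not taken from this diff), the scoring step is roughly:

    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"  # assumed scorer
    stokenizer = AutoTokenizer.from_pretrained(reward_name)
    smodel = AutoModelForSequenceClassification.from_pretrained(reward_name)

    prompt = "What is the capital of France?"   # illustrative question
    res = "The capital of France is Paris."     # illustrative model response
    cutoff_len = 2048

    # tokenize question/answer as a pair and score it
    inputs = stokenizer(prompt, res, return_tensors="pt", truncation=True, max_length=cutoff_len)
    with torch.no_grad():
        score = torch.sigmoid(smodel(**inputs).logits[0].float()).item()
    print("score:", score)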
@@ -246,30 +266,16 @@ def main(
                     print("SCORE %s: %s" % (exi, score), flush=True)
                 score_dump.append(ex + [prompt, res, score])
                 # dump every score in case abort
-                scoring_path = 'scoring'
-                os.makedirs(scoring_path, exist_ok=True)
-                if eval_sharegpt_as_output:
-                    used_base_model = 'gpt35'
-                    used_lora_weights = ''
-                else:
-                    used_base_model = str(base_model.split('/')[-1])
-                    used_lora_weights = str(lora_weights.split('/')[-1])
                 df_scores = pd.DataFrame(score_dump,
-                                         columns=eval_func_param_names +
-
-                                         eval_sharegpt_prompts_only_seed,
-                                         eval_sharegpt_as_output,
-                                         used_base_model,
-                                         used_lora_weights)
-                filename = os.path.join(scoring_path, filename)
-                df_scores.to_parquet(filename, index=False)
+                                         columns=eval_func_param_names + eval_extra_columns)
+                df_scores.to_parquet(eval_filename, index=False)
                 # plot histogram so far
                 plt.figure(figsize=(10, 10))
                 plt.hist(df_scores['score'], bins=20)
                 score_avg = np.mean(df_scores['score'])
                 score_median = np.median(df_scores['score'])
                 plt.title("Score avg: %s median: %s" % (score_avg, score_median))
-                plt.savefig(
+                plt.savefig(eval_filename.replace('.parquet', '.png'))
                 plt.close()
 
                 print("END" + "=" * 102)
@@ -278,7 +284,8 @@ def main(
                 print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
             t1 = time.time()
             print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
-        return
+        return eval_filename
+
     if gradio:
         go_gradio(**locals())
 
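Note: main() now returns the path of the scoring parquet instead of returning nothing, so a non-gradio eval run can be driven programmatically. A hedged sketch (argument names follow parameters referenced in the diff; the model name and values are illustrative and defaults may differ):

    from app import main  # this Space's app.py

    eval_filename = main(base_model="h2oai/h2ogpt-oig-oasst1-512-6.9b",  # illustrative model
                         gradio=False, chat=False, stream_output=False,
                         eval_sharegpt_prompts_only=100)
    print("scores written to", eval_filename)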
@@ -774,7 +781,7 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
                                        visible=not is_public and False)
                do_sample = gr.Checkbox(label="Sample", info="Enable sampler, required for use of temperature, top_p, top_k",
                                        value=kwargs['do_sample'])
-                temperature = gr.Slider(minimum=0, maximum=3,
+                temperature = gr.Slider(minimum=0.01, maximum=3,
                                         value=kwargs['temperature'],
                                         label="Temperature",
                                         info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
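Note: raising the slider minimum from 0 to 0.01 avoids a degenerate setting: with sampling enabled the logits are divided by the temperature before softmax, so 0 is ill-defined, while 0.01 stays near-greedy but valid. A tiny illustration:

    import torch

    logits = torch.tensor([2.0, 1.0, 0.5])
    temperature = 0.01  # new slider minimum: sharply peaked, but still a valid distribution
    probs = torch.softmax(logits / temperature, dim=-1)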
@@ -984,6 +991,11 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
        instruction_nochat_arg_id = eval_func_param_names.index('instruction_nochat')
        question = args_list[instruction_nochat_arg_id]
 
+        if question is None:
+            return 'Response Score: Bad Question'
+        if answer is None:
+            return 'Response Score: Bad Answer'
+
        question = question[-cutoff_len:]
        answer = answer[-cutoff_len:]
 
@@ -1307,10 +1319,12 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
                                    outputs=[model_state, model_used, lora_used, prompt_type])
            prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
            chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
+            nochat_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output_nochat)
            if not is_public:
                load_model_event = load_model_button.click(**load_model_args) \
                    .then(**prompt_update_args) \
                    .then(**chatbot_update_args) \
+                    .then(**nochat_update_args) \
                    .then(clear_torch_cache)
 
            load_model_args2 = dict(fn=load_model,
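Note: the added nochat_update_args mirrors chatbot_update_args but targets text_output_nochat, and the extra .then() call runs after the earlier ones once the click handler finishes. A minimal sketch of this gradio chaining pattern (component names here are illustrative, not the app's):

    import gradio as gr

    with gr.Blocks() as demo:
        load_btn = gr.Button("Load model")
        chat_out = gr.Chatbot()
        nochat_out = gr.Textbox()
        # each .then() runs sequentially after the click handler completes
        load_btn.click(fn=lambda: [], outputs=chat_out) \
            .then(fn=lambda: "", outputs=nochat_out)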
@@ -1735,6 +1749,7 @@ def get_generate_params(model_lower, chat,
    if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
        prompt_type = inv_prompt_type_to_model_lower[model_lower]
 
+    # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
    if show_examples is None:
        if chat:
            show_examples = False
@@ -1831,6 +1846,7 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
    repetition_penalty = repetition_penalty or 1.07
    num_return_sequences = min(num_beams, num_return_sequences or 1)
    do_sample = False if do_sample is None else do_sample
+    # doesn't include chat, instruction_nochat, iinput_nochat, added later
    params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
                   early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
 
@@ -1874,10 +1890,11 @@ y = np.random.randint(0, 1, 100)
    src_lang = "English"
    tgt_lang = "Russian"
 
-    #
-
-
-
+    # move to correct position
+    for example in examples:
+        example += [chat, '', '']
+        # adjust examples if non-chat mode
+        if not chat:
            example[eval_func_param_names.index('instruction_nochat')] = example[
                eval_func_param_names.index('instruction')]
            example[eval_func_param_names.index('instruction')] = ''