Spaces:
Runtime error
pseudotensor
committed
Commit · 83d9f95
1 Parent(s): 8910711
Update with h2oGPT hash d2fec0293c2259c210f6d808282cb70b2466130b
app.py
CHANGED
@@ -34,6 +34,7 @@ admin_pass = os.getenv("ADMIN_PASS")
 # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
 raise_generate_gpu_exceptions = True
 
+eval_extra_columns = ['prompt', 'response', 'score']
 
 def main(
         load_8bit: bool = False,
@@ -144,12 +145,12 @@ def main(
     if not gradio:
         if eval_sharegpt_prompts_only > 0:
             # override default examples with shareGPT ones for human-level eval purposes only
-
-            if not os.path.isfile(
+            eval_filename = 'ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json'
+            if not os.path.isfile(eval_filename):
                 os.system(
-                    'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' %
+                    'wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/%s' % eval_filename)
                 import json
-            data = json.load(open(
+            data = json.load(open(eval_filename, 'rt'))
             # focus on data that starts with human, else likely chopped from other data
             turn_start = 0  # odd in general
             data = [x for x in data if len(x['conversations']) > turn_start + 1 and
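Note: the downloaded ShareGPT file is a JSON list of conversation records; the from == 'human' / from == 'gpt' filtering in this hunk and the next assumes each record looks roughly like the following sketch (field values are illustrative, not taken from the dataset):

    # illustrative shape of one ShareGPT record
    example_record = {
        "id": "abc123",
        "conversations": [
            {"from": "human", "value": "What is the capital of France?"},
            {"from": "gpt", "value": "The capital of France is Paris."},
        ],
    }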
@@ -165,12 +166,29 @@ def main(
                 assert data[i]['conversations'][turn_start + 1]['from'] == 'gpt'
                 output = data[i]['conversations'][turn_start + 1]['value']
                 examplenew = example1.copy()
-
-                examplenew[
-                examplenew[
+                assert not chat, "No gradio must use chat=False, uses nochat isntruct"
+                examplenew[eval_func_param_names.index('instruction_nochat')] = instruction
+                examplenew[eval_func_param_names.index('iinput_nochat')] = ''  # no input
+                examplenew[eval_func_param_names.index('context')] = ''  # no context
                 examples.append(examplenew)
                 responses.append(output)
 
+            num_examples = len(examples)
+            scoring_path = 'scoring'
+            os.makedirs(scoring_path, exist_ok=True)
+            if eval_sharegpt_as_output:
+                used_base_model = 'gpt35'
+                used_lora_weights = ''
+            else:
+                used_base_model = str(base_model.split('/')[-1])
+                used_lora_weights = str(lora_weights.split('/')[-1])
+            eval_filename = "df_scores_%s_%s_%s_%s_%s_%s.parquet" % (num_examples, eval_sharegpt_prompts_only,
+                                                                     eval_sharegpt_prompts_only_seed,
+                                                                     eval_sharegpt_as_output,
+                                                                     used_base_model,
+                                                                     used_lora_weights)
+            eval_filename = os.path.join(scoring_path, eval_filename)
+
         with torch.device("cuda"):
             # ensure was set right above before examples generated
             assert not stream_output, "stream_output=True does not make sense with example loop"
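Note: with this hunk the eval path decides the output location up front: scores go to scoring/df_scores_<num_examples>_<prompts_only>_<seed>_<as_output>_<base_model>_<lora>.parquet, one column per entry of eval_func_param_names plus the new eval_extra_columns. A minimal sketch for inspecting a finished run (the filename below is illustrative):

    import pandas as pd

    # assumed example path following the pattern built above
    df = pd.read_parquet("scoring/df_scores_100_100_1234_False_gpt35_.parquet")
    print(df[["prompt", "response", "score"]].head())
    print("mean score:", df["score"].mean())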
@@ -183,7 +201,7 @@ def main(
             if not eval_sharegpt_as_output:
                 model, tokenizer, device = get_model(**locals())
                 model_state = [model, tokenizer, device, base_model]
-                fun = partial(evaluate, model_state, debug=debug,
+                fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir)
             else:
                 assert eval_sharegpt_prompts_only > 0
 
@@ -194,15 +212,17 @@ def main(
                 fun = get_response
             t0 = time.time()
             score_dump = []
-            num_examples = len(examples)
 
             import matplotlib.pyplot as plt
 
             for exi, ex in enumerate(examples):
+                instruction = ex[eval_func_param_names.index('instruction_nochat')]
+                iinput = ex[eval_func_param_names.index('iinput_nochat')]
+                context = ex[eval_func_param_names.index('context')]
                 clear_torch_cache()
                 print("")
                 print("START" + "=" * 100)
-                print("Question: %s %s" % (
+                print("Question: %s %s" % (instruction, ('input=%s' % iinput if iinput else '')))
                 print("-" * 105)
                 # fun yields as generator, so have to iterate over it
                 # Also means likely do NOT want --stream_output=True, else would show all generations
@@ -211,14 +231,14 @@ def main(
                 if smodel:
                     score_with_prompt = False
                     if score_with_prompt:
-                        data_point = dict(instruction=
+                        data_point = dict(instruction=instruction, input=iinput, context=context)
                         prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
                         prompt = prompter.generate_prompt(data_point)
                     else:
                         # just raw input and output
-                        assert
-                        assert
-                        prompt =
+                        assert iinput in [None, '']  # should be no iinput
+                        assert context in [None, '']  # should be no context
+                        prompt = instruction
                     cutoff_len = 768 if is_low_mem else 2048
                     inputs = stokenizer(prompt, res,
                                         return_tensors="pt",
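Note: in the score_with_prompt=False branch the scorer sees only the raw question and answer. Assuming smodel/stokenizer are a sequence-classification reward model and its tokenizer (the model name below is an assumption for illustration, not taken from this diff), the scoring step is roughly:

    import torch
    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    reward_name = "OpenAssistant/reward-model-deberta-v3-large-v2"  # assumed scorer
    stokenizer = AutoTokenizer.from_pretrained(reward_name)
    smodel = AutoModelForSequenceClassification.from_pretrained(reward_name)

    prompt = "What is the capital of France?"   # illustrative question
    res = "The capital of France is Paris."     # illustrative model response
    cutoff_len = 2048

    # tokenize question/answer as a pair and score it
    inputs = stokenizer(prompt, res, return_tensors="pt", truncation=True, max_length=cutoff_len)
    with torch.no_grad():
        score = torch.sigmoid(smodel(**inputs).logits[0].float()).item()
    print("score:", score)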
@@ -246,30 +266,16 @@ def main(
                     print("SCORE %s: %s" % (exi, score), flush=True)
                 score_dump.append(ex + [prompt, res, score])
                 # dump every score in case abort
-                scoring_path = 'scoring'
-                os.makedirs(scoring_path, exist_ok=True)
-                if eval_sharegpt_as_output:
-                    used_base_model = 'gpt35'
-                    used_lora_weights = ''
-                else:
-                    used_base_model = str(base_model.split('/')[-1])
-                    used_lora_weights = str(lora_weights.split('/')[-1])
                 df_scores = pd.DataFrame(score_dump,
-                                         columns=eval_func_param_names +
-
-                                         eval_sharegpt_prompts_only_seed,
-                                         eval_sharegpt_as_output,
-                                         used_base_model,
-                                         used_lora_weights)
-                filename = os.path.join(scoring_path, filename)
-                df_scores.to_parquet(filename, index=False)
+                                         columns=eval_func_param_names + eval_extra_columns)
+                df_scores.to_parquet(eval_filename, index=False)
                 # plot histogram so far
                 plt.figure(figsize=(10, 10))
                 plt.hist(df_scores['score'], bins=20)
                 score_avg = np.mean(df_scores['score'])
                 score_median = np.median(df_scores['score'])
                 plt.title("Score avg: %s median: %s" % (score_avg, score_median))
-                plt.savefig(
+                plt.savefig(eval_filename.replace('.parquet', '.png'))
                 plt.close()
 
                 print("END" + "=" * 102)
@@ -278,7 +284,8 @@ def main(
                 print("Time taken so far: %.4f about %.4g per example" % (t2 - t0, (t2 - t0) / (1 + exi)))
             t1 = time.time()
             print("Total time taken: %.4f about %.4g per example" % (t1 - t0, (t1 - t0) / num_examples))
-        return
+        return eval_filename
+
     if gradio:
         go_gradio(**locals())
 
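Note: main() now returns the path of the scoring parquet instead of returning nothing, so a non-gradio eval run can be driven programmatically. A hedged sketch (argument names follow parameters referenced in the diff; the model name and values are illustrative and defaults may differ):

    from app import main  # this Space's app.py

    eval_filename = main(base_model="h2oai/h2ogpt-oig-oasst1-512-6.9b",  # illustrative model
                         gradio=False, chat=False, stream_output=False,
                         eval_sharegpt_prompts_only=100)
    print("scores written to", eval_filename)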
@@ -774,7 +781,7 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
                                        visible=not is_public and False)
                do_sample = gr.Checkbox(label="Sample", info="Enable sampler, required for use of temperature, top_p, top_k",
                                        value=kwargs['do_sample'])
-                temperature = gr.Slider(minimum=0, maximum=3,
+                temperature = gr.Slider(minimum=0.01, maximum=3,
                                         value=kwargs['temperature'],
                                         label="Temperature",
                                         info="Lower is deterministic (but may lead to repeats), Higher more creative (but may lead to hallucinations)")
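Note: raising the slider minimum from 0 to 0.01 avoids a degenerate setting: with sampling enabled the logits are divided by the temperature before softmax, so 0 is ill-defined, while 0.01 stays near-greedy but valid. A tiny illustration:

    import torch

    logits = torch.tensor([2.0, 1.0, 0.5])
    temperature = 0.01  # new slider minimum: sharply peaked, but still a valid distribution
    probs = torch.softmax(logits / temperature, dim=-1)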
@@ -984,6 +991,11 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
        instruction_nochat_arg_id = eval_func_param_names.index('instruction_nochat')
        question = args_list[instruction_nochat_arg_id]
 
+        if question is None:
+            return 'Response Score: Bad Question'
+        if answer is None:
+            return 'Response Score: Bad Answer'
+
        question = question[-cutoff_len:]
        answer = answer[-cutoff_len:]
 
@@ -1307,10 +1319,12 @@ body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
                                    outputs=[model_state, model_used, lora_used, prompt_type])
            prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
            chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
+            nochat_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output_nochat)
            if not is_public:
                load_model_event = load_model_button.click(**load_model_args) \
                    .then(**prompt_update_args) \
                    .then(**chatbot_update_args) \
+                    .then(**nochat_update_args) \
                    .then(clear_torch_cache)
 
            load_model_args2 = dict(fn=load_model,
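Note: the added nochat_update_args mirrors chatbot_update_args but targets text_output_nochat, and the extra .then() call runs after the earlier ones once the click handler finishes. A minimal sketch of this gradio chaining pattern (component names here are illustrative, not the app's):

    import gradio as gr

    with gr.Blocks() as demo:
        load_btn = gr.Button("Load model")
        chat_out = gr.Chatbot()
        nochat_out = gr.Textbox()
        # each .then() runs sequentially after the click handler completes
        load_btn.click(fn=lambda: [], outputs=chat_out) \
            .then(fn=lambda: "", outputs=nochat_out)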
@@ -1735,6 +1749,7 @@ def get_generate_params(model_lower, chat,
    if not prompt_type and model_lower in inv_prompt_type_to_model_lower:
        prompt_type = inv_prompt_type_to_model_lower[model_lower]
 
+    # examples at first don't include chat, instruction_nochat, iinput_nochat, added at end
    if show_examples is None:
        if chat:
            show_examples = False
@@ -1831,6 +1846,7 @@ Philipp: ok, ok you can find everything here. https://huggingface.co/blog/the-pa
    repetition_penalty = repetition_penalty or 1.07
    num_return_sequences = min(num_beams, num_return_sequences or 1)
    do_sample = False if do_sample is None else do_sample
+    # doesn't include chat, instruction_nochat, iinput_nochat, added later
    params_list = ["", stream_output, prompt_type, temperature, top_p, top_k, num_beams, max_new_tokens, min_new_tokens,
                   early_stopping, max_time, repetition_penalty, num_return_sequences, do_sample]
 
@@ -1874,10 +1890,11 @@ y = np.random.randint(0, 1, 100)
    src_lang = "English"
    tgt_lang = "Russian"
 
-    #
-
-
-
+    # move to correct position
+    for example in examples:
+        example += [chat, '', '']
+        # adjust examples if non-chat mode
+        if not chat:
            example[eval_func_param_names.index('instruction_nochat')] = example[
                eval_func_param_names.index('instruction')]
            example[eval_func_param_names.index('instruction')] = ''