Spaces:
Running
Running
pseudotensor
committed on
Commit
•
0a5ce48
1
Parent(s):
190bc9c
Update with h2ogpt hash 24c76a5944a7bc0ee6249ecab5ff915592771e88
Browse files
app.py
CHANGED
@@ -27,6 +27,11 @@ from finetune import get_loaders, example_data_points, generate_prompt, get_gith
|
|
27 |
human, bot, prompt_type_to_model_name, inv_prompt_type_to_model_lower
|
28 |
from stopping import CallbackToGenerator, Stream, StoppingCriteriaSub
|
29 |
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def main(
|
32 |
load_8bit: bool = False,
|
@@ -90,15 +95,22 @@ def main(
|
|
90 |
):
|
91 |
# allow set token directly
|
92 |
use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
|
93 |
-
|
94 |
-
if
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
temperature = 0.7
|
99 |
-
top_p = 1
|
100 |
-
top_k = 100
|
101 |
do_sample = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
# get defaults
|
104 |
model_lower = base_model.lower()
|
@@ -202,7 +214,7 @@ def main(
|
|
202 |
assert ex[1] in [None, ''] # should be no iinput
|
203 |
assert ex[2] in [None, ''] # should be no context
|
204 |
prompt = ex[0]
|
205 |
-
cutoff_len = 768 if
|
206 |
inputs = stokenizer(prompt, res,
|
207 |
return_tensors="pt",
|
208 |
truncation=True,
|
@@ -526,11 +538,11 @@ def go_gradio(**kwargs):
|
|
526 |
"""
|
527 |
else:
|
528 |
description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
|
529 |
-
if
|
530 |
description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
|
531 |
if kwargs['load_8bit']:
|
532 |
-
description += """<i><li> Model is loaded in 8-bit and
|
533 |
-
description += """<i><li>Model loading and unloading disabled
|
534 |
|
535 |
if kwargs['verbose']:
|
536 |
task_info_md = f"""
|
@@ -617,7 +629,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
617 |
{description}
|
618 |
{task_info_md}
|
619 |
""")
|
620 |
-
if
|
621 |
gr.HTML('''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
|
622 |
|
623 |
# go button visible if
|
@@ -685,7 +697,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
685 |
value=kwargs['stream_output'])
|
686 |
prompt_type = gr.Dropdown(prompt_types_strings,
|
687 |
value=kwargs['prompt_type'], label="Prompt Type",
|
688 |
-
visible=not
|
689 |
temperature = gr.Slider(minimum=0, maximum=3,
|
690 |
value=kwargs['temperature'],
|
691 |
label="Temperature",
|
@@ -698,12 +710,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
698 |
value=kwargs['top_k'], label="Top k",
|
699 |
info='Num. tokens to sample from'
|
700 |
)
|
701 |
-
max_beams = 8 if not
|
702 |
num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
|
703 |
value=min(max_beams, kwargs['num_beams']), label="Beams",
|
704 |
info="Number of searches for optimal overall probability. "
|
705 |
"Uses more GPU memory/compute")
|
706 |
-
max_max_new_tokens = 2048 if not
|
707 |
max_new_tokens = gr.Slider(
|
708 |
minimum=1, maximum=max_max_new_tokens, step=1,
|
709 |
value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
|
@@ -714,7 +726,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
714 |
)
|
715 |
early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
|
716 |
value=kwargs['early_stopping'])
|
717 |
-
max_max_time = 60 * 5 if not
|
718 |
max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
|
719 |
value=min(max_max_time, kwargs['max_time']), label="Max. time",
|
720 |
info="Max. time to search optimal output.")
|
@@ -724,17 +736,17 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
724 |
num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
|
725 |
value=kwargs['num_return_sequences'],
|
726 |
label="Number Returns", info="Must be <= num_beams",
|
727 |
-
visible=not
|
728 |
do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
|
729 |
value=kwargs['do_sample'])
|
730 |
if kwargs['chat']:
|
731 |
iinput = gr.Textbox(lines=4, label="Input",
|
732 |
placeholder=kwargs['placeholder_input'],
|
733 |
-
visible=not
|
734 |
# nominally empty for chat mode
|
735 |
context = gr.Textbox(lines=1, label="Context",
|
736 |
info="Ignored in chat mode.",
|
737 |
-
visible=not
|
738 |
|
739 |
with gr.TabItem("Models"):
|
740 |
with gr.Row():
|
@@ -744,8 +756,8 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
744 |
model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model", value=kwargs['base_model'])
|
745 |
lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
|
746 |
with gr.Column(scale=1):
|
747 |
-
load_msg = "Load Model/LORA" if not
|
748 |
-
else "LOAD DISABLED
|
749 |
load_model_button = gr.Button(load_msg)
|
750 |
model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
|
751 |
lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
|
@@ -811,7 +823,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
811 |
len(history[-1]) >= 2:
|
812 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
813 |
|
814 |
-
max_length_tokenize = 512 if
|
815 |
cutoff_len = max_length_tokenize*4 # restrict deberta related to max for LLM
|
816 |
|
817 |
question = history[-1][0]
|
@@ -1025,7 +1037,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
|
|
1025 |
outputs=[model_state, model_used, lora_used, prompt_type])
|
1026 |
prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
|
1027 |
chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
|
1028 |
-
if not
|
1029 |
load_model_event = load_model_button.click(**load_model_args) \
|
1030 |
.then(**prompt_update_args) \
|
1031 |
.then(**chatbot_update_args) \
|
@@ -1243,7 +1255,7 @@ def evaluate(
|
|
1243 |
# RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
|
1244 |
# RuntimeError: expected scalar type Half but found Float
|
1245 |
# with - 256
|
1246 |
-
max_length_tokenize = 768 - 256 if
|
1247 |
cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens
|
1248 |
output_smallest = 30 * 4
|
1249 |
prompt = prompt[-cutoff_len - output_smallest:]
|
|
|
27 |
human, bot, prompt_type_to_model_name, inv_prompt_type_to_model_lower
|
28 |
from stopping import CallbackToGenerator, Stream, StoppingCriteriaSub
|
29 |
|
30 |
+
is_hf = os.getenv("HUGGINGFACE_SPACES")
|
31 |
+
is_gpth2oai = os.getenv("GPT_H2O_AI")
|
32 |
+
is_public = is_hf or is_gpth2oai # multi-user case with fixed model and disclaimer
|
33 |
+
is_low_mem = is_hf # assumes run on 24GB consumer GPU
|
34 |
+
|
35 |
|
36 |
def main(
|
37 |
load_8bit: bool = False,
|
|
|
95 |
):
|
96 |
# allow set token directly
|
97 |
use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
|
98 |
+
|
99 |
+
if is_public:
|
100 |
+
temperature = 0.4
|
101 |
+
top_p = 0.85
|
102 |
+
top_k = 70
|
|
|
|
|
|
|
103 |
do_sample = True
|
104 |
+
if is_low_mem:
|
105 |
+
base_model = 'h2oai/h2ogpt-oasst1-512-12b'
|
106 |
+
load_8bit = True
|
107 |
+
else:
|
108 |
+
base_model = 'h2oai/h2ogpt-oasst1-512-20b'
|
109 |
+
if is_low_mem:
|
110 |
+
load_8bit = True
|
111 |
+
if is_hf:
|
112 |
+
# must override share if in spaces
|
113 |
+
share = False
|
114 |
|
115 |
# get defaults
|
116 |
model_lower = base_model.lower()
|
|
|
214 |
assert ex[1] in [None, ''] # should be no iinput
|
215 |
assert ex[2] in [None, ''] # should be no context
|
216 |
prompt = ex[0]
|
217 |
+
cutoff_len = 768 if is_low_mem else 2048
|
218 |
inputs = stokenizer(prompt, res,
|
219 |
return_tensors="pt",
|
220 |
truncation=True,
|
|
|
538 |
"""
|
539 |
else:
|
540 |
description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
|
541 |
+
if is_public:
|
542 |
description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
|
543 |
if kwargs['load_8bit']:
|
544 |
+
description += """<i><li> Model is loaded in 8-bit and with other limitations in order to fit on GPUs with lower amounts of VRAM, so UX can be worse than non-hosted version.</i></li>"""
|
545 |
+
description += """<i><li>Model loading and unloading disabled to avoid GPU OOM for multi-user environment.</i></li></ul></p>"""
|
546 |
|
547 |
if kwargs['verbose']:
|
548 |
task_info_md = f"""
|
|
|
629 |
{description}
|
630 |
{task_info_md}
|
631 |
""")
|
632 |
+
if is_hf:
|
633 |
gr.HTML('''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
|
634 |
|
635 |
# go button visible if
|
|
|
697 |
value=kwargs['stream_output'])
|
698 |
prompt_type = gr.Dropdown(prompt_types_strings,
|
699 |
value=kwargs['prompt_type'], label="Prompt Type",
|
700 |
+
visible=not is_public)
|
701 |
temperature = gr.Slider(minimum=0, maximum=3,
|
702 |
value=kwargs['temperature'],
|
703 |
label="Temperature",
|
|
|
710 |
value=kwargs['top_k'], label="Top k",
|
711 |
info='Num. tokens to sample from'
|
712 |
)
|
713 |
+
max_beams = 8 if not is_low_mem else 2
|
714 |
num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
|
715 |
value=min(max_beams, kwargs['num_beams']), label="Beams",
|
716 |
info="Number of searches for optimal overall probability. "
|
717 |
"Uses more GPU memory/compute")
|
718 |
+
max_max_new_tokens = 2048 if not is_low_mem else kwargs['max_new_tokens']
|
719 |
max_new_tokens = gr.Slider(
|
720 |
minimum=1, maximum=max_max_new_tokens, step=1,
|
721 |
value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
|
|
|
726 |
)
|
727 |
early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
|
728 |
value=kwargs['early_stopping'])
|
729 |
+
max_max_time = 60 * 5 if not is_low_mem else 60
|
730 |
max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
|
731 |
value=min(max_max_time, kwargs['max_time']), label="Max. time",
|
732 |
info="Max. time to search optimal output.")
|
|
|
736 |
num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
|
737 |
value=kwargs['num_return_sequences'],
|
738 |
label="Number Returns", info="Must be <= num_beams",
|
739 |
+
visible=not is_public)
|
740 |
do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
|
741 |
value=kwargs['do_sample'])
|
742 |
if kwargs['chat']:
|
743 |
iinput = gr.Textbox(lines=4, label="Input",
|
744 |
placeholder=kwargs['placeholder_input'],
|
745 |
+
visible=not is_public)
|
746 |
# nominally empty for chat mode
|
747 |
context = gr.Textbox(lines=1, label="Context",
|
748 |
info="Ignored in chat mode.",
|
749 |
+
visible=not is_public)
|
750 |
|
751 |
with gr.TabItem("Models"):
|
752 |
with gr.Row():
|
|
|
756 |
model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model", value=kwargs['base_model'])
|
757 |
lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
|
758 |
with gr.Column(scale=1):
|
759 |
+
load_msg = "Load Model/LORA" if not is_public \
|
760 |
+
else "LOAD DISABLED FOR HOSTED DEMO"
|
761 |
load_model_button = gr.Button(load_msg)
|
762 |
model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
|
763 |
lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
|
|
|
823 |
len(history[-1]) >= 2:
|
824 |
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
|
825 |
|
826 |
+
max_length_tokenize = 512 if is_low_mem else 2048
|
827 |
cutoff_len = max_length_tokenize*4 # restrict deberta related to max for LLM
|
828 |
|
829 |
question = history[-1][0]
|
|
|
1037 |
outputs=[model_state, model_used, lora_used, prompt_type])
|
1038 |
prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
|
1039 |
chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
|
1040 |
+
if not is_public:
|
1041 |
load_model_event = load_model_button.click(**load_model_args) \
|
1042 |
.then(**prompt_update_args) \
|
1043 |
.then(**chatbot_update_args) \
|
|
|
1255 |
# RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
|
1256 |
# RuntimeError: expected scalar type Half but found Float
|
1257 |
# with - 256
|
1258 |
+
max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
|
1259 |
cutoff_len = max_length_tokenize * 4 # if reaches limit, then can't generate new tokens
|
1260 |
output_smallest = 30 * 4
|
1261 |
prompt = prompt[-cutoff_len - output_smallest:]
|