gururise committed on
Commit 2812e92 · 1 Parent(s): a74cf5e
Files changed (1)
  1. app.py +99 -121
app.py CHANGED
@@ -1,7 +1,7 @@
import gradio as gr
- import threading
import codecs
from datetime import datetime
+ import gc
from transformers import BloomTokenizerFast
from petals.client import DistributedBloomForCausalLM
import torch
@@ -11,63 +11,18 @@
TORCH_DTYPE = torch.bfloat16
MODEL_NAMES = ["bigscience/bloom-petals", "bigscience/bloomz-petals"]

- models = {MODEL_NAMES[0]:None,MODEL_NAMES[1]:None}
+ models = {"model":None,"model_name":None}
output = {MODEL_NAMES[0]:"",MODEL_NAMES[1]:""}
- kill = threading.Event()

- def stop_threads():
-     global kill
-     print("Force stopping threads")
-     kill.set()

- def gen_thread(model_name, prompt, max_tokens, temperature, top_p, repetition_penalty, stop):
-     global output
-
-     if kill.is_set():
-         return
-
-     flag = False
-     token_cnt = 0
-     with models[model_name][1].inference_session(max_length=512) as sess:
-         print(f"Thread Start -> {threading.get_ident()}")
-         output[model_name] = ""
-         inputs = models[model_name][0](prompt, return_tensors="pt")["input_ids"].to(DEVICE)
-         n_input_tokens = inputs.shape[1]
-         done = False
-         while not done and not kill.is_set():
-             outputs = models[model_name][1].generate(
-                 inputs,
-                 max_new_tokens=1,
-                 do_sample=True,
-                 top_p=top_p,
-                 temperature=temperature,
-                 repetition_penalty=repetition_penalty,
-                 session=sess
-             )
-             output[model_name] += models[model_name][0].decode(outputs[0, n_input_tokens:])
-             token_cnt += 1
-             print("\n["+ str(threading.get_ident()) + "]" + output[model_name], end="", flush=True)
-
-             for stop_word in stop:
-                 stop_word = codecs.getdecoder("unicode_escape")(stop_word)[0]
-                 if stop_word != '' and stop_word in output[model_name]:
-                     print(f"\nDONE (stop) -> {threading.get_ident()}")
-                     done = True
-             if flag or (token_cnt >= max_tokens):
-                 print(f"\nDONE (max tokens) -> {threading.get_ident()}")
-                 done = True
-             inputs = None # Prefix is passed only for the 1st token of the bot's response
-             n_input_tokens = 0
-         print(f"\nThread End -> {threading.get_ident()}")
+ print (DEVICE)

def to_md(text):
    return text.replace("\n", "<br />")

- threads = list()
-
def infer(
    prompt,
-     model_idx = ["BLOOM","BLOOMZ"],
+     model_idx = 0,
    max_new_tokens=10,
    temperature=0.1,
    top_p=1.0,
@@ -76,24 +31,22 @@ def infer(
    num_completions=1,
    seed=42,
):
-     global threads
    global output
    global models

-     if len(model_idx) == 0:
-         return
-
-     kill.clear()
    print("Loading Models\n")
-     for idx in model_idx:
-         model_name = MODEL_NAMES[idx]
-         if models[model_name] == None:
-             print ("Initializing " + model_name)
-             tokenizer = BloomTokenizerFast.from_pretrained(model_name)
-             model = DistributedBloomForCausalLM.from_pretrained(model_name, torch_dtype=TORCH_DTYPE)
-             model = model.to(DEVICE)
-             models[model_name] = tokenizer, model
-         output[model_name] = ""
+     model_name = MODEL_NAMES[model_idx]
+     if (models["model_name"] == None or models["model_name"] != model_name):
+         models = {"model":None,"model_name":None}
+         gc.collect()
+         if (DEVICE == "cuda"):
+             torch.cuda.empty_cache()
+         tokenizer = BloomTokenizerFast.from_pretrained(model_name)
+         model = DistributedBloomForCausalLM.from_pretrained(model_name, torch_dtype=TORCH_DTYPE, request_timeout=300)
+         model = model.to(DEVICE)
+         models["model"] = tokenizer, model
+         models["model_name"] = model_name
+     output[model_name] = ""

    max_new_tokens = int(max_new_tokens)
    temperature = float(temperature)
@@ -115,71 +68,96 @@ def infer(

    print(f"START -> ({datetime.now()})\n")
    print(f"PROMPT ({datetime.now()}):\n-------\n{prompt}\n")
-     for idx in model_idx:
-         model_name = MODEL_NAMES[idx]
-         x = threading.Thread(target=gen_thread, args=(model_name, prompt, max_new_tokens, temperature, top_p, repetition_penalty, stop))
-         threads.append(x)
-         x.start()

-     # Join Threads
-     for model_name, thread in enumerate(threads):
-         while thread.is_alive():
-             thread.join(timeout=0.2)
-             yield output[MODEL_NAMES[0]], output[MODEL_NAMES[1]]
-
+     flag = False
+     token_cnt = 0
+     with models["model"][1].inference_session(max_length=512) as sess:
+         print(f"Encode Input Prompt")
+         output[model_name] = ""
+         inputs = models["model"][0](prompt, return_tensors="pt")["input_ids"].to(DEVICE)
+         n_input_tokens = inputs.shape[1]
+         done = False
+         print(f"Start Inference ({sess})")
+         while not done:
+             outputs = models["model"][1].generate(
+                 inputs,
+                 max_new_tokens=1,
+                 do_sample=True,
+                 top_p=top_p,
+                 temperature=temperature,
+                 repetition_penalty=repetition_penalty,
+                 session=sess
+             )
+             output[model_name] += models["model"][0].decode(outputs[0, n_input_tokens:])
+             token_cnt += 1
+             print("\n["+ str(model_name) + "]" + output[model_name], end="", flush=True)
+             yield output[model_name]
+             for stop_word in stop:
+                 stop_word = codecs.getdecoder("unicode_escape")(stop_word)[0]
+                 if stop_word != '' and stop_word in output[model_name]:
+                     print(f"\nDONE (stop)")
+                     done = True
+             if flag or (token_cnt >= max_new_tokens):
+                 print(f"\nDONE (max tokens)")
+                 done = True
+             inputs = None # Prefix is passed only for the 1st token of the bot's response
+             n_input_tokens = 0
+     print(f"\nEnd")
+     yield output[model_name]

examples = [
    [
    # Question Answering
    '''Please answer the following question:
Question: What is the capital of Germany?
-     Answer:''',["BLOOM","BLOOMZ"] , 3, 0.2, 1.0, 1.0, "\\n,</s>", ["BLOOM","BLOOMZ"]],
+     Answer:''',"BLOOMZ" , 3, 0.2, 1.0, 1.0, "\\n,</s>", ["BLOOM","BLOOMZ"]],
    [
-     # Natural Language Interface
-     '''Given a pair of sentences, choose whether the two sentences agree (entailment)/disagree (contradiction) with each other.
- Possible labels: 1. entailment 2. contradiction
- Sentence 1: The skier was on the edge of the ramp. Sentence 2: The skier was dressed in winter clothes.
- Label: entailment
- Sentence 1: The boy skated down the staircase railing. Sentence 2: The boy is a newbie skater.
- Label: contradiction
- Sentence 1: Two middle-aged people stand by a golf hole. Sentence 2: A couple riding in a golf cart.
- Label:''',["BLOOM","BLOOMZ"] , 2, 0.2, 1.0, 1.0, "\\n,</s>"]
+     # Chatbot 1
+     '''This is a conversation between Alex (an AI based on the 2020 GPT-3 language model), and Fritz (an AI based on the 2021 Jurassic-1 language model). They are exploring each other's capabilities, and trying to ask interesting, complex, and 'ungoogleable' questions of one another, to test the limits of the AI...
+ Alex: Good morning, Fritz!
+ Fritz:''',"BLOOM" , 160, 0.85, 0.9, 1.0, "\\n\\n,</s>"],
+     [
+     # Chatbot 1
+     '''This is a conversation between Alex (an AI based on the 2020 GPT-3 language model), and Fritz (an AI based on the 2021 Jurassic-1 language model). They are exploring each other's capabilities, and trying to ask interesting, complex, and 'ungoogleable' questions of one another, to test the limits of the AI...
+ Alex: Good morning, Fritz!
+ Fritz:''',"BLOOMZ" , 160, 0.85, 0.9, 1.0, "\\n\\n,</s>"],
+     [
+     # Expert Answers
+     '''Expert Questions & Helpful Answers
+ Ask Research Experts
+ Question:
+ Are humans good or bad?
+
+ Full Answer:''',"BLOOM" , 120, 0.85, 0.9, 1.0, "</s>"],
+     [
+     # G
+     '''You are the writing assistant for Stephen King. You have worked in the fiction/horror genre for 30 years. You are a Pulitzer Prize-winning author, and now you are tasked with developing a skeletal outline for his newest novel, set to be completed in the spring of 2024. Create a title and brief description for the first 5 chapters of this work.\n\nTitle:''',"BLOOM" , 120, 0.85, 0.9, 1.0, "</s>"
+     ]
]

- def clear_prompt():
-     return "","",""
-
- with gr.Blocks() as demo:
-     gr.Markdown("# <p style='text-align: center;'>BLOOM vs BLOOMZ Comparison</p>")
-     gr.Markdown("")
-     gr.Markdown("Test Inference on the [BLOOM](https://huggingface.co/bigscience/bloom) and [BLOOMZ](https://huggingface.co/bigscience/bloomz) 176 Billion Parameter models using Petals. \
-     Please consider contributing your unused GPU cycles to the [Petals Swarm](https://github.com/bigscience-workshop/petals) to speed up inference. <br />\n \
-     Due to heavy resource requirements of these large models, token generation can take upwards of 3-5 seconds per token. Try to keep Max Tokens to a minimum.")
-     gr.Markdown("")
-     gr.Markdown("Special thanks to [RFT Capital](https://www.rftcapital.com/) for supporting our experiments with compute time dontations.")
-     gr.Markdown("Type a Prompt and then click **Run** to see the output.")
-     with gr.Row():
-         with gr.Column():
-             prompt = gr.Textbox(lines=17,label="Prompt",placeholder="Enter Prompt", interactive=True)
-             with gr.Box():
-                 chk_boxes = gr.CheckboxGroup(choices=["BLOOM","BLOOMZ"],value=["BLOOM","BLOOMZ"], type="index", label="Model")
-                 #min_length = gr.Slider(minimum=0, maximum=256, value=1, label="Minimum Length") #min_length
-                 max_tokens = gr.Slider(minimum=1, maximum=256, value=15, label="Max Tokens") # max_tokens
-                 temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.2, label="Temperature") # temperature
-                 top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.9, label="Top P") # top_p
-                 rep_penalty = gr.Slider(minimum=0.9, maximum=3.0, step=0.1, value=1.0, label="Repetition Penalty") # repetition penalty
-                 stop = gr.Textbox(lines=1, value="\\n,</s>", label="Stop Token") # stop
-         with gr.Column():
-             bloom_out = gr.Textbox(lines=7, label="BLOOM OUTPUT:")
-             bloomz_out = gr.Textbox(lines=7,label="BLOOMZ OUTPUT:")
-     with gr.Row():
-         btn_clear = gr.Button("Clear", variant="secondary")
-         btn_run = gr.Button("Run", variant="primary")
-         btn_stop = gr.Button("Stop", variant="stop")
-     click_run = btn_run.click(infer, inputs=[prompt, chk_boxes, max_tokens, temperature, top_p, rep_penalty, stop], outputs=[bloom_out,bloomz_out])
-     btn_clear.click(clear_prompt, outputs=[prompt, bloom_out, bloomz_out])
-     btn_stop.click(stop_threads,cancels=click_run)
-     gr.Examples(examples, inputs=[prompt, chk_boxes, max_tokens, temperature, top_p, rep_penalty, stop])
-
- demo.queue(concurrency_count=1)
- demo.launch()
+
+
+ iface = gr.Interface(
+     fn=infer,
+     allow_flagging="never",
+     inputs=[
+         gr.Textbox(lines=20,label="Input Prompt", max_lines=10), # prompt
+         gr.Radio(["BLOOM","BLOOMZ"], value="BLOOM", type="index", label="Choose 176 billion parameter Model"),
+         gr.Slider(1, 256, value=15), # max_tokens
+         gr.Slider(0.0, 1.0, value=0.2), # temperature
+         gr.Slider(0.0, 1.0, value=0.9), # top_p
+         gr.Slider(0.9, 3.0, value=1.0), # repetition penalty
+         gr.Textbox(lines=1, value="\\n\\n,</s>") # stop
+     ],
+     outputs=gr.Textbox(lines=20, label="Generated Output:"),
+
+     examples=examples,
+     #cache_examples=True,
+     title="BLOOM vs BLOOMZ",
+     description='''<p>Compare outputs of the BLOOM and BLOOMZ 176 billion parameter models using the Petals network. <b>WARNING:</b> Initial inference may take a long time. Keep the input prompt to a minimum size to speed things up.<p>
+ <p>Please consider contributing your unused GPU cycles to the <a href='https://github.com/bigscience-workshop/petals#connect-your-gpu-and-increase-petals-capacity'>petals swarm</a> to help speed up inference. Check the <a href='http://health.petals.ml/'>Health</a> of the Petals Swarm.</p>
+ <p>Big thanks to <a href='https://www.rftcapital.com/'>RFT Capital</a> for providing initial compute resources.</p>'''
+ )
+
+ iface.queue(concurrency_count=2)
+ iface.launch()
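
Note on the new structure: the rewritten app.py drops the thread-per-model Blocks UI and instead streams tokens by turning infer() into a generator. Each pass through the loop asks the Petals inference session for one more token and yields the accumulated text, which Gradio pushes to the output textbox while queuing is enabled. Below is a minimal sketch of that streaming pattern only; the stream_tokens function, its sleep delay, and its placeholder tokens are illustrative stand-ins, not part of this commit, and in the real app the tokens come from the distributed BLOOM model.

import time
import gradio as gr

def stream_tokens(prompt, max_new_tokens=15):
    # Generator function: Gradio treats each yield as a partial result and
    # updates the output component in place.
    text = ""
    for i in range(int(max_new_tokens)):
        time.sleep(0.2)        # stand-in for one single-token generate() call on the Petals session
        text += f" token{i}"   # stand-in for decoding the newly sampled token
        yield text

demo = gr.Interface(
    fn=stream_tokens,
    inputs=[gr.Textbox(lines=4, label="Prompt"), gr.Slider(1, 64, value=15, label="Max Tokens")],
    outputs=gr.Textbox(lines=8, label="Streamed Output"),
)

if __name__ == "__main__":
    demo.queue(concurrency_count=2)  # queuing is required for generator (streaming) functions
    demo.launch()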