gururise committed
Commit 26fd787 · Parent: 53c517b

update gradio interface for iterative outputs

Files changed (1)
  1. app.py (+99, -85)
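
The "iterative outputs" in this commit come from Gradio's support for generator event handlers: `infer` is rewritten as a generator that, while the worker threads run, repeatedly yields the partial BLOOM and BLOOMZ strings, so the two output textboxes update as tokens arrive instead of only once at the end. A minimal sketch of that pattern, separate from the app.py diff below (the `count_up` function, its labels, and the sleep are illustrative only, not part of the commit):

# Illustrative sketch only: a Gradio event handler written as a generator
# streams partial results to its output component. Generator handlers
# require the queue to be enabled.
import time
import gradio as gr

def count_up(n):
    text = ""
    for i in range(int(n)):
        text += f"{i} "
        time.sleep(0.5)   # stand-in for generating one token
        yield text        # each yield refreshes the output textbox in place

with gr.Blocks() as demo:
    steps = gr.Number(value=5, label="Steps")
    out = gr.Textbox(label="Output")
    gr.Button("Run").click(count_up, inputs=steps, outputs=out)

demo.queue()
demo.launch()

In app.py, `infer` does the same thing with two outputs: it polls the worker threads with `thread.join(timeout=0.2)` and yields the current `output` strings for both models on each pass.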
app.py CHANGED
@@ -1,46 +1,73 @@
 import gradio as gr
 import threading
 import codecs
-#from ast import literal_eval
 from datetime import datetime
-
-import os
-#os.environ['TRANSFORMERS_CACHE'] = '/data/.modelcache/huggingface/hub/'
-#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:516"
-
 from transformers import BloomTokenizerFast
 from petals.client import DistributedBloomForCausalLM
 import torch
-import gc
+import time
 
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 TORCH_DTYPE = torch.bfloat16
 MODEL_NAMES = ["bigscience/bloom-petals", "bigscience/bloomz-petals"]
 
-models = {}
-output = {}
+models = {MODEL_NAMES[0]: None, MODEL_NAMES[1]: None}
+output = {MODEL_NAMES[0]: "", MODEL_NAMES[1]: ""}
+kill = threading.Event()
 
+def stop_threads():
+    global kill
+    print("Force stopping threads")
+    kill.set()
 
-def gen_thread(model_name, inputs, max_new_tokens, min_length, temperature, top_p, repetition_penalty):
+def gen_thread(model_name, prompt, max_tokens, temperature, top_p, repetition_penalty, stop):
     global output
-    n_input_tokens = inputs.shape[1]
-    outputs = models[model_name][1].generate(inputs,
-        max_new_tokens=max_new_tokens,
-        min_length=min_length,
-        do_sample=True,
-        temperature=temperature,
-        top_p=top_p,
-        repetition_penalty=repetition_penalty
-        )
-    output[model_name] = models[model_name][0].decode(outputs[0, n_input_tokens:])
+
+    if kill.is_set():
+        return
+
+    flag = False
+    token_cnt = 0
+    with models[model_name][1].inference_session(max_length=512) as sess:
+        print(f"Thread Start -> {threading.get_ident()}")
+        output[model_name] = ""
+        inputs = models[model_name][0](prompt, return_tensors="pt")["input_ids"].to(DEVICE)
+        n_input_tokens = inputs.shape[1]
+        done = False
+        while not done and not kill.is_set():
+            outputs = models[model_name][1].generate(
+                inputs,
+                max_new_tokens=1,
+                do_sample=True,
+                top_p=top_p,
+                temperature=temperature,
+                repetition_penalty=repetition_penalty,
+                session=sess
+            )
+            output[model_name] += models[model_name][0].decode(outputs[0, n_input_tokens:])
+            token_cnt += 1
+            print("\n[" + str(threading.get_ident()) + "]" + output[model_name], end="", flush=True)
+
+            for stop_word in stop:
+                stop_word = codecs.getdecoder("unicode_escape")(stop_word)[0]
+                if stop_word != '' and stop_word in output[model_name]:
+                    print(f"\nDONE (stop) -> {threading.get_ident()}")
+                    done = True
+            if flag or (token_cnt >= max_tokens):
+                print(f"\nDONE (max tokens) -> {threading.get_ident()}")
+                done = True
+            inputs = None  # Prefix is passed only for the 1st token of the bot's response
+            n_input_tokens = 0
+        print(f"\nThread End -> {threading.get_ident()}")
 
 def to_md(text):
-    # return text.replace("\n", "<br />")
     return text.replace("\n", "<br />")
 
+threads = list()
+
 def infer(
     prompt,
-    min_length=2,
+    model_idx=["BLOOM", "BLOOMZ"],
    max_new_tokens=10,
    temperature=0.1,
    top_p=1.0,
@@ -49,27 +76,32 @@ def infer(
     num_completions=1,
     seed=42,
 ):
+    global threads
+    global output
+    global models
 
-    #gc.collect()
-    #torch.cuda.empty_cache()
+    if len(model_idx) == 0:
+        return
 
-    if not models:
-        for model_name in MODEL_NAMES:
+    kill.clear()
+    print("Loading Models\n")
+    for idx in model_idx:
+        model_name = MODEL_NAMES[idx]
+        if models[model_name] == None:
             tokenizer = BloomTokenizerFast.from_pretrained(model_name)
             model = DistributedBloomForCausalLM.from_pretrained(model_name, torch_dtype=TORCH_DTYPE)
             model = model.to(DEVICE)
             models[model_name] = tokenizer, model
+            output[model_name] = ""
 
     max_new_tokens = int(max_new_tokens)
-    num_completions = int(num_completions)
     temperature = float(temperature)
     top_p = float(top_p)
-    stop = stop.split(";")
+    stop = [x.strip(' ') for x in stop.split(',')]
     repetition_penalty = float(repetition_penalty)
     seed = seed
 
     assert 1 <= max_new_tokens <= 384
-    assert 0 <= min_length <= max_new_tokens
     assert 1 <= num_completions <= 5
     assert 0.0 <= temperature <= 1.0
     assert 0.0 <= top_p <= 1.0
@@ -80,45 +112,19 @@ def infer(
     if prompt == "":
         prompt = " "
 
-    threads = list()
     print(f"START -> ({datetime.now()})\n")
     print(f"PROMPT ({datetime.now()}):\n-------\n{prompt}\n")
-    for model_name in MODEL_NAMES:
-        inputs = models[model_name][0](prompt, return_tensors="pt")["input_ids"].to(DEVICE)
-        x = threading.Thread(target=gen_thread, args=(model_name, inputs, max_new_tokens, min_length, temperature, top_p, repetition_penalty))
+    for idx in model_idx:
+        model_name = MODEL_NAMES[idx]
+        x = threading.Thread(target=gen_thread, args=(model_name, prompt, max_new_tokens, temperature, top_p, repetition_penalty, stop))
         threads.append(x)
         x.start()
-        #n_input_tokens = inputs.shape[1]
-        # outputs = models[model_name][1].generate(inputs,
-        #     max_new_tokens=max_new_tokens,
-        #     min_length=min_length,
-        #     do_sample=True,
-        #     temperature=temperature,
-        #     top_p=top_p,
-        #     repetition_penalty=repetition_penalty
-        #     )
-        #output[model_name] = models[model_name][0].decode(outputs[0, n_input_tokens:])
-
-        #output[model_name] = outputs[len(prompt):]
 
     # Join Threads
     for model_name, thread in enumerate(threads):
-        print(f"waiting on: {model_name}\n")
-        thread.join()
-        print(f"{model_name} thread done\n")
-
-
-    for model_name in MODEL_NAMES:
-        stop = codecs.getdecoder("unicode_escape")(stop[0])[0]
-        stop = [x.strip(' ') for x in stop.split(',')]
-        for stop_word in stop:
-            if stop_word != '' and stop_word in output[model_name]:
-                output[model_name] = output[model_name][:output[model_name].find(stop_word)]
-
-        print(f"--- START: {model_name} --- \n{output[model_name]}\n--- END {model_name} ---\n\n")
-
-    print(f"DONE -> ({datetime.now()})\n")
-    return output[MODEL_NAMES[0]], output[MODEL_NAMES[1]]
+        while thread.is_alive():
+            thread.join(timeout=0.2)
+            yield output[MODEL_NAMES[0]], output[MODEL_NAMES[1]]
 
 
 examples = [
@@ -126,7 +132,7 @@ examples = [
     # Question Answering
     '''Please answer the following question:
 Question: What is the capital of Germany?
-Answer:''', 1, 3, 0.2, 1.0, 1.0, "\\n,</s>"],
+Answer:''', ["BLOOM","BLOOMZ"], 3, 0.2, 1.0, 1.0, "\\n,</s>", ["BLOOM","BLOOMZ"]],
     [
     # Natural Language Interface
     '''Given a pair of sentences, choose whether the two sentences agree (entailment)/disagree (contradiction) with each other.
@@ -136,28 +142,36 @@ Label: entailment
 Sentence 1: The boy skated down the staircase railing. Sentence 2: The boy is a newbie skater.
 Label: contradiction
 Sentence 1: Two middle-aged people stand by a golf hole. Sentence 2: A couple riding in a golf cart.
-Label:''', 1, 2, 0.2, 1.0, 1.0, "\\n,</s>"]
+Label:''', ["BLOOM","BLOOMZ"], 2, 0.2, 1.0, 1.0, "\\n,</s>"]
 ]
 
-
-
-iface = gr.Interface(
-    fn=infer,
-    allow_flagging="never",
-    inputs=[
-        gr.Textbox(lines=20), # prompt
-        gr.Slider(0, 256, value=1), #min_length
-        gr.Slider(1, 384, value=20), # max_tokens
-        gr.Slider(0.0, 1.0, value=0.2), # temperature
-        gr.Slider(0.0, 1.0, value=0.9), # top_p
-        gr.Slider(0.9, 3.0, value=1.0), # repetition penalty
-        gr.Textbox(lines=1, value="\\n,</s>") # stop
-    ],
-    outputs=[gr.Textbox(lines=7, label="BLOOM OUTPUT:"), gr.Textbox(lines=7, label="BLOOMZ OUTPUT:")],
-
-    examples=examples,
-    cache_examples=True,
-    title="BLOOM vs BLOOMZ",
-    description='''<p>Compare outputs of the BLOOM and BLOOMZ 176 billion parameter models using the [Petals](https://petals.ml/) network. WARNING: Inference may take a long time. Keep the max_tokens low to speed things up.<p>
-<p>Please consider [joining](https://github.com/bigscience-workshop/petals) the Petals network to help speed up inference.</p><p>Big thanks to [RFTCapital](https://www.rftcapital.com) for providing initial compute resources.</p>'''
-).launch()
+def clear_prompt():
+    return "", "", ""
+
+with gr.Blocks() as demo:
+    gr.Markdown("Start typing below and then click **Run** to see the output.")
+    with gr.Row():
+        with gr.Column():
+            prompt = gr.Textbox(lines=17, label="Prompt", placeholder="Enter Prompt", interactive=True)
+            with gr.Box():
+                chk_boxes = gr.CheckboxGroup(choices=["BLOOM","BLOOMZ"], value=["BLOOM","BLOOMZ"], type="index", label="Model")
+                #min_length = gr.Slider(minimum=0, maximum=256, value=1, label="Minimum Length") #min_length
+                max_tokens = gr.Slider(minimum=1, maximum=256, value=15, label="Max Tokens") # max_tokens
+                temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.2, label="Temperature") # temperature
+                top_p = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.9, label="Top P") # top_p
+                rep_penalty = gr.Slider(minimum=0.9, maximum=3.0, step=0.1, value=1.0, label="Repetition Penalty") # repetition penalty
+                stop = gr.Textbox(lines=1, value="\\n,</s>", label="Stop Token") # stop
+        with gr.Column():
+            bloom_out = gr.Textbox(lines=7, label="BLOOM OUTPUT:")
+            bloomz_out = gr.Textbox(lines=7, label="BLOOMZ OUTPUT:")
+            with gr.Row():
+                btn_clear = gr.Button("Clear", variant="secondary")
+                btn_run = gr.Button("Run", variant="primary")
+                btn_stop = gr.Button("Stop", variant="stop")
+    click_run = btn_run.click(infer, inputs=[prompt, chk_boxes, max_tokens, temperature, top_p, rep_penalty, stop], outputs=[bloom_out, bloomz_out])
+    btn_clear.click(clear_prompt, outputs=[prompt, bloom_out, bloomz_out])
+    btn_stop.click(stop_threads, cancels=click_run)
+    gr.Examples(examples, inputs=[prompt, chk_boxes, max_tokens, temperature, top_p, rep_penalty, stop])
+
+demo.queue(concurrency_count=3)
+demo.launch()
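
For reference, the Stop button combines two mechanisms: `cancels=click_run` asks Gradio to cancel the queued `infer` generator, while `stop_threads()` sets the shared `threading.Event` that each `gen_thread` checks between tokens so the worker can exit cleanly. A stripped-down sketch of that cooperative-stop pattern, independent of Gradio and Petals (the `worker` function and its timings are illustrative only):

# Illustrative sketch only: cooperative cancellation with threading.Event.
# The worker polls the event between steps, mirroring gen_thread's
# "while not done and not kill.is_set()" loop.
import threading
import time

kill = threading.Event()

def worker(steps=100):
    for i in range(steps):
        if kill.is_set():      # checked once per step, like once per token
            print(f"worker stopped at step {i}")
            return
        time.sleep(0.1)        # stand-in for generating one token
    print("worker finished all steps")

t = threading.Thread(target=worker)
t.start()
time.sleep(0.35)
kill.set()                     # what stop_threads() does on the Stop click
t.join()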