Spaces:

Azure99
/

Blossom-V6.1-32B-AWQ-Demo

Sleeping

App Files Files Community

Azure99 committed on Jan 29

Commit

95caa38

verified ·

1 Parent(s): 7cc222c

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -44

app.py CHANGED Viewed

@@ -1,61 +1,38 @@
 import gradio as gr
 import spaces
-from huggingface_hub import hf_hub_download
-from llama_cpp import Llama
-from transformers import AutoTokenizer
-MAX_INPUT_LIMIT = 3584
-MAX_NEW_TOKENS = 1536
-MODEL_HF = "Azure99/blossom-v5.1-34b"
-MODEL_REPO = "Azure99/blossom-v5.1-34b-gguf"
-MODEL_FILE = "model-q6_k.gguf"
-MODEL_LOCAL_DIR = "./"
-hf_hub_download(
-    repo_id=MODEL_REPO,
-    filename=MODEL_FILE,
-    local_dir=MODEL_LOCAL_DIR
-)
-llm: Llama = None
-tokenizer = AutoTokenizer.from_pretrained(MODEL_HF)
 def get_input_ids(inst, history):
-    prefix = ("A chat between a human and an artificial intelligence bot. "
-              "The bot gives helpful, detailed, and polite answers to the human's questions.")
-    patterns = []
-    for conv in history:
-        patterns.append(f'\n|Human|: {conv[0]}\n|Bot|: ')
-        patterns.append(f'{conv[1]}')
-    patterns.append(f'\n|Human|: {inst}\n|Bot|: ')
-    patterns[0] = prefix + patterns[0]
-    input_ids = []
-    for i, pattern in enumerate(patterns):
-        input_ids += tokenizer.encode(pattern, add_special_tokens=(i == 0))
-        if i % 2 == 1:
-            input_ids += [tokenizer.eos_token_id]
-    return input_ids
 @spaces.GPU
 def chat(inst, history, temperature, top_p, repetition_penalty):
-    global llm
-    if llm is None:
-        llm = Llama(model_path=MODEL_FILE, n_gpu_layers=-1, flash_attn=True, offload_kqv=True, n_ctx=4096)
     input_ids = get_input_ids(inst, history)
-    if len(input_ids) > MAX_INPUT_LIMIT:
-        yield "The input is too long, please clear the history."
-        return
-    generate_config = dict(temperature=temperature, top_p=top_p, repeat_penalty=repetition_penalty,
-                           top_k=50, stream=True, max_tokens=1024)
     outputs = ""
-    for chunk in llm(input_ids, **generate_config):
-        outputs += chunk["choices"][0]["text"]
         yield outputs
@@ -92,7 +69,7 @@ additional_inputs = [
 gr.ChatInterface(chat,
                  chatbot=gr.Chatbot(show_label=False, height=500, show_copy_button=True, render_markdown=True),
                  textbox=gr.Textbox(placeholder="", container=False, scale=7),
-                 title="Blossom 34B Demo",
                  description='Hello, I am Blossom, an open source conversational large language model.🌠'
                              '<a href="https://github.com/Azure99/BlossomLM">GitHub</a>',
                  theme="soft",

+from threading import Thread
 import gradio as gr
 import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+MAX_NEW_TOKENS = 2048
+MODEL_NAME = "Azure99/Blossom-V6-32B-AWQ"
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 def get_input_ids(inst, history):
+    conversation = []
+    for user, assistant in history:
+        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+    conversation.append({"role": "user", "content": inst})
+    return tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
 @spaces.GPU
 def chat(inst, history, temperature, top_p, repetition_penalty):
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     input_ids = get_input_ids(inst, history)
+    generation_kwargs = dict(input_ids=input_ids,
+                             streamer=streamer, do_sample=True, max_new_tokens=MAX_NEW_TOKENS,
+                             temperature=temperature, top_p=top_p, repetition_penalty=repetition_penalty)
+    Thread(target=model.generate, kwargs=generation_kwargs).start()
     outputs = ""
+    for new_text in streamer:
+        outputs += new_text
         yield outputs
 gr.ChatInterface(chat,
                  chatbot=gr.Chatbot(show_label=False, height=500, show_copy_button=True, render_markdown=True),
                  textbox=gr.Textbox(placeholder="", container=False, scale=7),
+                 title="Blossom 14B Demo",
                  description='Hello, I am Blossom, an open source conversational large language model.🌠'
                              '<a href="https://github.com/Azure99/BlossomLM">GitHub</a>',
                  theme="soft",