Azure99 committed on
Commit
95caa38
·
verified ·
1 Parent(s): 7cc222c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -44
app.py CHANGED
@@ -1,61 +1,38 @@
 
 
1
  import gradio as gr
2
  import spaces
3
- from huggingface_hub import hf_hub_download
4
- from llama_cpp import Llama
5
- from transformers import AutoTokenizer
6
-
7
- MAX_INPUT_LIMIT = 3584
8
- MAX_NEW_TOKENS = 1536
9
- MODEL_HF = "Azure99/blossom-v5.1-34b"
10
- MODEL_REPO = "Azure99/blossom-v5.1-34b-gguf"
11
- MODEL_FILE = "model-q6_k.gguf"
12
- MODEL_LOCAL_DIR = "./"
13
 
14
- hf_hub_download(
15
- repo_id=MODEL_REPO,
16
- filename=MODEL_FILE,
17
- local_dir=MODEL_LOCAL_DIR
18
- )
19
 
20
- llm: Llama = None
21
- tokenizer = AutoTokenizer.from_pretrained(MODEL_HF)
22
 
23
 
24
  def get_input_ids(inst, history):
25
- prefix = ("A chat between a human and an artificial intelligence bot. "
26
- "The bot gives helpful, detailed, and polite answers to the human's questions.")
27
- patterns = []
28
- for conv in history:
29
- patterns.append(f'\n|Human|: {conv[0]}\n|Bot|: ')
30
- patterns.append(f'{conv[1]}')
31
- patterns.append(f'\n|Human|: {inst}\n|Bot|: ')
32
- patterns[0] = prefix + patterns[0]
33
-
34
- input_ids = []
35
- for i, pattern in enumerate(patterns):
36
- input_ids += tokenizer.encode(pattern, add_special_tokens=(i == 0))
37
- if i % 2 == 1:
38
- input_ids += [tokenizer.eos_token_id]
39
- return input_ids
40
 
41
 
42
  @spaces.GPU
43
  def chat(inst, history, temperature, top_p, repetition_penalty):
44
- global llm
45
- if llm is None:
46
- llm = Llama(model_path=MODEL_FILE, n_gpu_layers=-1, flash_attn=True, offload_kqv=True, n_ctx=4096)
47
-
48
  input_ids = get_input_ids(inst, history)
49
- if len(input_ids) > MAX_INPUT_LIMIT:
50
- yield "The input is too long, please clear the history."
51
- return
52
 
53
- generate_config = dict(temperature=temperature, top_p=top_p, repeat_penalty=repetition_penalty,
54
- top_k=50, stream=True, max_tokens=1024)
55
 
56
  outputs = ""
57
- for chunk in llm(input_ids, **generate_config):
58
- outputs += chunk["choices"][0]["text"]
59
  yield outputs
60
 
61
 
@@ -92,7 +69,7 @@ additional_inputs = [
92
  gr.ChatInterface(chat,
93
  chatbot=gr.Chatbot(show_label=False, height=500, show_copy_button=True, render_markdown=True),
94
  textbox=gr.Textbox(placeholder="", container=False, scale=7),
95
- title="Blossom 34B Demo",
96
  description='Hello, I am Blossom, an open source conversational large language model.🌠'
97
  '<a href="https://github.com/Azure99/BlossomLM">GitHub</a>',
98
  theme="soft",
 
1
+ from threading import Thread
2
+
3
  import gradio as gr
4
  import spaces
5
+ import torch
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 
 
 
 
 
 
 
7
 
8
+ MAX_NEW_TOKENS = 2048
9
+ MODEL_NAME = "Azure99/Blossom-V6-32B-AWQ"
 
 
 
10
 
11
+ model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map="auto")
12
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
13
 
14
 
def get_input_ids(inst, history):
    """Build the model input for the next assistant reply.

    Args:
        inst: The new user message to answer.
        history: Earlier turns as ``(user, assistant)`` pairs, oldest first.

    Returns:
        The result of the tokenizer's chat template applied to the whole
        conversation with ``return_tensors="pt"``, moved to ``model.device``.
    """
    conversation = []
    for user, assistant in history:
        conversation.append({"role": "user", "content": user})
        conversation.append({"role": "assistant", "content": assistant})
    conversation.append({"role": "user", "content": inst})

    # add_generation_prompt=True appends the template's assistant-turn header.
    # Without it the prompt stops right after the user message, so the model
    # is not cued to reply as the assistant and may continue the user turn or
    # emit the role header itself.
    return tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
 
 
 
 
 
 
 
 
 
 
21
 
22
 
@spaces.GPU
def chat(inst, history, temperature, top_p, repetition_penalty):
    """Stream the model's reply to ``inst`` given the chat ``history``.

    Generation runs on a background thread and pushes decoded text into a
    ``TextIteratorStreamer``; this generator consumes the streamer and yields
    the accumulated reply after every new piece of text.

    Args:
        inst: The new user message.
        history: Earlier turns as ``(user, assistant)`` pairs.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.

    Yields:
        The reply text generated so far (cumulative, not the delta).

    Raises:
        Exception: Whatever ``model.generate`` raised on the worker thread,
            re-raised here once the stream has been drained.
    """
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    input_ids = get_input_ids(inst, history)
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        do_sample=True,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    failures = []

    def _run_generation():
        # If generate() raises, the streamer never receives its end-of-stream
        # signal and the consumer loop below would block forever. Record the
        # error and close the stream so the caller can surface it.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            failures.append(exc)
            streamer.end()

    Thread(target=_run_generation).start()

    outputs = ""
    for new_text in streamer:
        outputs += new_text
        yield outputs

    if failures:
        raise failures[0]
37
 
38
 
 
69
  gr.ChatInterface(chat,
70
  chatbot=gr.Chatbot(show_label=False, height=500, show_copy_button=True, render_markdown=True),
71
  textbox=gr.Textbox(placeholder="", container=False, scale=7),
72
+ title="Blossom 14B Demo",
73
  description='Hello, I am Blossom, an open source conversational large language model.🌠'
74
  '<a href="https://github.com/Azure99/BlossomLM">GitHub</a>',
75
  theme="soft",