nroggendorff committed on
Commit 018a68d · verified · 1 Parent(s): 13f35b4

Update app.py

Files changed (1)
  1. app.py +28 -11
app.py CHANGED
@@ -1,20 +1,37 @@
- import spaces
  import gradio as gr
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+ import torch
+ import spaces
  import torch
- from transformers import pipeline

  torch.set_default_device("cuda")

- pipe = pipeline("text-generation", model="cognitivecomputations/dolphin-2.9.1-mixtral-1x22b")
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+
+ model_id = "cognitivecomputations/dolphin-2.9.1-mixtral-1x22b"
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)

  @spaces.GPU(duration=120)
- def predict(message, history):
-     conv = [{"role": "system", "content": "You are Dolphin, a helpful AI assistant."}]
+ def predict(input_text, history):
+     chat = []
      for item in history:
-         conv.append({"role": "user", "content": item[0]})
-         conv.append({"role": "assistant", "content": item[1]})
-     conv.append({"role": "user", "content": message})
-     generated_text = pipe(conv, max_new_tokens=1024)[0]['generated_text'][-1]['content']
-     return generated_text
+         chat.append({"role": "user", "content": item[0]})
+         if item[1] is not None:
+             chat.append({"role": "assistant", "content": item[1]})
+     chat.append({"role": "user", "content": input_text})
+     conv = tokenizer.apply_chat_template(chat, tokenize=False)
+
+     inputs = tokenizer(conv, return_tensors="pt").to("cuda")
+     outputs = model.generate(**inputs, max_new_tokens=512)
+
+     generated_text = tokenizer.batch_decode(outputs)[0]
+     return generated_text#.split("<|assistant|>")[-1]

- gr.ChatInterface(predict).launch()
+ gr.ChatInterface(predict, theme="soft").launch()
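Note that the new predict returns the full decoded sequence, prompt included; the split on "<|assistant|>" is left commented out in the commit. For reference only (not part of this commit), a minimal sketch of one common way to decode just the generated continuation, assuming the same tokenizer, model, and conv objects defined in app.py above:

    inputs = tokenizer(conv, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=512)
    # Slice off the prompt tokens so only the newly generated reply is decoded.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True)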