MyNameIsSimon committed
Commit ae519a4 · 1 Parent(s): df728ac

attempt to use unsloth

Files changed (1)
  1. app.py +79 -36
app.py CHANGED
@@ -1,57 +1,100 @@
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+
 import gradio as gr
 from huggingface_hub import InferenceClient
+from torch.cuda import is_available
+
+from unsloth import FastLanguageModel
+from transformers import TextIteratorStreamer
+from threading import Thread
 
 """
 For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 """
 
-client = InferenceClient()
-
-
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    model,
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-        model=model,
+# client = InferenceClient()
+class MyModel:
+    def __init__(self):
+        self.client = None
+        self.current_model = ""
+        self.tokenizer = None
+
+    def respond(
+        self,
+        message,
+        history: list[tuple[str, str]],
+        model,
+        system_message,
+        max_tokens,
+        temperature,
+        min_p,
     ):
-        token = message.choices[0].delta.content
+        if model != self.current_model or self.current_model is None:
+            client, tokenizer = FastLanguageModel.from_pretrained(
+                model_name = model,
+                max_seq_length = 2048,
+                dtype = None,
+                load_in_4bit = True,
+            )
+            FastLanguageModel.for_inference(client)  # Enable native 2x faster inference
+            self.client = client
+            self.tokenizer = tokenizer
+            self.current_model = model
+
+        text_streamer = TextIteratorStreamer(self.tokenizer, skip_prompt = True)
+
+        messages = [{"role": "system", "content": system_message}]
+
+        for val in history:
+            if val[0]:
+                messages.append({"role": "user", "content": val[0]})
+            if val[1]:
+                messages.append({"role": "assistant", "content": val[1]})
+
+        messages.append({"role": "user", "content": message})
+
+        inputs = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize = True,
+            add_generation_prompt = True,  # Must add for generation
+            return_tensors = "pt",
+        ).to("cuda" if is_available() else "cpu")
+
+        generation_kwargs = dict(input_ids=inputs, streamer=text_streamer, max_new_tokens=max_tokens, use_cache=True, temperature=temperature, min_p=min_p)
+        thread = Thread(target=self.client.generate, kwargs=generation_kwargs)
+        thread.start()
+
+        response = ""
+
+        for new_text in text_streamer:
+            response += new_text
+            yield response.strip("<|eot_id|>")
+
+        # for message in client.chat_completion(
+        #     messages,
+        #     max_tokens=max_tokens,
+        #     stream=True,
+        #     temperature=temperature,
+        #     top_p=top_p,
+        #     model=model,
+        # ):
+        #     token = message.choices[0].delta.content
 
-        response += token
-        yield response
+        #     response += token
+        #     yield response
 
 
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """
+my_model = MyModel()
 model_choices = [
     "lab2-as/lora_model_gguf",
     "lab2-as/lora_model",
 ]
 demo = gr.ChatInterface(
-    respond,
+    my_model.respond,
     additional_inputs=[
         gr.Dropdown(choices=model_choices, value=model_choices[0], label="Select Model"),
         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
@@ -62,7 +105,7 @@ demo = gr.ChatInterface(
             maximum=1.0,
             value=0.95,
             step=0.05,
-            label="Top-p (nucleus sampling)",
+            label="Min-p (nucleus sampling)",
         ),
     ],
 )
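
The pattern the new respond() relies on is streaming generation: self.client.generate() runs on a background Thread and pushes decoded text into a TextIteratorStreamer, and the Gradio callback iterates that streamer, re-yielding the growing response so the chat UI updates as tokens arrive. Below is a minimal, self-contained sketch of the same pattern outside Gradio. It assumes unsloth and a CUDA GPU are available and reuses the lab2-as/lora_model checkpoint from the dropdown; the prompt, sampling values, and do_sample=True are illustrative placeholders, not the app's defaults.

from threading import Thread

from transformers import TextIteratorStreamer
from unsloth import FastLanguageModel

# Load the LoRA checkpoint in 4-bit, as respond() does the first time a model is selected.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="lab2-as/lora_model",
    max_seq_length=2048,
    dtype=None,          # let unsloth pick fp16/bf16 for the hardware
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)

messages = [
    {"role": "system", "content": "You are a friendly Chatbot."},
    {"role": "user", "content": "Hello!"},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# skip_special_tokens=True drops markers such as <|eot_id|> while decoding.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until it finishes, so it runs on a worker thread while the
# main thread consumes the streamer; iteration ends when generation completes.
thread = Thread(
    target=model.generate,
    kwargs=dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=256,
        use_cache=True,
        do_sample=True,   # needed for temperature / min_p to take effect
        temperature=0.7,
        min_p=0.05,
    ),
)
thread.start()

response = ""
for new_text in streamer:
    response += new_text          # accumulate partial text, as respond() does before yielding
    print(new_text, end="", flush=True)
thread.join()

Two details in the committed version are worth flagging: response.strip("<|eot_id|>") strips any of those individual characters from the ends of the string rather than removing the literal <|eot_id|> marker (a str.replace, or skip_special_tokens=True on the streamer as above, matches the apparent intent), and the slider now labelled "Min-p (nucleus sampling)" feeds min_p, which is a different truncation rule from top-p nucleus sampling, so the 0.95 default carried over from top_p is an unusually aggressive value for min-p.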