CaioXapelaum committed
Commit 5e23183 (1 parent: 3462a26)

Update app.py: add MiniCPM-V-2_6-Q6_K.gguf download and model option

Files changed (1)
  1. app.py +190 -182
app.py CHANGED
@@ -1,182 +1,190 @@
-import spaces
-import subprocess
-from llama_cpp import Llama
-from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
-from llama_cpp_agent.providers import LlamaCppPythonProvider
-from llama_cpp_agent.chat_history import BasicChatHistory
-from llama_cpp_agent.chat_history.messages import Roles
-import gradio as gr
-from huggingface_hub import hf_hub_download
-import os
-import cv2
-
-huggingface_token = os.environ.get('HF_TOKEN')
-
-# Download the Meta-Llama-3.1-8B-Instruct model
-hf_hub_download(
-    repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
-    filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
-    local_dir="./models",
-    token=huggingface_token
-)
-
-hf_hub_download(
-    repo_id="bartowski/Mistral-Nemo-Instruct-2407-GGUF",
-    filename="Mistral-Nemo-Instruct-2407-Q5_K_M.gguf",
-    local_dir="./models",
-    token=huggingface_token
-)
-
-hf_hub_download(
-    repo_id="bartowski/gemma-2-2b-it-GGUF",
-    filename="gemma-2-2b-it-Q6_K_L.gguf",
-    local_dir="./models",
-    token=huggingface_token
-)
-
-hf_hub_download(
-    repo_id="bartowski/openchat-3.6-8b-20240522-GGUF",
-    filename="openchat-3.6-8b-20240522-Q6_K.gguf",
-    local_dir="./models",
-    token=huggingface_token
-)
-
-hf_hub_download(
-    repo_id="bartowski/Llama-3-Groq-8B-Tool-Use-GGUF",
-    filename="Llama-3-Groq-8B-Tool-Use-Q6_K.gguf",
-    local_dir="./models",
-    token=huggingface_token
-)
-
-
-llm = None
-llm_model = None
-
-cv2.setNumThreads(1)
-
-@spaces.GPU()
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    model,
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    top_k,
-    repeat_penalty,
-):
-    chat_template = MessagesFormatterType.GEMMA_2
-
-    global llm
-    global llm_model
-
-    # Load model only if it's not already loaded or if a new model is selected
-    if llm is None or llm_model != model:
-        try:
-            llm = Llama(
-                model_path=f"models/{model}",
-                flash_attn=True,
-                n_gpu_layers=81,  # Adjust based on available GPU resources
-                n_batch=1024,
-                n_ctx=8192,
-            )
-            llm_model = model
-        except Exception as e:
-            return f"Error loading model: {str(e)}"
-
-    provider = LlamaCppPythonProvider(llm)
-
-    agent = LlamaCppAgent(
-        provider,
-        system_prompt=f"{system_message}",
-        predefined_messages_formatter_type=chat_template,
-        debug_output=True
-    )
-
-    settings = provider.get_provider_default_settings()
-    settings.temperature = temperature
-    settings.top_k = top_k
-    settings.top_p = top_p
-    settings.max_tokens = max_tokens
-    settings.repeat_penalty = repeat_penalty
-    settings.stream = True
-
-    messages = BasicChatHistory()
-
-    # Add user and assistant messages to the history
-    for msn in history:
-        user = {'role': Roles.user, 'content': msn[0]}
-        assistant = {'role': Roles.assistant, 'content': msn[1]}
-        messages.add_message(user)
-        messages.add_message(assistant)
-
-    # Stream the response
-    try:
-        stream = agent.get_chat_response(
-            message,
-            llm_sampling_settings=settings,
-            chat_history=messages,
-            returns_streaming_generator=True,
-            print_output=False
-        )
-
-        outputs = ""
-        for output in stream:
-            outputs += output
-            yield outputs
-    except Exception as e:
-        yield f"Error during response generation: {str(e)}"
-
-demo = gr.ChatInterface(
-    fn=respond,
-    additional_inputs=[
-        gr.Dropdown([
-                'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
-                'Mistral-Nemo-Instruct-2407-Q5_K_M.gguf',
-                'gemma-2-2b-it-Q6_K_L.gguf',
-                'openchat-3.6-8b-20240522-Q6_K.gguf',
-                'Llama-3-Groq-8B-Tool-Use-Q6_K.gguf'
-            ],
-            value="gemma-2-2b-it-Q6_K_L.gguf",
-            label="Model"
-        ),
-        gr.Textbox(value="You are a helpful assistant.", label="System message"),
-        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p",
-        ),
-        gr.Slider(
-            minimum=0,
-            maximum=100,
-            value=40,
-            step=1,
-            label="Top-k",
-        ),
-        gr.Slider(
-            minimum=0.0,
-            maximum=2.0,
-            value=1.1,
-            step=0.1,
-            label="Repetition penalty",
-        ),
-    ],
-    retry_btn="Retry",
-    undo_btn="Undo",
-    clear_btn="Clear",
-    submit_btn="Send",
-    title="Chat with lots of Models and LLMs using llama.cpp",
-    chatbot=gr.Chatbot(
-        scale=1,
-        likeable=False,
-        show_copy_button=True
-    )
-)
-
-if __name__ == "__main__":
-    demo.launch()
+import spaces
+import subprocess
+from llama_cpp import Llama
+from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+from llama_cpp_agent.providers import LlamaCppPythonProvider
+from llama_cpp_agent.chat_history import BasicChatHistory
+from llama_cpp_agent.chat_history.messages import Roles
+import gradio as gr
+from huggingface_hub import hf_hub_download
+import os
+import cv2
+
+huggingface_token = os.environ.get('HF_TOKEN')
+
+# Download the Meta-Llama-3.1-8B-Instruct model
+hf_hub_download(
+    repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
+    filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/Mistral-Nemo-Instruct-2407-GGUF",
+    filename="Mistral-Nemo-Instruct-2407-Q5_K_M.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/gemma-2-2b-it-GGUF",
+    filename="gemma-2-2b-it-Q6_K_L.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/openchat-3.6-8b-20240522-GGUF",
+    filename="openchat-3.6-8b-20240522-Q6_K.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/Llama-3-Groq-8B-Tool-Use-GGUF",
+    filename="Llama-3-Groq-8B-Tool-Use-Q6_K.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+hf_hub_download(
+    repo_id="bartowski/MiniCPM-V-2_6-GGUF",
+    filename="MiniCPM-V-2_6-Q6_K.gguf",
+    local_dir="./models",
+    token=huggingface_token
+)
+
+
+llm = None
+llm_model = None
+
+cv2.setNumThreads(1)
+
+@spaces.GPU()
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    model,
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+    top_k,
+    repeat_penalty,
+):
+    chat_template = MessagesFormatterType.GEMMA_2
+
+    global llm
+    global llm_model
+
+    # Load model only if it's not already loaded or if a new model is selected
+    if llm is None or llm_model != model:
+        try:
+            llm = Llama(
+                model_path=f"models/{model}",
+                flash_attn=True,
+                n_gpu_layers=81,  # Adjust based on available GPU resources
+                n_batch=1024,
+                n_ctx=8192,
+            )
+            llm_model = model
+        except Exception as e:
+            return f"Error loading model: {str(e)}"
+
+    provider = LlamaCppPythonProvider(llm)
+
+    agent = LlamaCppAgent(
+        provider,
+        system_prompt=f"{system_message}",
+        predefined_messages_formatter_type=chat_template,
+        debug_output=True
+    )
+
+    settings = provider.get_provider_default_settings()
+    settings.temperature = temperature
+    settings.top_k = top_k
+    settings.top_p = top_p
+    settings.max_tokens = max_tokens
+    settings.repeat_penalty = repeat_penalty
+    settings.stream = True
+
+    messages = BasicChatHistory()
+
+    # Add user and assistant messages to the history
+    for msn in history:
+        user = {'role': Roles.user, 'content': msn[0]}
+        assistant = {'role': Roles.assistant, 'content': msn[1]}
+        messages.add_message(user)
+        messages.add_message(assistant)
+
+    # Stream the response
+    try:
+        stream = agent.get_chat_response(
+            message,
+            llm_sampling_settings=settings,
+            chat_history=messages,
+            returns_streaming_generator=True,
+            print_output=False
+        )
+
+        outputs = ""
+        for output in stream:
+            outputs += output
+            yield outputs
+    except Exception as e:
+        yield f"Error during response generation: {str(e)}"
+
+demo = gr.ChatInterface(
+    fn=respond,
+    additional_inputs=[
+        gr.Dropdown([
+                'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
+                'Mistral-Nemo-Instruct-2407-Q5_K_M.gguf',
+                'gemma-2-2b-it-Q6_K_L.gguf',
+                'openchat-3.6-8b-20240522-Q6_K.gguf',
+                'Llama-3-Groq-8B-Tool-Use-Q6_K.gguf',
+                'MiniCPM-V-2_6-Q6_K.gguf'
+            ],
+            value="gemma-2-2b-it-Q6_K_L.gguf",
+            label="Model"
+        ),
+        gr.Textbox(value="You are a helpful assistant.", label="System message"),
+        gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p",
+        ),
+        gr.Slider(
+            minimum=0,
+            maximum=100,
+            value=40,
+            step=1,
+            label="Top-k",
+        ),
+        gr.Slider(
+            minimum=0.0,
+            maximum=2.0,
+            value=1.1,
+            step=0.1,
+            label="Repetition penalty",
+        ),
+    ],
+    retry_btn="Retry",
+    undo_btn="Undo",
+    clear_btn="Clear",
+    submit_btn="Send",
+    title="Chat with lots of Models and LLMs using llama.cpp",
+    chatbot=gr.Chatbot(
+        scale=1,
+        likeable=False,
+        show_copy_button=True
+    )
+)
+
+if __name__ == "__main__":
+    demo.launch()
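
For quick local verification of the newly added model file, the minimal llama-cpp-python sketch below (not part of the commit) loads the GGUF directly and runs one chat completion. It assumes MiniCPM-V-2_6-Q6_K.gguf has already been downloaded into ./models by the hf_hub_download calls above; the n_gpu_layers, n_batch, and n_ctx values simply mirror the ones app.py uses and may need lowering on smaller GPUs.

# Minimal sketch: load one of the downloaded GGUF files directly with llama-cpp-python
# and confirm it responds, independently of the Gradio UI.
from llama_cpp import Llama

llm = Llama(
    model_path="models/MiniCPM-V-2_6-Q6_K.gguf",  # any file from the dropdown list also works
    n_gpu_layers=81,  # mirrors app.py; reduce if the GPU runs out of memory
    n_batch=1024,
    n_ctx=8192,
)

result = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello in one sentence."},
    ],
    max_tokens=64,
    temperature=0.7,
)
print(result["choices"][0]["message"]["content"])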