ruggsea committed
Commit 38403dc · 1 Parent(s): f97c568
Files changed (1)
  1. app.py +99 -160
app.py CHANGED
@@ -1,6 +1,6 @@
 import os
+from collections.abc import Iterator
 from threading import Thread
-from typing import Iterator
 
 import gradio as gr
 import spaces
@@ -12,6 +12,8 @@ DEFAULT_MAX_NEW_TOKENS = 4000
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 DESCRIPTION = """\
+# Philosophy Chat with Llama 3.1
+
 This Space showcases the Llama3.1-Instruct-SEP-Chat model from ruggsea, a fine-tuned instruction version of Meta's Llama 3.1 8B model, specifically tailored for philosophical discussions with a formal and informative tone. The model was trained using the Stanford Encyclopedia of Philosophy dataset and carefully crafted prompts.
 
 Feel free to engage in philosophical discussions and ask questions. The model supports multi-turn conversations and will maintain context.
@@ -24,7 +26,6 @@ LICENSE = """
 As a derivative work of Llama 3.1, this demo is governed by the original Meta license and acceptable use policy.
 """
 
-# Initialize model and tokenizer
 if torch.cuda.is_available():
     model_id = "ruggsea/Llama3.1-Instruct-SEP-Chat"
     model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
@@ -32,174 +33,112 @@ if torch.cuda.is_available():
     tokenizer.use_default_system_prompt = False
 
 @spaces.GPU
-def user(user_message: str, history: list, system_prompt: str) -> tuple[str, list]:
-    """Add user message to history"""
-    if history is None:
-        history = []
-    history.append({"role": "user", "content": user_message.strip()})
-    return "", history
-
-@spaces.GPU
-def bot(
-    history: list,
-    system_prompt: str,
-    max_new_tokens: int = 1024,
+def generate(
+    message: str,
+    chat_history: list[dict],
+    system_prompt: str = "",
+    max_new_tokens: int = 4000,
     temperature: float = 0.7,
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.1,
-) -> Iterator[list]:
-    """Generate bot response"""
+) -> Iterator[str]:
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
-
-    for message in history:
-        conversation.append(message)
-
-    try:
-        input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
-        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-        input_ids = input_ids.to(model.device)
-
-        streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-        generate_kwargs = dict(
-            input_ids=input_ids,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            top_p=top_p,
-            top_k=top_k,
-            temperature=temperature,
-            num_beams=1,
-            repetition_penalty=repetition_penalty,
-        )
-
-        t = Thread(target=model.generate, kwargs=generate_kwargs)
-        t.start()
-
-        history.append({"role": "assistant", "content": ""})
-        for text in streamer:
-            history[-1]["content"] += text
-            yield history
-
-    except Exception as e:
-        gr.Warning(f"Error during generation: {str(e)}")
-        history.append({"role": "assistant", "content": "I apologize, but I encountered an error. Please try again."})
-        yield history
-
-def create_demo() -> gr.Blocks:
-    with gr.Blocks(css="style.css") as demo:
-        gr.Markdown("# Philosophy Chat with Llama 3.1")
-        gr.Markdown(DESCRIPTION)
-        gr.DuplicateButton(
-            value="Duplicate Space for private use",
-            elem_id="duplicate-button"
-        )
-
-        gr.Markdown("<br>")
-
-        chatbot = gr.Chatbot(
-            show_label=False,
-            avatar_images=(None, None),
-            bubble_full_width=False,
-        )
-
-        with gr.Row():
-            msg = gr.Textbox(
-                scale=4,
-                show_label=False,
-                placeholder="Enter text and press enter",
-                container=False,
-            )
-            submit = gr.Button("Submit", scale=1, variant="primary")
-
-        system_prompt = gr.Textbox(
+    conversation += chat_history
+    conversation.append({"role": "user", "content": message})
+
+    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)
+
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        input_ids=input_ids,
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Textbox(
             label="System prompt",
             lines=6,
             value="You are a knowledgeable philosophy professor using the Stanford Encyclopedia of Philosophy as your knowledge base. Provide clear, accurate responses using markdown formatting. Focus on philosophical concepts and maintain academic rigor while being accessible. Always cite relevant philosophers and concepts."
-        )
-
-        with gr.Accordion("Generation Parameters", open=False):
-            max_new_tokens = gr.Slider(
-                label="Max new tokens",
-                minimum=1,
-                maximum=MAX_MAX_NEW_TOKENS,
-                step=1,
-                value=DEFAULT_MAX_NEW_TOKENS,
-            )
-            temperature = gr.Slider(
-                label="Temperature",
-                minimum=0.1,
-                maximum=4.0,
-                step=0.1,
-                value=0.7,
-            )
-            top_p = gr.Slider(
-                label="Top-p (nucleus sampling)",
-                minimum=0.05,
-                maximum=1.0,
-                step=0.05,
-                value=0.9,
-            )
-            top_k = gr.Slider(
-                label="Top-k",
-                minimum=1,
-                maximum=1000,
-                step=1,
-                value=50,
-            )
-            repetition_penalty = gr.Slider(
-                label="Repetition penalty",
-                minimum=1.0,
-                maximum=2.0,
-                step=0.05,
-                value=1.1,
-            )
-
-        gr.Examples(
-            examples=[
-                "What is the trolley problem and what are its main ethical implications?",
-                "Can you explain Plato's Theory of Forms?",
-                "What is the difference between analytic and continental philosophy?",
-                "How does Kant's Categorical Imperative work?",
-                "What is the problem of consciousness in philosophy of mind?",
-            ],
-            inputs=msg,
-            fn=None,
-            outputs=None,
-            cache_examples=False,
-        )
-
-        # Chain the user and bot responses
-        msg.submit(
-            user,
-            [msg, chatbot, system_prompt],
-            [msg, chatbot],
-            queue=False
-        ).then(
-            bot,
-            [chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-            chatbot
-        )
-
-        submit.click(
-            user,
-            [msg, chatbot, system_prompt],
-            [msg, chatbot],
-            queue=False
-        ).then(
-            bot,
-            [chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-            chatbot
-        )
-
-        gr.Markdown(LICENSE)
-
-        return demo
+        ),
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.7,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.1,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["What is the trolley problem and what are its main ethical implications?"],
+        ["Can you explain Plato's Theory of Forms?"],
+        ["What is the difference between analytic and continental philosophy?"],
+        ["How does Kant's Categorical Imperative work?"],
+        ["What is the problem of consciousness in philosophy of mind?"],
+    ],
+    cache_examples=False,
+    type="messages",
+)
+
+with gr.Blocks(css="style.css", fill_height=True) as demo:
+    gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(
+        value="Duplicate Space for private use",
+        elem_id="duplicate-button"
+    )
+    chat_interface.render()
+    gr.Markdown(LICENSE)
 
 if __name__ == "__main__":
-    demo = create_demo()
     demo.queue(max_size=20).launch()
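
For context on the contract the new version relies on: gr.ChatInterface calls its fn with the latest user message, the running history, and each additional_inputs value as extra positional arguments, and a generator fn that yields progressively longer strings streams the reply into the chat UI. Below is a minimal, self-contained sketch of that contract; the ChatInterface wiring mirrors the commit, while the echo generator is a hypothetical stand-in for the real model call.

import time
from collections.abc import Iterator

import gradio as gr

def generate(message: str, chat_history: list[dict], system_prompt: str = "") -> Iterator[str]:
    # gr.ChatInterface passes (message, history, *additional_inputs).
    # With type="messages", chat_history is a list of {"role": ..., "content": ...} dicts.
    reply = f"[{system_prompt or 'no system prompt'}] You said: {message}"  # stand-in for the model
    partial = ""
    for token in reply.split():
        partial += token + " "
        time.sleep(0.05)  # simulate per-token latency
        yield partial  # each yield replaces the displayed assistant message

demo = gr.ChatInterface(
    fn=generate,
    additional_inputs=[gr.Textbox(label="System prompt")],
    type="messages",
)

if __name__ == "__main__":
    demo.launch()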