reedmayhew committed
Commit f804d88 · verified · 1 Parent(s): b2f2185

Update app.py

Files changed (1): app.py (+48 -34)
app.py CHANGED
@@ -1,14 +1,12 @@
 import gradio as gr
 import os
 import spaces
-from transformers import GemmaTokenizer, AutoModelForCausalLM
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread

 # Set an environment variable
 HF_TOKEN = os.environ.get("HF_TOKEN", None)

-
 DESCRIPTION = '''
 <div>
 <h1 style="text-align: center;">DeepSeek-R1-Zero</h1>
@@ -28,7 +26,6 @@ PLACEHOLDER = """
 </div>
 """

-
 css = """
 h1 {
   text-align: center;
@@ -45,7 +42,8 @@ h1 {

 # Load the tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained("reedmayhew/DeepSeek-R1-Refined-Llama-3.1-8B-hf")
-model = AutoModelForCausalLM.from_pretrained("reedmayhew/DeepSeek-R1-Refined-Llama-3.1-8B-hf", device_map="auto") # to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("reedmayhew/DeepSeek-R1-Refined-Llama-3.1-8B-hf", device_map="auto")
+
 terminators = [
     tokenizer.eos_token_id,
     tokenizer.convert_tokens_to_ids("<|eot_id|>")
@@ -53,10 +51,10 @@ terminators = [

 @spaces.GPU(duration=30)
 def chat_llama3_8b(message: str,
-                   history: list,
-                   temperature: float,
-                   max_new_tokens: int
-                   ) -> str:
+                   history: list,
+                   temperature: float,
+                   max_new_tokens: int
+                   ) -> str:
     """
     Generate a streaming response using the llama3-8b model.
     Args:
@@ -67,24 +65,31 @@ def chat_llama3_8b(message: str,
     Returns:
         str: The generated response.
     """
+
     conversation = []
     for user, assistant in history:
-        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+        conversation.extend([
+            {"role": "user", "content": user},
+            {"role": "assistant", "content": assistant}
+        ])
+
+    # Ensure the model starts with "<think>"
     conversation.append({"role": "user", "content": message})
+    conversation.append({"role": "assistant", "content": "<think> "})  # Force <think> at start

     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)

     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

     generate_kwargs = dict(
-        input_ids= input_ids,
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
         eos_token_id=terminators,
     )
-    # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
+
     if temperature == 0:
         generate_kwargs['do_sample'] = False

@@ -92,14 +97,34 @@ def chat_llama3_8b(message: str,
     t.start()

     outputs = []
+    buffer = ""
+    think_detected = False
+    thinking_message_sent = False
+    full_response = ""  # Store the full assistant response
+
     for text in streamer:
-        outputs.append(text)
-        #print(outputs)
-        yield "".join(outputs)
-
+        buffer += text
+        full_response += text  # Store raw assistant response (includes <think>)
+
+        # Send the "thinking" message once text starts generating
+        if not thinking_message_sent:
+            thinking_message_sent = True
+            yield "DeepSeek R1 is Thinking...\n\n"
+
+        # Wait until </think> is detected before streaming output
+        if not think_detected:
+            if "</think>" in buffer:
+                think_detected = True
+                buffer = buffer.split("</think>", 1)[1]  # Remove <think> section
+        else:
+            outputs.append(text)
+            yield "".join(outputs)
+
+    # Store the full response (including <think>) in history, but only show the user the cleaned response
+    history.append((message, full_response))  # Full assistant response saved for context

 # Gradio block
-chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
+chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')

 with gr.Blocks(fill_height=True, css=css) as demo:

@@ -110,31 +135,20 @@ with gr.Blocks(fill_height=True, css=css) as demo:
         fill_height=True,
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
-            gr.Slider(minimum=0.6,
-                      maximum=0.6,
-                      step=0.1,
-                      value=0.6,
-                      label="Temperature",
-                      render=False),
-            gr.Slider(minimum=128,
-                      maximum=4096,
-                      step=64,
-                      value=1024,
-                      label="Max new tokens",
-                      render=False ),
-            ],
+            gr.Slider(minimum=0.6, maximum=0.6, step=0.1, value=0.6, label="Temperature", render=False),
+            gr.Slider(minimum=128, maximum=4096, step=64, value=1024, label="Max new tokens", render=False),
+        ],
         examples=[
             ['How to setup a human base on Mars? Give short answer.'],
             ['Explain theory of relativity to me like I’m 8 years old.'],
             ['What is 9,000 * 9,000?'],
             ['Write a pun-filled happy birthday message to my friend Alex.'],
             ['Justify why a penguin might make a good king of the jungle.']
-            ],
+        ],
         cache_examples=False,
-                     )
+    )

     gr.Markdown(LICENSE)
-
+
 if __name__ == "__main__":
-    demo.launch()
-
+    demo.launch()
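
For reference, below is a minimal, self-contained sketch of the </think> gating this commit adds to the streaming loop. It runs against a hard-coded token list instead of TextIteratorStreamer; the fake_stream() generator and its sample tokens are illustrative assumptions, not part of the commit.

# Sketch only: mirrors the commit's buffer / think_detected logic with a
# simulated token stream instead of a real model + TextIteratorStreamer.
def fake_stream():
    # Hidden reasoning first, then the visible answer (illustrative tokens).
    yield from ["<think> ", "reasoning ", "tokens ", "</think>", "Hello", ", ", "world!"]

def stream_visible(token_iter):
    buffer = ""
    outputs = []
    think_detected = False
    thinking_message_sent = False
    for text in token_iter:
        buffer += text
        # The placeholder message is sent once, as soon as generation starts.
        if not thinking_message_sent:
            thinking_message_sent = True
            yield "DeepSeek R1 is Thinking...\n\n"
        # Hold back all output until </think> closes the reasoning section.
        if not think_detected:
            if "</think>" in buffer:
                think_detected = True
                buffer = buffer.split("</think>", 1)[1]
        else:
            outputs.append(text)
            yield "".join(outputs)

for chunk in stream_visible(fake_stream()):
    print(repr(chunk))
# Expected: the "Thinking" placeholder, then 'Hello', 'Hello, ', 'Hello, world!'

As in the commit, any text arriving in the same chunk as </think> is dropped rather than streamed, and only tokens generated after the reasoning section reach the UI.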