Fixing the chat history
app.py
CHANGED
@@ -24,9 +24,6 @@ LICENSE = """
 As a derivative work of Llama 3.1, this demo is governed by the original Meta license and acceptable use policy.
 """

-if not torch.cuda.is_available():
-    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-
 # Initialize model and tokenizer
 if torch.cuda.is_available():
     model_id = "ruggsea/Llama3.1-Instruct-SEP-Chat"
@@ -35,30 +32,30 @@ if torch.cuda.is_available():
     tokenizer.use_default_system_prompt = False

 @spaces.GPU
-def generate(
-    message: str,
-    chat_history: list[tuple[str, str]],
+def user(user_message: str, history: list, system_prompt: str) -> tuple[str, list]:
+    """Add user message to history"""
+    if history is None:
+        history = []
+    history.append({"role": "user", "content": user_message.strip()})
+    return "", history
+
+@spaces.GPU
+def bot(
+    history: list,
     system_prompt: str,
     max_new_tokens: int = 1024,
     temperature: float = 0.7,
     top_p: float = 0.9,
     top_k: int = 50,
     repetition_penalty: float = 1.1,
-) -> Iterator[list[tuple[str, str]]]:
-
-    chat_history = []
-
+) -> Iterator[list]:
+    """Generate bot response"""
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})

-    for user, assistant in chat_history:
-        conversation.extend([
-            {"role": "user", "content": str(user).strip()},
-            {"role": "assistant", "content": str(assistant).strip()}
-        ])
-
-    conversation.append({"role": "user", "content": str(message).strip()})
+    for message in history:
+        conversation.append(message)

     try:
         input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
@@ -83,17 +80,15 @@ def generate(
         t = Thread(target=model.generate, kwargs=generate_kwargs)
         t.start()

-        partial_output = ""
+        history.append({"role": "assistant", "content": ""})
         for text in streamer:
-            partial_output += text
-
-            chat_history = chat_history + [(message, partial_output)]
-            yield chat_history
+            history[-1]["content"] += text
+            yield history

     except Exception as e:
         gr.Warning(f"Error during generation: {str(e)}")
-
-        yield chat_history
+        history.append({"role": "assistant", "content": "I apologize, but I encountered an error. Please try again."})
+        yield history

 def create_demo() -> gr.Blocks:
     with gr.Blocks(css="style.css") as demo:
@@ -109,6 +104,7 @@ def create_demo() -> gr.Blocks:
         chatbot = gr.Chatbot(
             show_label=False,
             avatar_images=(None, None),
+            bubble_full_width=False,
         )

         with gr.Row():
@@ -119,7 +115,7 @@ def create_demo() -> gr.Blocks:
                 container=False,
             )
             submit = gr.Button("Submit", scale=1, variant="primary")
-
+
         system_prompt = gr.Textbox(
             label="System prompt",
             lines=6,
@@ -177,15 +173,27 @@ def create_demo() -> gr.Blocks:
                 cache_examples=False,
             )

+        # Chain the user and bot responses
         msg.submit(
-            generate,
-            [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-            [chatbot],
+            user,
+            [msg, chatbot, system_prompt],
+            [msg, chatbot],
+            queue=False
+        ).then(
+            bot,
+            [chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+            chatbot
         )
+
         submit.click(
-            generate,
-            [msg, chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-            [chatbot],
+            user,
+            [msg, chatbot, system_prompt],
+            [msg, chatbot],
+            queue=False
+        ).then(
+            bot,
+            [chatbot, system_prompt, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+            chatbot
         )

         gr.Markdown(LICENSE)
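
For reference, here is a minimal, self-contained sketch of the chained user/bot pattern this commit switches to, using Gradio's messages-format chat history. The echo reply, the type="messages" argument, and the component names are illustrative assumptions rather than the Space's actual code.

import time

import gradio as gr


def user(user_message, history):
    # Append the user turn and clear the textbox.
    history = history or []
    history.append({"role": "user", "content": user_message.strip()})
    return "", history


def bot(history):
    # Stream an assistant turn by growing the last message in place.
    reply = "Echo: " + history[-1]["content"]
    history.append({"role": "assistant", "content": ""})
    for ch in reply:
        history[-1]["content"] += ch
        time.sleep(0.02)
        yield history


with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages", show_label=False)
    msg = gr.Textbox(placeholder="Enter text and press enter", container=False)
    # First add the user turn (fast, unqueued), then stream the reply.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )

if __name__ == "__main__":
    demo.launch()

Splitting each turn into a quick, unqueued user step and a streaming bot step is what fixes the history: the textbox clears immediately, and bot() receives the accumulated conversation instead of resetting it to an empty list the way the old generate() did.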
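The generation side of bot() relies on the usual background-thread streaming pattern from transformers, visible in the unchanged context above. A generic sketch follows; the sampling values mirror the handler's defaults, while do_sample=True, the dtype, and the device placement are assumptions not shown in this diff.

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "ruggsea/Llama3.1-Instruct-SEP-Chat"  # as loaded in app.py
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"  # loading details assumed
)


def stream_reply(conversation: list[dict]):
    # Build prompt ids from the chat template, as the handler does.
    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.1,
    )
    # generate() runs in a thread so the loop below can yield partial text.
    Thread(target=model.generate, kwargs=generate_kwargs).start()
    partial = ""
    for text in streamer:
        partial += text
        yield partial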