Spaces:
Runtime error
+2 405B instances ez clap + stochastic gradio spergasm prevention system
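Roughly, the commit replaces the requests/threading stop machinery (a ThreadLocalStorage tracking active requests behind a lock) with aiohttp/asyncio streaming and a single module-level should_stop flag, and raises Gradio's concurrency settings (ThreadPoolExecutor(max_workers=10), concurrency_limit=10, max_threads=40), presumably to go with the two extra 405B instances named in the title. Below is a minimal, self-contained sketch of that cooperative-stop pattern; fake_stream and main are stand-ins invented for illustration, not code from the Space.

import asyncio

should_stop = False  # module-level flag, shared by every running generation

async def fake_stream():
    # stands in for the aiohttp response stream that app.py reads chunk by chunk
    for token in ["Hello", " ", "world", "!"]:
        await asyncio.sleep(0.05)
        yield token

async def predict():
    # async generator yielding the partial message, polling the flag between chunks
    global should_stop
    partial = ""
    async for token in fake_stream():
        if should_stop:  # cooperative cancellation: checked once per chunk
            break
        partial += token
        yield partial

def stop_generation():
    # what the Stop button handler does: flip the shared flag
    global should_stop
    should_stop = True

async def main():
    global should_stop
    should_stop = False
    # simulate a user pressing Stop shortly after generation starts
    asyncio.get_running_loop().call_later(0.12, stop_generation)
    async for partial in predict():
        print(partial)

asyncio.run(main())

In app.py the Stop button additionally passes cancels=[submit_event, regenerate_event], so Gradio drops the queued events on top of setting the flag. Note that the flag is one global shared across sessions, so one visitor's Stop can halt another visitor's in-flight stream.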
app.py CHANGED
@@ -1,12 +1,11 @@
 import gradio as gr
-import requests
+import aiohttp
+import asyncio
 import json
-import threading
 import os
 import datetime
-import queue
 import time
-from requests.exceptions import RequestException
+from concurrent.futures import ThreadPoolExecutor

 API_URL = os.environ.get('API_URL')
 API_KEY = os.environ.get('API_KEY')
@@ -26,21 +25,15 @@ DEFAULT_PARAMS = {
     "max_tokens": 512
 }

-class ThreadLocalStorage(threading.local):
-    def __init__(self):
-        self.stop_generation = False
-        self.active_requests = set()
-        self.lock = threading.Lock()
-
-thread_local = ThreadLocalStorage()
+thread_pool = ThreadPoolExecutor(max_workers=10)

 def get_timestamp():
     return datetime.datetime.now().strftime("%H:%M:%S")

-def predict(message, history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
-    with thread_local.lock:
-        thread_local.stop_generation = False
+should_stop = False

+async def predict(message, history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
+    global should_stop
     history_format = [{"role": "system", "content": system_prompt}]
     for human, assistant in history:
         history_format.append({"role": "user", "content": human})
@@ -81,44 +74,28 @@ def predict(message, history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
         "max_tokens": max_tokens
     }

-    try:
-        with requests.post(API_URL, headers=headers, json=data, stream=True) as response:
-            with thread_local.lock:
-                thread_local.active_requests.add(response)
+    async with aiohttp.ClientSession() as session:
+        async with session.post(API_URL, headers=headers, json=data) as response:
             partial_message = ""
-            for line in response.iter_lines():
-                …
-                if line:
-                    …
-            if partial_message:
-                yield partial_message
-
-    except RequestException as e:
-        error_message = f"Request error: {str(e)}"
-        print(error_message)
-        yield error_message
-    except Exception as e:
-        error_message = f"Unexpected error: {str(e)}"
-        print(error_message)
-        yield error_message
-    finally:
-        with thread_local.lock:
-            thread_local.active_requests.discard(response)
+            async for line in response.content:
+                if should_stop:
+                    break
+                line = line.decode('utf-8')
+                if line.startswith("data: "):
+                    if line.strip() == "data: [DONE]":
+                        break
+                    try:
+                        json_data = json.loads(line[6:])
+                        if 'choices' in json_data and json_data['choices']:
+                            content = json_data['choices'][0]['delta'].get('content', '')
+                            if content:
+                                partial_message += content
+                                yield partial_message
+                    except json.JSONDecodeError:
+                        continue
+
+            if partial_message:
+                yield partial_message

 def import_chat(custom_format_string):
     try:
@@ -154,17 +131,10 @@ def export_chat(history, system_prompt):
         export_data += f"<|assistant|> {assistant_msg}\n\n"
     return export_data

-def …
-    with thread_local.lock:
-        thread_local.stop_generation = True
-        for request in thread_local.active_requests:
-            try:
-                request.close()
-            except Exception as e:
-                print(f"Error closing request: {str(e)}")
-        thread_local.active_requests.clear()
-    time.sleep(0.1)
-    return gr.update(), gr.update()
+def stop_generation():
+    global should_stop
+    should_stop = True
+    return gr.update(interactive=True), gr.update(interactive=True)

 with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
     with gr.Row():
@@ -196,16 +166,19 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
         history = history or []
         return "", history + [[user_message, None]]

-    def bot(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
+    async def bot(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
+        global should_stop
+        should_stop = False
         history = history or []
         if not history:
-            …
+            yield history
+            return
         user_message = history[-1][0]
         bot_message = predict(user_message, history[:-1], system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens)
         history[-1][1] = ""
         try:
-            for chunk in bot_message:
-                if …
+            async for chunk in bot_message:
+                if should_stop:
                     break
                 history[-1][1] = chunk
                 yield history
@@ -213,45 +186,57 @@ with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
             print(f"Error in bot function: {str(e)}")
             history[-1][1] = "An error occurred while generating the response."
             yield history
+        finally:
+            should_stop = False

-    def regenerate_response(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
+    async def regenerate_response(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
+        global should_stop
+        should_stop = False
         if history and len(history) > 0:
             last_user_message = history[-1][0]
-            history[-1][1] = None
-            for new_history in bot(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
+            history[-1][1] = None
+            async for new_history in bot(history, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens):
+                if should_stop:
+                    break
                 yield new_history
         else:
             yield []
+        should_stop = False

     def import_chat_wrapper(custom_format_string):
         imported_history, imported_system_prompt = import_chat(custom_format_string)
         return imported_history, imported_system_prompt

-    submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot]).then(
-        bot, [chatbot, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens], chatbot
+    submit_event = msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+        bot, [chatbot, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens], chatbot,
+        concurrency_limit=10
     )

-    clear.click(lambda: None, None, chatbot)
+    clear.click(lambda: None, None, chatbot, queue=False)

     regenerate_event = regenerate.click(
         regenerate_response,
         [chatbot, system_prompt, temperature, top_p, top_k, frequency_penalty, presence_penalty, repetition_penalty, max_tokens],
-        chatbot
+        chatbot,
+        concurrency_limit=10
     )

     stop_btn.click(
-        …
+        stop_generation,
         inputs=[],
-        outputs=[…]
+        outputs=[msg, regenerate],
+        cancels=[submit_event, regenerate_event],
+        queue=False
     )

-    import_button.click(import_chat_wrapper, inputs=[import_textbox], outputs=[chatbot, system_prompt])
+    import_button.click(import_chat_wrapper, inputs=[import_textbox], outputs=[chatbot, system_prompt], queue=False)

     export_button.click(
         export_chat,
         inputs=[chatbot, system_prompt],
-        outputs=[import_textbox]
+        outputs=[import_textbox],
+        queue=False
     )

 if __name__ == "__main__":
-    demo.launch(debug=True, server_name="0.0.0.0", server_port=7860, share=True)
+    demo.launch(debug=True, server_name="0.0.0.0", server_port=7860, share=True, max_threads=40)