Spaces:

Krish45
/

JARVIS

Sleeping

App Files Community

Krish45 committed 8 days ago

Commit

dd3768c

verified ·

1 Parent(s): 74f9277

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -26

app.py CHANGED Viewed

@@ -1,24 +1,41 @@
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 model_name = "Qwen/Qwen2.5-0.5B-Instruct"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name, low_cpu_mem_usage=True, device_map="auto", torch_dtype="auto"
-)
 def predict(history, message):
-    """
-    history: list of [user, bot] message pairs from the Chatbot
-    message: new user input string
-    """
-    # Add the latest user message to the conversation
-    history = history or []  # make sure it's a list
     history.append((message, ""))
-    # Convert to messages format for Qwen
     messages = []
     for human, bot in history:
         if human:
@@ -26,28 +43,47 @@ def predict(history, message):
         if bot:
             messages.append({"role": "assistant", "content": bot})
-    # Apply chat template
     text = tokenizer.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-    # Generate response
-    generated_ids = model.generate(**model_inputs, max_new_tokens=512)
-    generated_ids = [
-        output_ids[len(input_ids):]
-        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-    ]
-    reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    # Update last message with bot reply
     history[-1] = (message, reply)
-    return history, ""  # return history + clear textbox
 with gr.Blocks() as demo:
-    chatbot = gr.Chatbot()
-    msg = gr.Textbox(placeholder="Type your message here...")
-    msg.submit(predict, [chatbot, msg], [chatbot, msg])
-demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)

 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+import threading
+import time
+import os
+# Model config
 model_name = "Qwen/Qwen2.5-0.5B-Instruct"
+offload_dir = "offload"
+# Global variables
+tokenizer = None
+model = None
+model_lock = threading.Lock()
+# Lazy-load the model with quantization & offloading
+def load_model():
+    global tokenizer, model
+    if model is None:
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # Ensure offload folder exists
+        os.makedirs(offload_dir, exist_ok=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            load_in_8bit=True,           # Quantize to 8-bit
+            device_map="auto",
+            offload_folder=offload_dir,  # Offload some weights to disk
+            torch_dtype=torch.float16
+        )
+# Chatbot prediction function
 def predict(history, message):
+    load_model()
+    history = history or []
     history.append((message, ""))
+    # Convert to Qwen message format
     messages = []
     for human, bot in history:
         if human:
         if bot:
             messages.append({"role": "assistant", "content": bot})
     text = tokenizer.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    reply = ""
+    try:
+        with model_lock:  # Serialize CPU inference safely
+            with torch.no_grad():
+                start = time.time()
+                generated_ids = model.generate(**model_inputs, max_new_tokens=256)
+                if time.time() - start > 30:  # 30s timeout
+                    reply = "[Response timed out]"
+                else:
+                    # Remove input_ids from output
+                    generated_ids = [
+                        output_ids[len(input_ids):]
+                        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+                    ]
+                    reply = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+    except Exception as e:
+        reply = f"[Error: {str(e)}]"
     history[-1] = (message, reply)
+    return history, ""
+# Keep-alive endpoint for local client ping
+def keep_alive(msg="ping"):
+    return "pong"
+# Gradio UI
 with gr.Blocks() as demo:
+    with gr.Tab("Chatbot"):
+        chatbot = gr.Chatbot()
+        msg = gr.Textbox(placeholder="Type your message here...")
+        msg.submit(predict, [chatbot, msg], [chatbot, msg])
+    with gr.Tab("Keep Alive"):
+        gr.Textbox(label="Ping", value="ping", interactive=False)
+        gr.Button("Ping").click(keep_alive, inputs=None, outputs=None)
+# Multi-user queue with concurrency
+demo.queue(concurrency_count=4, max_size=8)  # 4 simultaneous, 8 waiting
+demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)