yasserrmd committed
Commit 1dbadd4 · verified · 1 Parent(s): 57a3b13

Create app.py

Files changed (1)
  1. app.py +222 -0
app.py ADDED
@@ -0,0 +1,222 @@
import os
import threading
import time
import torch
import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
)

MODEL_ID = os.getenv("MODEL_ID", "yasserrmd/SoftwareArchitecture-Instruct-v1")

# -------- Load model & tokenizer --------
print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
model.eval()
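# Note: device_map="auto" needs the `accelerate` package installed; it places the
# model on a GPU when one is available and falls back to CPU otherwise.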

# Ensure a pad token to avoid warnings on some bases
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token

TITLE = "SoftwareArchitecture-Instruct v1 — Chat"
DESCRIPTION = (
    "An instruction-tuned LLM for **software architecture**. "
    "Built on LiquidAI/LFM2-1.2B, fine-tuned with the Software-Architecture dataset. "
    "Designed for technical professionals: accurate, detailed, and on-topic answers."
)

SAMPLES = [
    "Explain the API Gateway pattern and when to use it.",
    "CQRS vs Event Sourcing — how do they relate, and when would you combine them?",
    "Design a resilient payment workflow with retries, idempotency keys, and DLQ.",
    "Rate limiting strategies for a public REST API: token bucket vs sliding window.",
    "Multi-tenant SaaS: compare shared DB, schema, and dedicated DB for isolation.",
    "Blue/green vs canary deployments — trade-offs and where each fits best.",
]

def format_history_as_messages(history):
    """
    Convert Gradio chat history into OpenAI-style messages for apply_chat_template.
    history: list of tuples (user, assistant)
    """
    messages = []
    for (u, a) in history:
        if u:
            messages.append({"role": "user", "content": u})
        if a:
            messages.append({"role": "assistant", "content": a})
    return messages
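# Example: format_history_as_messages([("Hi", "Hello!"), ("What is CQRS?", None)]) returns
#   [{"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello!"},
#    {"role": "user", "content": "What is CQRS?"}]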

def stream_generate(messages, max_new_tokens, temperature, top_p, repetition_penalty, seed=None):
    """
    Stream text from model.generate using TextIteratorStreamer.
    """
    if seed is not None and seed >= 0:
        torch.manual_seed(seed)

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # IMPORTANT for chat models
        return_tensors="pt",
        tokenize=True,
        return_dict=True,
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=float(temperature),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        do_sample=True if temperature > 0 else False,
        use_cache=True,
        streamer=streamer,
    )

    # Run generation in a thread so we can yield from streamer
    thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
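# Standalone usage of stream_generate (illustrative):
#   for text in stream_generate(
#       [{"role": "user", "content": "Explain the API Gateway pattern."}],
#       max_new_tokens=128, temperature=0.3, top_p=0.9, repetition_penalty=1.05,
#   ):
#       print(text)
# Each iteration yields the full text generated so far, not just the newest chunk.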

# -------- Gradio callbacks --------

def chat_respond(user_msg, chat_history, max_new_tokens, temperature, top_p, repetition_penalty, seed):
    if not user_msg or not user_msg.strip():
        # Nothing to do: leave the chatbot and the textbox unchanged.
        # (This function is a generator, so it must yield rather than return a value.)
        yield gr.update(), gr.update()
        return

    # Add user turn
    chat_history = chat_history + [(user_msg, None)]

    # Build messages from full history
    messages = format_history_as_messages(chat_history)

    # Stream assistant output
    stream = stream_generate(
        messages=messages,
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        repetition_penalty=float(repetition_penalty),
        seed=int(seed) if seed is not None else None,
    )

    # Yield progressive updates for the last assistant turn
    final_assistant_text = ""
    for chunk in stream:
        final_assistant_text = chunk
        yield gr.update(value=chat_history[:-1] + [(user_msg, final_assistant_text)]), ""

    # Ensure final state returned
    chat_history[-1] = (user_msg, final_assistant_text)
    yield gr.update(value=chat_history), ""
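# Because chat_respond is a generator, Gradio streams every yielded
# (chatbot, textbox) pair to the browser: the assistant reply grows in place
# while the input box is cleared on the first update.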

def use_sample(sample, chat_history):
    return sample, chat_history

def clear_chat():
    return []

# -------- UI --------

CUSTOM_CSS = """
:root {
  --brand: #0ea5e9; /* cyan-500 */
  --ink: #0b1220;
}
.gradio-container {
  font-family: Inter, ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, "Apple Color Emoji","Segoe UI Emoji";
}
#title h1 {
  font-weight: 700;
  letter-spacing: -0.02em;
}
#desc {
  opacity: 0.9;
}
footer {visibility: hidden}
"""

with gr.Blocks(css=CUSTOM_CSS, theme=gr.themes.Soft(primary_hue="cyan")) as demo:
    with gr.Row():
        with gr.Column():
            gr.HTML(f"<div id='title'><h1>{TITLE}</h1></div>")
            gr.Markdown(f"<div id='desc'>{DESCRIPTION}</div>", elem_id="desc")

    with gr.Row():
        with gr.Column(scale=4):
            chat = gr.Chatbot(
                label="SoftwareArchitecture-Instruct v1",
                avatar_images=(None, None),
                height=480,
                bubble_full_width=False,
                likeable=False,
                sanitize_html=False,
            )
            with gr.Row():
                user_box = gr.Textbox(
                    placeholder="Ask about software architecture…",
                    show_label=False,
                    lines=3,
                    autofocus=True,
                    scale=4,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            with gr.Accordion("Generation Settings", open=False):
                max_new_tokens = gr.Slider(64, 1024, value=256, step=16, label="Max new tokens")
                temperature = gr.Slider(0.0, 1.5, value=0.3, step=0.05, label="Temperature")
                top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
                repetition_penalty = gr.Slider(1.0, 1.5, value=1.05, step=0.01, label="Repetition penalty")
                seed = gr.Number(value=-1, precision=0, label="Seed (-1 for random)")

            with gr.Row():
                clear_btn = gr.Button("Clear", variant="secondary")
                # sample buttons
                sample_dropdown = gr.Dropdown(choices=SAMPLES, label="Samples", value=None)
                use_sample_btn = gr.Button("Use Sample")

        with gr.Column(scale=2):
            gr.Markdown("### Samples")
            gr.Markdown("\n".join([f"• {s}" for s in SAMPLES]))
            gr.Markdown("—\n**Tip:** Increase *Max new tokens* for longer, more complete answers.")

    # Events
    send_btn.click(
        chat_respond,
        inputs=[user_box, chat, max_new_tokens, temperature, top_p, repetition_penalty, seed],
        outputs=[chat, user_box],
        queue=True,
        show_progress=True,
    )
    user_box.submit(
        chat_respond,
        inputs=[user_box, chat, max_new_tokens, temperature, top_p, repetition_penalty, seed],
        outputs=[chat, user_box],
        queue=True,
        show_progress=True,
    )
    clear_btn.click(fn=clear_chat, outputs=chat)

    use_sample_btn.click(use_sample, inputs=[sample_dropdown, chat], outputs=[user_box, chat])

    gr.Markdown(
        "—\nBuilt for engineers and architects. Base model: **LiquidAI/LFM2-1.2B** · Fine-tuned: **Software-Architecture** dataset."
    )

if __name__ == "__main__":
    demo.queue().launch()
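
To try the app locally (a sketch; the commit pins no package versions): install `torch`, `transformers`, `gradio`, and `accelerate` (assumed here because of `device_map="auto"`), then run `python app.py`. Set the `MODEL_ID` environment variable to point the app at a different checkpoint.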