import os
import json
from typing import List, Tuple
from collections import OrderedDict

import gradio as gr

from shared import Client

# CONFIG is a JSON object mapping model names to their settings. Assumed shape,
# inferred from the lookups below, e.g.:
#   {"stable": {"api_url": "STABLE_API_URL", "api_key": "STABLE_API_KEY",
#               "personas": {"default": "You are ..."}}}
# where "api_url" and "api_key" name environment variables holding the actual values.
config = json.loads(os.environ['CONFIG'])
model_names = list(config.keys())

# One backend client per configured model.
clients = {}
for name in config:
    model_personas = config[name].get("personas", {})
    client = Client(
        api_url=os.environ[config[name]['api_url']],
        api_key=os.environ[config[name]['api_key']],
        personas=model_personas,
    )
    clients[name] = client

# All persona names across models, deduplicated while preserving order.
personas = list(OrderedDict.fromkeys(
    persona
    for name in model_names
    for persona in clients[name].personas
))

# Overview text shown in the UI: each model with its vLLM model name and personas.
info = "\n\n".join([
    f"{model} ({clients[model].vllm_model_name}):\n{list(clients[model].personas.keys())}"
    for model in model_names
])


def respond(
    message,
    history: List[Tuple[str, str]],
    persona,
    model,
    info,
    conversational,
    max_tokens,
):
    client = clients[model]

    messages = []

    try:
        system_prompt = client.personas[persona]
    except KeyError:
        supported_personas = list(client.personas.keys())
        raise gr.Error(
            f"Model '{model}' does not support persona '{persona}', "
            f"only {supported_personas}"
        )
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})

    # In conversational mode, replay the two most recent exchanges for context.
    if conversational:
        for user_msg, assistant_msg in history[-2:]:
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

    messages.append({"role": "user", "content": message})

    # Greedy decoding (temperature 0); vLLM-specific sampling options are
    # passed through extra_body.
    completion = client.openai.chat.completions.create(
        model=client.vllm_model_name,
        messages=messages,
        max_tokens=max_tokens,
        temperature=0,
        extra_body={
            "repetition_penalty": 1.05,
            "use_beam_search": True,
            "best_of": 5,
        },
    )
    return completion.choices[0].message.content


demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Radio(choices=personas, value="default", label="persona"),
        gr.Radio(choices=model_names, value="stable", label="model"),
        gr.Textbox(value=info, interactive=False, label="info"),
        gr.Checkbox(value=True, label="conversational"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    ],
    additional_inputs_accordion=gr.Accordion(label="Config", open=True),
    title="NeonLLM (v2024-07-03)",
    concurrency_limit=5,
)

if __name__ == "__main__":
    demo.launch()