"""Gradio chat demo that drafts GiveSendGo campaign titles and pitches.

A Llama-2 chat model is served through llama-cpp-python and asked to answer
with a JSON object (title, sample pitch, amount, location).  ``predict``
streams the answer into the chat UI; ``predict_api`` exposes a one-shot,
non-streaming variant under the ``/api`` endpoint.
"""

import llama_cpp
import llama_cpp.llama_tokenizer
from llama_cpp import Llama
import gradio as gr
from loguru import logger
import psutil
from ctransformers import AutoModelForCausalLM, AutoTokenizer

# Llama-2 chat layout: the system text sits between <<SYS>> and <</SYS>>.
prompt_template = """[INST] <<SYS>>
You are a helpful assistant for a crowdfunding platform called GiveSendGo. Your goal is to gather essential information for campaign and generate a title and sample pitch of atleast 1000 words for the campaign.
<</SYS>>

{question} [/INST]
"""

# Hugging Face repository and the quantised GGUF file to pull from it.
model_loc = "TheBloke/Llama-2-13B-chat-GGUF"
model_file = "llama-2-13b-chat.Q5_K_M.gguf"

CONTEXT_LENGTH = 4096
MAX_NEW_TOKENS = 2048

# The chat-completion API used below belongs to llama_cpp.Llama, so the model
# has to be loaded through llama-cpp-python (a ctransformers model does not
# provide create_chat_completion_openai_v1).
llama = Llama.from_pretrained(
    repo_id=model_loc,
    filename=model_file,
    n_ctx=CONTEXT_LENGTH,
    verbose=False,
)

# Derived from the second-to-last non-blank template line; only logged here.
_template_lines = [elm for elm in prompt_template.splitlines() if elm.strip()]
stop_string = [elm.split(":")[0] + ":" for elm in _template_lines][-2]
logger.debug(f"{stop_string=}")

# Leave one physical core free, but never go below one.  psutil may return
# None when the core count cannot be determined.
_physical_cores = psutil.cpu_count(logical=False) or 1
cpu_count: int = max(int(_physical_cores) - 1, 1)
logger.debug(f"{cpu_count=}")

# Label forwarded as the ``model`` argument of every completion request.
model = "gpt-3.5-turbo"

# JSON schema the model output is constrained to.
_RESPONSE_FORMAT = {
    "type": "json_object",
    "schema": {
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "sample_pitch": {"type": "string"},
            "amount": {"type": "string"},
            "location": {"type": "string"},
        },
        "required": ["title", "sample_pitch", "amount", "location"],
    },
}


def _build_messages(message, history=()):
    """Return the chat message list for *message* and the prior *history*.

    The system prompt is added exactly once, at the start, so it is present
    on the first turn and not duplicated on later ones.

    Args:
        message: The newest user message.
        history: Iterable of ``(user_message, assistant_message)`` pairs.
    """
    prompt = prompt_template.format(question=message)
    messages = [{"role": "system", "content": prompt}]
    for user_message, assistant_message in history:
        messages.append({"role": "user", "content": user_message})
        messages.append({"role": "assistant", "content": assistant_message})
    messages.append({"role": "user", "content": message})
    return messages


def _create_completion(messages, *, stream):
    """Request a schema-constrained chat completion from the loaded model."""
    return llama.create_chat_completion_openai_v1(
        model=model,
        messages=messages,
        response_format=_RESPONSE_FORMAT,
        temperature=0.7,
        max_tokens=MAX_NEW_TOKENS,
        stream=stream,
    )


def predict(message, history):
    """Stream the model's reply for the chat UI.

    Args:
        message: The newest user message.
        history: Prior ``(user_message, assistant_message)`` pairs.

    Yields:
        The reply accumulated so far, once per chunk that carries text.
    """
    response = _create_completion(_build_messages(message, history), stream=True)

    text = ""
    for chunk in response:
        content = chunk.choices[0].delta.content
        if content:
            text += content
            yield text


def generate(message):
    """Return the complete (non-streamed) model reply for *message*.

    This is a best-effort call: any failure is logged and reported back as a
    string instead of being raised.
    """
    try:
        response = _create_completion(_build_messages(message), stream=False)
        logger.debug(f"{response}")
        # Non-streamed completions carry the text on ``message``; ``delta``
        # only exists on streamed chunks.
        return response.choices[0].message.content
    except Exception as exc:
        logger.exception(exc)
        return f"{exc=}"


def predict_api(message):
    """Handle the ``/api`` endpoint: run ``generate`` and wrap its result."""
    logger.debug(f"{message=}")
    text = generate(message)
    logger.debug(f"text::{text=}")
    return f"json: {text=}"


# Reload the page once with the dark theme query parameter.
js = """function () {
  gradioURL = window.location.href
  if (!gradioURL.endsWith('?__theme=dark')) {
    window.location.replace(gradioURL + '?__theme=dark');
  }
}"""

css = """
footer {
    visibility: hidden;
}
full-height {
    height: 100%;
}
"""

with gr.Blocks(theme=gr.themes.Soft(), js=js, css=css) as demo:
    gr.ChatInterface(
        predict,
        examples=[
            "What is the capital of France?",
            "Who was the first person on the moon?",
        ],
    )

    # Hidden widgets that exist only to expose predict_api as an endpoint.
    with gr.Accordion("For Chat/Translation API", open=False, visible=False):
        input_text = gr.Text()
        api_btn = gr.Button("Go", variant="primary")
        out_text = gr.Text()

    api_btn.click(
        predict_api,
        input_text,
        out_text,
        api_name="api",
    )

if __name__ == "__main__":
    demo.queue().launch(debug=True, share=True)