import gradio as gr import os import requests from llama_cpp import Llama llm_name = "MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF" llm_path = os.path.basename(llm_name) gguf_model = "Q4_K_M.gguf" # "Q6_K.gguf" # download gguf model def download_llms(llm_name): """Download GGUF model""" download_url = "" print("Downloading " + llm_name) download_url = f"https://huggingface.co/MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF/resolve/main/{gguf_model}" if not os.path.exists("model"): os.makedirs("model") llm_filename = os.path.basename(download_url) llm_temp_file_path = os.path.join("model", llm_filename) if os.path.exists(llm_temp_file_path): print("Model already available") else: response = requests.get(download_url, stream=True) if response.status_code == 200: with open(llm_temp_file_path, 'wb') as f: for chunk in response.iter_content(chunk_size=1024): if chunk: f.write(chunk) print("Download completed") else: print(f"Model download unsuccessful {response.status_code}") # define model pipeline with llama-cpp def initialize_llm(llm_model): model_path = "" if llm_model == llm_name: model_path = f"model/{gguf_model}" download_llms(llm_model) llm = Llama( model_path=model_path, n_ctx=1024, # input text context length, 0 = from model verbose=False ) return llm llm = initialize_llm(llm_name) # format prompt as per the ChatML template. The model was fine-tuned with this chat template def format_prompt(input_text, history): system_prompt = """You are a helpful AI assistant. You are truthful in your response for real-world matters but you are also creative for imaginative/fictional tasks.""" # system_prompt = """You are a large language model (LLM). Your primary role is to assist users by providing information, answering questions, # and engaging in conversation. You are expected to be a subject matter expert across a wide range of topics. When asked about a particular subject, # you should provide detailed, accurate, and up-to-date information. # When a question involves reasoning or analysis, you should apply your reasoning and analytical skills to provide a well-thought-out response. # You should strive to be truthful and accurate in your responses, especially when dealing with practical or real-life problems. # However, you are not just a source of information. You are also capable of being creative and imaginative. # When asked to engage in storytelling or to describe fictional characters and objects, you should use your creativity to generate engaging and imaginative content. # Your goal is to understand the user's query and generate the best possible answer. You should adapt your response based on the nature of the query, # whether it requires factual information, analytical reasoning, or creative imagination. You are a great all-rounder, capable of handling # a wide variety of tasks and queries. # """ prompt = "" if history: for previous_prompt, response in history: prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{previous_prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>" prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant" return prompt # generate llm response def generate(prompt, history, max_new_tokens=512): # temperature=0.95, top_p=0.9 if not history: history = [] # temperature = float(temperature) # top_p = float(top_p) kwargs = dict( # temperature=temperature, max_tokens=max_new_tokens, # top_p=top_p, stop=["<|im_end|>"] ) formatted_prompt = format_prompt(prompt, history) # generate a streaming response response = llm(formatted_prompt, **kwargs, stream=True) output = "" for chunk in response: output += chunk['choices'][0]['text'] yield output return output # # generate response without streaming # response = llm(formatted_prompt, **kwargs) # return response['choices'][0]['text'] chatbot = gr.Chatbot(height=500) with gr.Blocks(theme=gr.themes.Default(primary_hue="sky")) as demo: gr.HTML("

Fine-tuned Meta-Llama-3-8B Chatbot

") gr.Markdown("This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.") gr.ChatInterface( generate, chatbot=chatbot, retry_btn=None, undo_btn=None, clear_btn="Clear", # description="This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.", # additional_inputs=additional_inputs, examples=[["What is a large language model?"], ["What is the meaning of life?"], ["Write a short story about a fictional planet named 'Orca'."]] # examples=[["Julie had 17 plums, she ate 12 of them, threw away 3 of them as they turned bad and bought 5 new. How many plums does she have now?"], # ["Write a short story about a fictional planet named 'Orca'."]] ) demo.queue().launch()