MuntasirHossain's picture
Update app.py
f461ba8 verified
raw
history blame
3.81 kB
import gradio as gr
import os
import requests
from llama_cpp import Llama
# Hugging Face repository id of the GGUF model this app loads and chats with.
llm_name = "MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF"
# Last path component of the repository id ("Meta-Llama-3-8B-OpenOrca-GGUF").
# NOTE(review): not referenced anywhere else in the visible code.
llm_path = os.path.basename(llm_name)
# download gguf model
def download_llms(llm_name):
    """Download the Q4_K_M GGUF weights into the local ``model`` directory.

    The download URL is fixed; ``llm_name`` is only used in the progress
    message. If ``model/Q4_K_M.gguf`` already exists, nothing is fetched.

    The response body is streamed into a ``.part`` file that is renamed onto
    the final path only after the transfer has finished, so an interrupted
    run never leaves a truncated file that a later run would report as
    "Model already available".

    Args:
        llm_name: Model identifier shown in the log output.

    Returns:
        None. A non-200 response is reported on stdout and no file is
        written.
    """
    print("Downloading " + llm_name)
    download_url = "https://huggingface.co/MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF/resolve/main/Q4_K_M.gguf"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("model", exist_ok=True)
    llm_filename = os.path.basename(download_url)
    llm_temp_file_path = os.path.join("model", llm_filename)

    if os.path.exists(llm_temp_file_path):
        print("Model already available")
        return

    partial_path = llm_temp_file_path + ".part"
    # The context manager releases the streamed connection on every exit
    # path. The timeout is (connect, read-between-bytes), not a cap on the
    # total transfer time, so large downloads are unaffected.
    with requests.get(download_url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            print(f"Model download failed with status code {response.status_code}")
            return
        try:
            with open(partial_path, "wb") as f:
                # 1 MiB chunks: far fewer write calls than 1 KiB for a
                # multi-gigabyte file.
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
            # Publish the file only once it is complete.
            os.replace(partial_path, llm_temp_file_path)
        finally:
            # After a successful rename the partial file no longer exists;
            # after a failure this removes the truncated leftovers.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print("Download completed")
# build the llama-cpp model object used for inference
def initialize_llm(llm_model):
    """Return a ``Llama`` instance for ``llm_model``.

    When ``llm_model`` equals the module-level ``llm_name``, the GGUF file is
    downloaded (if needed) and loaded from ``model/Q4_K_M.gguf``. For any
    other value no download happens and an empty model path is passed to
    ``Llama``.
    """
    is_known_model = llm_model == llm_name
    if is_known_model:
        download_llms(llm_model)
    gguf_path = "model/Q4_K_M.gguf" if is_known_model else ""
    return Llama(
        model_path=gguf_path,
        n_ctx=1024,  # input text context length, 0 = from model
        verbose=False,
    )
# Build the model object once at import time; generate() reads this global.
llm = initialize_llm(llm_name)
# format prompt as per the ChatML template. The model was fine-tuned with this chat template
def format_prompt(input_text, history):
    """Render a conversation as a single ChatML prompt string.

    The layout is one system message, then each past exchange as a
    user/assistant pair, then the new user message, and finally an open
    assistant header for the model to complete. Every message is terminated
    by ``<|im_end|>`` followed by a newline.

    Args:
        input_text: The new user message.
        history: Iterable of ``(user_message, assistant_response)`` pairs
            from earlier turns; ``None`` or an empty sequence means there
            are no earlier turns.

    Returns:
        The prompt string, ending in ``<|im_start|>assistant\\n``.
    """
    system_prompt = (
        "You are a helpful AI assistant. You are truthful in your response for real-world matters\n"
        "but you are also creative for imaginative/fictional tasks."
    )
    # The system message appears once, at the top. Repeating it before every
    # turn is not ChatML and wastes the limited context window.
    segments = [f"<|im_start|>system\n{system_prompt}<|im_end|>\n"]
    for previous_prompt, response in history or []:
        segments.append(
            f"<|im_start|>user\n{previous_prompt}<|im_end|>\n"
            f"<|im_start|>assistant\n{response}<|im_end|>\n"
        )
    # Leave the assistant turn open (header plus newline) for generation.
    segments.append(
        f"<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n"
    )
    return "".join(segments)
# stream the model's reply for the chat UI
def generate(prompt, history, max_new_tokens=512):
    """Yield the assistant's reply as it grows, one streamed chunk at a time.

    The conversation is rendered with ``format_prompt`` and passed to the
    module-level ``llm`` with streaming enabled. Generation stops at
    ``<|im_end|>`` or after ``max_new_tokens`` tokens.

    Args:
        prompt: The new user message.
        history: Earlier turns, passed through to ``format_prompt``; a falsy
            value is replaced by an empty list.
        max_new_tokens: Upper bound on the number of generated tokens.

    Yields:
        The accumulated reply text after each streamed chunk.
    """
    past_turns = history if history else []

    # Sampling options such as temperature / top_p are not passed; the
    # llama-cpp defaults apply. With stream=True the call yields chunks
    # instead of returning one completed response.
    stream = llm(
        format_prompt(prompt, past_turns),
        max_tokens=max_new_tokens,
        stop=["<|im_end|>"],
        stream=True,
    )

    reply_so_far = ""
    for piece in stream:
        reply_so_far = reply_so_far + piece["choices"][0]["text"]
        yield reply_so_far
    return reply_so_far
# Chat display component, created up front and handed to ChatInterface below.
chatbot = gr.Chatbot(height=500)

with gr.Blocks(theme=gr.themes.Default(primary_hue="sky")) as demo:
    # The heading's <center> element is closed with </center>; the markup
    # previously ended with a second opening tag.
    gr.HTML("<center><h1>Fine-tuned Meta-Llama-3-8B Chatbot</h1></center>")
    gr.Markdown("<b>This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.</b>")
    gr.ChatInterface(
        generate,
        chatbot=chatbot,
        # Retry and undo buttons are disabled; only "Clear" is offered.
        retry_btn=None,
        undo_btn=None,
        clear_btn="Clear",
        examples=[["What is a large language model?"], ["What is the meaning of life?"], ["Write a short story about a fictional planet named 'Orca'."]],
    )

# Enable the request queue, then start the web server.
demo.queue().launch()