Spaces:

MuntasirHossain
/

Fine-tuned-Llama-3-8B-Chatbot

Running

App Files Files Community

Fine-tuned-Llama-3-8B-Chatbot / app.py

MuntasirHossain

Update app.py

f461ba8 verified 9 months ago

raw

history blame

3.81 kB

	import gradio as gr
	import os
	import requests
	from llama_cpp import Llama

	llm_name = "MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF"
	llm_path = os.path.basename(llm_name)

	# download gguf model
	def download_llms(llm_name):
	"""Download GGUF model"""
	download_url = ""
	print("Downloading " + llm_name)
	download_url = "https://huggingface.co/MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF/resolve/main/Q4_K_M.gguf"

	if not os.path.exists("model"):
	os.makedirs("model")

	llm_filename = os.path.basename(download_url)
	llm_temp_file_path = os.path.join("model", llm_filename)

	if os.path.exists(llm_temp_file_path):
	print("Model already available")
	else:
	response = requests.get(download_url, stream=True)
	if response.status_code == 200:
	with open(llm_temp_file_path, 'wb') as f:
	for chunk in response.iter_content(chunk_size=1024):
	if chunk:
	f.write(chunk)

	print("Download completed")
	else:
	print(f"Model download completed {response.status_code}")

	# define model pipeline with llama-cpp
	def initialize_llm(llm_model):
	model_path = ""
	if llm_model == llm_name:
	model_path = "model/Q4_K_M.gguf"
	download_llms(llm_model)
	llm = Llama(
	model_path=model_path,
	n_ctx=1024, # input text context length, 0 = from model
	verbose=False
	)
	return llm

	llm = initialize_llm(llm_name)

	# format prompt as per the ChatML template. The model was fine-tuned with this chat template
	def format_prompt(input_text, history):
	system_prompt = """You are a helpful AI assistant. You are truthful in your response for real-world matters
	but you are also creative for imaginative/fictional tasks."""
	prompt = ""
	if history:
	for previous_prompt, response in history:
	prompt += f"<\|im_start\|>system\n{system_prompt}<\|im_end\|>\n<\|im_start\|>user\n{previous_prompt}<\|im_end\|>\n<\|im_start\|>assistant\n{response}<\|im_end\|>"
	prompt += f"<\|im_start\|>system\n{system_prompt}<\|im_end\|>\n<\|im_start\|>user\n{input_text}<\|im_end\|>\n<\|im_start\|>assistant"
	return prompt

	# generate llm response
	def generate(prompt, history, max_new_tokens=512): # temperature=0.95, top_p=0.9
	if not history:
	history = []

	# temperature = float(temperature)
	# top_p = float(top_p)

	kwargs = dict(
	# temperature=temperature,
	max_tokens=max_new_tokens,
	# top_p=top_p,
	stop=["<\|im_end\|>"]
	)

	formatted_prompt = format_prompt(prompt, history)

	# generate a streaming response
	response = llm(formatted_prompt, **kwargs, stream=True)
	output = ""
	for chunk in response:
	output += chunk['choices'][0]['text']
	yield output
	return output

	# # generate response without streaming
	# response = llm(formatted_prompt, **kwargs)
	# return response['choices'][0]['text']

	chatbot = gr.Chatbot(height=500)
	with gr.Blocks(theme=gr.themes.Default(primary_hue="sky")) as demo:
	gr.HTML("<center><h1>Fine-tuned Meta-Llama-3-8B Chatbot</h1><center>")
	gr.Markdown("<b>This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.</b>")
	gr.ChatInterface(
	generate,
	chatbot=chatbot,
	retry_btn=None,
	undo_btn=None,
	clear_btn="Clear",
	# description="This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.",
	# additional_inputs=additional_inputs,
	examples=[["What is a large language model?"], ["What is the meaning of life?"], ["Write a short story about a fictional planet named 'Orca'."]]
	)
	demo.queue().launch()