MuntasirHossain committed
Update app.py
app.py CHANGED
@@ -2,9 +2,7 @@ import gradio as gr
 import os
 import requests
 from llama_cpp import Llama
-
-import transformers
-import torch
+
 
 llm_name = "MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF"
 llm_path = os.path.basename(llm_name)
@@ -12,15 +10,9 @@ llm_path = os.path.basename(llm_name)
 # download gguf model
 def download_llms(llm_name):
     """Download GGUF model"""
-
     download_url = ""
     print("Downloading " + llm_name)
     download_url = "https://huggingface.co/MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF/resolve/main/Q4_K_M.gguf"
-
-    # elif selected_llm == 'microsoft/Phi-3-mini-4k-instruct':
-    #     download_url = "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf"
-    # elif selected_llm == 'mistralai/Mistral-7B-Instruct-v0.2':
-    #     download_url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q2_K.gguf"
 
     if not os.path.exists("model"):
         os.makedirs("model")
@@ -43,42 +35,33 @@ def download_llms(llm_name):
     print(f"Model download completed {response.status_code}")
 
 # define model pipeline with llama-cpp
-def initialize_llm(llm_model):
+def initialize_llm(llm_model):
     model_path = ""
     if llm_model == llm_name:
         model_path = "model/Q4_K_M.gguf"
         download_llms(llm_model)
     llm = Llama(
         model_path=model_path,
-        #
-        # max_tokens=256,
-        # top_p=1,
-        # top_k= top_k,
-        n_ctx=1024,
+        n_ctx=1024, # input text context length, 0 = from model
         verbose=False
     )
     return llm
 
 llm = initialize_llm(llm_name)
 
-# format prompt as per the
+# format prompt as per the ChatML template. The model was fine-tuned with this chat template
 def format_prompt(input_text, history):
-    system_prompt = "You are a helpful AI assistant. You are truthful in your response
+    system_prompt = """You are a helpful AI assistant. You are truthful in your response for real-world matters
+    but you are also creative for imaginative/fictional tasks."""
     prompt = ""
     if history:
         for previous_prompt, response in history:
             prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{previous_prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"
-            # <start_of_turn>user
-            # {previous_prompt}<end_of_turn>
-            # <start_of_turn>model
-            # {response}<end_of_turn>
     prompt += f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
-    # <start_of_turn>user
-    # {input_text}<end_of_turn>
-    # <start_of_turn>model"""
     return prompt
 
-def generate(prompt, history, max_new_tokens=256): # temperature=0.95, top_p=0.9
+# generate llm response
+def generate(prompt, history, max_new_tokens=256): # temperature=0.95, top_p=0.9
     if not history:
         history = []
 
@@ -89,39 +72,35 @@ def generate(prompt, history, max_new_tokens=256): # temperature=0.95, top_p=0.9
         # temperature=temperature,
         max_tokens=max_new_tokens,
         # top_p=top_p,
-        # repetition_penalty=repetition_penalty,
-        # do_sample=True,
         stop=["<|im_end|>"]
     )
 
     formatted_prompt = format_prompt(prompt, history)
 
+    # generate a streaming response
     response = llm(formatted_prompt, **kwargs, stream=True)
     output = ""
     for chunk in response:
         output += chunk['choices'][0]['text']
         yield output
     return output
-
-    #
-    # output += chunk.token.text
-    # yield output
-    # return output
-
+
+    # # generate response without streaming
     # response = llm(formatted_prompt, **kwargs)
     # return response['choices'][0]['text']
 
 chatbot = gr.Chatbot(height=500)
-with gr.Blocks(theme=gr.themes.
+with gr.Blocks(theme=gr.themes.Default(primary_hue="red", secondary_hue="pink")) as demo:
     gr.HTML("<center><h1>Fine-tuned Meta-Llama-3-8B</h1><center>")
+    gr.Markdown("<b>This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.</b>")
     gr.ChatInterface(
        generate,
        chatbot=chatbot,
        retry_btn=None,
        undo_btn=None,
        clear_btn="Clear",
-        description="This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation",
+        # description="This AI agent is using the MuntasirHossain/Meta-Llama-3-8B-OpenOrca-GGUF model for text-generation.",
        # additional_inputs=additional_inputs,
-        examples=[["
+        examples=[["What is a large language model?"], ["What is the meaning of life?"], ["Write a short fictional story about a planet named 'Orca'."]]
     )
     demo.queue().launch()
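For context, a minimal usage sketch of the pattern this update settles on, assuming model/Q4_K_M.gguf has already been downloaded: the prompt string follows the same ChatML template that format_prompt builds, and the streaming loop mirrors generate. The user_prompt text below is illustrative only.

from llama_cpp import Llama

# Load the quantized model the same way initialize_llm does (assumes model/Q4_K_M.gguf exists)
llm = Llama(model_path="model/Q4_K_M.gguf", n_ctx=1024, verbose=False)

# Single-turn ChatML prompt, matching the template used in format_prompt
system_prompt = "You are a helpful AI assistant."
user_prompt = "What is a large language model?"
prompt = (
    f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
    f"<|im_start|>assistant"
)

# Stream the completion and accumulate the text deltas, as generate() does
output = ""
for chunk in llm(prompt, max_tokens=256, stop=["<|im_end|>"], stream=True):
    output += chunk["choices"][0]["text"]
print(output)

Streaming with stream=True and yielding the accumulated text is what lets the Gradio ChatInterface render the reply incrementally instead of waiting for the full completion.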