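"""Gradio demo comparing three GGUF quantizations (Q4_K_M, Q5_K_M, Q8_0) of the
same fine-tuned Llama model for code generation, served with llama-cpp-python."""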
import gradio as gr
from llama_cpp import Llama
# Load models
llm = Llama.from_pretrained(
    repo_id="Robzy/lora_model_CodeData_120k",
    filename="unsloth.Q4_K_M.gguf",
)
llm2 = Llama.from_pretrained(
    repo_id="Robzy/lora_model_CodeData_120k",
    filename="unsloth.Q5_K_M.gguf",
)
llm3 = Llama.from_pretrained(
    repo_id="Robzy/lora_model_CodeData_120k",
    filename="unsloth.Q8_0.gguf",
)
# Define prediction functions
def predict(message, history):
    # Rebuild the conversation in OpenAI-style message format
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for user_message, bot_message in history:
        if user_message:
            messages.append({"role": "user", "content": user_message})
        if bot_message:
            messages.append({"role": "assistant", "content": bot_message})
    messages.append({"role": "user", "content": message})

    # Stream tokens from the 4-bit model, yielding the growing response
    response = ""
    for chunk in llm.create_chat_completion(
        stream=True,
        messages=messages,
    ):
        part = chunk["choices"][0]["delta"].get("content", None)
        if part:
            response += part
            yield response
def predict2(message, history):
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for user_message, bot_message in history:
        if user_message:
            messages.append({"role": "user", "content": user_message})
        if bot_message:
            messages.append({"role": "assistant", "content": bot_message})
    messages.append({"role": "user", "content": message})

    # Same streaming loop, but against the 5-bit model
    response = ""
    for chunk in llm2.create_chat_completion(
        stream=True,
        messages=messages,
    ):
        part = chunk["choices"][0]["delta"].get("content", None)
        if part:
            response += part
            yield response
def predict3(message, history):
    messages = [{"role": "system", "content": "You are a helpful assistant."}]
    for user_message, bot_message in history:
        if user_message:
            messages.append({"role": "user", "content": user_message})
        if bot_message:
            messages.append({"role": "assistant", "content": bot_message})
    messages.append({"role": "user", "content": message})

    # Same streaming loop, but against the 8-bit model
    response = ""
    for chunk in llm3.create_chat_completion(
        stream=True,
        messages=messages,
    ):
        part = chunk["choices"][0]["delta"].get("content", None)
        if part:
            response += part
            yield response
# Define ChatInterfaces
io1 = gr.ChatInterface(predict, title="4-bit")
io2 = gr.ChatInterface(predict2, title="5-bit")
io3 = gr.ChatInterface(predict3, title="8-bit")
# Map quantization level to its chat interface
chat_interfaces = {"4-bit": io1, "5-bit": io2, "8-bit": io3}
# Define UI
with gr.Blocks() as demo:
    gr.Markdown("# Quantized Llama Comparison for Code Generation")
    with gr.Tab("4-bit"):
        io1.render()
    with gr.Tab("5-bit"):
        io2.render()
    with gr.Tab("8-bit"):
        io3.render()

demo.launch()
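# Note: launch() starts a local server; passing share=True would also create a temporary public link.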