import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Download the GGUF weights from the Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"
model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
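# hf_hub_download caches the file locally (under ~/.cache/huggingface by default),
# so the weights are only fetched on the first run.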
# Initialize the llama.cpp model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Layers offloaded to the GPU; adjust based on available VRAM
    n_threads=8,      # Match the number of physical CPU cores
    n_batch=512,      # Prompt-processing batch size; larger uses more VRAM
    n_ctx=4096,       # Context window size in tokens
    verbose=True      # Enable llama.cpp debug logging
)
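# A CPU-only fallback (an assumption, not part of the original deployment) would
# simply drop the GPU offload and keep the rest unchanged, e.g.:
#
#   llama = Llama(model_path=model_path_file, n_gpu_layers=0, n_threads=8, n_ctx=4096)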
# Function to generate streaming responses
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    """Build an Alpaca-style prompt from the chat history and stream the model's reply."""
    chat_prompt = system_prompt + "\n"

    # Replay the conversation history in Alpaca instruction/response format
    for user, bot in history:
        chat_prompt += f"\n### Instruction:{user}\n\n### Response:{bot}\n"

    # Append the current message, leaving the Response open for the model to complete
    chat_prompt += f"\n### Instruction:{message}\n\n### Response:"
    print(chat_prompt)  # Debug: log the assembled prompt

    response = llama(
        chat_prompt,
        temperature=temperature,
        max_tokens=max_new_tokens,
        top_k=top_k,
        repeat_penalty=repetition_penalty,
        top_p=top_p,
        stop=["Q:", "\n"],  # Note: stopping on "\n" ends the reply at the first newline
        echo=False,
        stream=True,
    )

    # Accumulate streamed chunks and yield the growing text so the UI updates live
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
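# For reference, a one-turn history produces a prompt shaped like this
# (illustrative; the <angle-bracket> placeholders are not literal text):
#
#   <system prompt>
#
#   ### Instruction:<earlier user message>
#
#   ### Response:<earlier bot reply>
#
#   ### Instruction:<current message>
#
#   ### Response: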
# JavaScript alert shown once on page load (passed to gr.Blocks via `js=on_load`)
on_load = """
async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); }
"""
placeholder = """
<center><h1>10 Questions</h1><br>Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
</center>
"""
# Create the custom chat UI using gr.Blocks
with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
    with gr.Column(scale=1, elem_id="center-content"):
        gr.Markdown(
            """
            <div style="text-align: center;">
            <h1>Alif 1.0 Urdu & English Educator 🚀</h1>
            <p>Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It uses human-refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in English and Urdu.</p>
            </div>
            """,
        )
    chat = gr.ChatInterface(
        generate_response,
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],  # "Tell me about the city of Karachi"
            ["قابل تجدید توانائی کیا ہے؟"],  # "What is renewable energy?"
            ["پاکستان کے بارے میں بتائیں"],  # "Tell me about Pakistan"
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value="Below is an instruction that describes a task. Write a response that appropriately completes the request.", label="System prompt", render=False),
            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
            # Capped at n_ctx (4096) so the prompt plus completion always fit in the context window
            gr.Slider(128, 4096, 2048, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
    )
# Queue up to 10 concurrent requests and launch with a public share link
demo.queue(max_size=10).launch(share=True)