import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Load model from Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"

model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)

# Initialize Llama model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Adjust based on VRAM
    n_threads=8,  # Match CPU cores
    n_batch=512,  # Optimize for better VRAM usage
    n_ctx=4096,  # Context window size
    verbose=True  # Enable debug logging
)
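
# Optional sanity check (a minimal sketch, not part of the app flow): run a short
# non-streaming completion to confirm the GGUF weights loaded correctly.
# The prompt string below is illustrative only.
# _test = llama("### Instruction: Say hello.\n\n### Response:", max_tokens=32)
# print(_test["choices"][0]["text"])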


# def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
#     """Generates a streaming response from the Llama model."""
#     messages = [
#         {"role": "system", "content": "You are an Urdu Chatbot. Write an appropriate response for the given instruction."},
#     ]

#     # Add history and the current message
#     for user, bot in history:
#         messages.append({"role": "user", "content": user})
#         messages.append({"role": "assistant", "content": bot})

#     messages.append({"role": "user", "content": message})

#     response = llama.create_chat_completion(
#         messages=messages,
#         stream=True,
#     )

#     partial_message = ""
#     for part in response:
#         content = part["choices"][0]["delta"].get("content", "")
#         partial_message += content
#         yield partial_message


# Function to generate responses
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # Start the Alpaca-style prompt with the system prompt
    chat_prompt = system_prompt + "\n"
    
    # Add history to the prompt
    for user, bot in history:
        chat_prompt += f"\n### Instruction:{user}\n\n### Response:{bot}\n"
        
    # Add current message
    chat_prompt += f"\n### Instruction:{message}\n\n### Response:"

    print(chat_prompt)  # Debug: log the fully assembled prompt
    
    response = llama(
        chat_prompt,
        temperature=temperature,
        max_tokens=max_new_tokens,
        top_k=top_k,
        repeat_penalty=repetition_penalty,
        top_p=top_p,
        stop=["Q:", "\n"],
        echo=False,
        stream=True,
    )
    
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
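
# For reference, with one prior turn the prompt assembled above looks roughly like
# (illustrative; exact whitespace follows the f-strings in generate_response):
#
#   <system prompt>
#
#   ### Instruction:<previous user message>
#
#   ### Response:<previous bot reply>
#
#   ### Instruction:<current message>
#
#   ### Response: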


# JavaScript function for `on_load`
on_load = """
async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); }
"""

placeholder = """
<center><h1>10 Questions</h1><br>Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
</center>
"""

# Create custom chat UI using `gr.Blocks`
with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
    with gr.Column(scale=1, elem_id="center-content"):
        gr.Markdown(
            """
            <div style="text-align: center;">
                <h1>Alif 1.0 Urdu & English Educator 🚀</h1>
                <p>Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It utilizes human-refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in English and Urdu.</p>
            </div>
            """,
        )
    
    chat = gr.ChatInterface(
        generate_response,
        #chatbot=gr.Chatbot(placeholder=placeholder),
        #title="🚀" + " " + "Alif-1.0 Chatbot",
        #description="Urdu AI Chatbot powered by Llama.cpp",
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],    # "Tell me about the city of Karachi"
            ["قابل تجدید توانائی کیا ہے؟"],     # "What is renewable energy?"
            ["پاکستان کے بارے میں بتائیں"]      # "Tell me about Pakistan"
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(value="Below is an instruction that describes a task. Write a response that appropriately completes the request.", label="System prompt", render=False),
            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
            gr.Slider(128, 8192, 4096, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
    )

demo.queue(max_size=10).launch(share=True)
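
# To run locally without creating a public Gradio link (a sketch; assumes the
# default port 7860 is free), the launch call above can be swapped for:
# demo.queue(max_size=10).launch(share=False, server_name="0.0.0.0", server_port=7860)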