import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Load model from Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"

model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
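# hf_hub_download stores the GGUF file in the local Hugging Face cache and
# returns the cached path, so repeated launches skip the download.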

# Initialize Llama model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Adjust based on VRAM
    n_threads=8,  # Match CPU cores
    n_batch=512,  # Optimize for better VRAM usage
    n_ctx=4096,  # Context window size
    verbose=True  # Enable debug logging
)
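# Note: n_gpu_layers only takes effect when llama-cpp-python is built with GPU
# support; on a CPU-only build the model runs entirely on the CPU.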

CHAT_TEMPLATE = "Alif Chat"
CONTEXT_LENGTH = 4096
COLOR = "blue"
EMOJI = "💬"
DESCRIPTION = "Urdu AI Chatbot powered by Llama.cpp"

# Generate a streaming response, wiring the UI sampling controls into llama.cpp
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # Fall back to the default Urdu instruction prompt when no system prompt is set
    if not system_prompt:
        system_prompt = "You are an Urdu Chatbot. Write an appropriate response for the given instruction:"
    chat_prompt = f"{system_prompt} {message} Response:"
    response = llama(
        chat_prompt, max_tokens=max_new_tokens, temperature=temperature,
        top_k=top_k, top_p=top_p, repeat_penalty=repetition_penalty,
        stop=["Q:", "\n"], echo=False, stream=True
    )

    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
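
# gr.ChatInterface passes the values of additional_inputs, in order, as extra
# positional arguments after (message, history), matching the signature above.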

# Create Gradio interface
with gr.Blocks() as demo:
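    # render=False defers rendering so gr.ChatInterface can place the component itself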
    chatbot = gr.Chatbot(label="Urdu Chatbot", likeable=True, render=False)
    chat = gr.ChatInterface(
        generate_response,
        chatbot=chatbot,
        title=EMOJI + " " + "Alif-1.0 Chatbot",
        description=DESCRIPTION,
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],
            ["قابل تجدید توانائی کیا ہے؟"],
            ["پاکستان کی تاریخ کے بارے میں بتائیں۔"]
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox("", label="System prompt", render=False),
            gr.Slider(0, 1, 0.6, label="Temperature", render=False),
            gr.Slider(128, CONTEXT_LENGTH, 1024, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
        theme=gr.themes.Soft(primary_hue=COLOR),
    )

demo.queue(max_size=20).launch(share=True)


# import llama_cpp
# from llama_cpp import Llama
# # import llama_cpp.llama_tokenizer
# import gradio as gr

# from huggingface_hub import hf_hub_download

# model_name = "large-traversaal/Alif-1.0-8B-Instruct"
# model_file = "model-Q8_0.gguf"
# model_path_file = hf_hub_download(model_name,
#                              filename=model_file,)


# llama = Llama(
#     model_path=model_path_file,
#     n_gpu_layers=40,  # Adjust based on VRAM
#     n_threads=8,  # Match CPU cores
#     n_batch=512,  # Optimize for better VRAM usage
#     n_ctx=4096,  # Context window size
#     verbose=True  # Enable debug logging
# )

# chat_prompt = """You are an Urdu Chatbot. Write an appropriate response for the given instruction:{inp} Response:"""

# # Function to generate text with streaming output
# def chat_with_ai(prompt):
#     query = chat_prompt.format(inp=prompt)
    
#     #response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)  # Enable streaming
#     response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming

#     text = ""
#     for chunk in response:
#         content = chunk["choices"][0]["text"]
#         if content:
#             text += content
#             yield text


# # Gradio UI setup
# demo = gr.Interface(
#     fn=chat_with_ai,  # Streaming function
#     inputs="text",  # User input
#     outputs="text",  # Model response
#     title="Streaming Alif-1.0-8B-Instruct Chatbot 🚀",
#     description="Enter a prompt and get a streamed response."
# )

# # Launch the Gradio app
# demo.launch(share=True)