# Optional sanity check: confirm a CUDA GPU is visible before loading the model.
# import torch
# print(torch.cuda.is_available())            # Should return True
# print(torch.cuda.get_device_name(0))        # Should return 'Tesla T4' (or your GPU)
# print(torch.cuda.get_device_capability(0))  # Compute capability, e.g. (7, 5)



from llama_cpp import Llama
import gradio as gr

from huggingface_hub import hf_hub_download

# Download the quantized GGUF weights from the Hugging Face Hub.
model_name = "large-traversaal/Alif-1.0-8B-Instruct"
model_file = "model-Q8_0.gguf"
model_path_file = hf_hub_download(model_name, filename=model_file)

# Alternative loading paths (not used here):
# 1) Download and load in one call, pairing the GGUF with the HF tokenizer:
# llama = Llama.from_pretrained(
#     repo_id="large-traversaal/Alif-1.0-8B-Instruct",
#     filename="*model-Q6_K.gguf",
#     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
#         "large-traversaal/Alif-1.0-8B-Instruct"
#     ),
#     verbose=False,
# )
# 2) Load from an already-downloaded local file:
# llama = Llama(model_path="./model-Q8_0.gguf", verbose=False)

llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Layers offloaded to the GPU; lower this if VRAM runs out
    n_threads=8,      # CPU threads; match your physical core count
    n_batch=512,      # Prompt-processing batch size (larger uses more VRAM)
    n_ctx=4096,       # Context window size in tokens
    verbose=True,     # Enable llama.cpp debug logging
)
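
# Optional smoke test (a minimal sketch, commented out; the prompt and token
# budget here are arbitrary choices): run one non-streaming completion to
# confirm the model loads and generates before wiring up the UI.
# test_out = llama("Hello", max_tokens=16, echo=False)
# print(test_out["choices"][0]["text"])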

chat_prompt = """You are an Urdu chatbot. Write an appropriate response for the given instruction: {inp} Response:"""

# prompt = "قابل تجدید توانائی کیا ہے؟"  # "What is renewable energy?"
prompt = "شہر کراچی کے بارے میں بتاؤ"  # "Tell me about the city of Karachi"

# Alternative stop sequences that end generation at a blank line or the
# end-of-text token instead of the first newline:
# stop_tokens = ["\n\n", "<|end_of_text|>"]


# Generator: formats the prompt into the chat template and streams the model's
# output incrementally.
def chat_with_ai(prompt):
    query = chat_prompt.format(inp=prompt)

    # Stream tokens as they arrive; generation stops at "Q:" or the first newline.
    response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)

    # Alternative: use the chat-completion API with role-based messages.
    # response = llama.create_chat_completion(
    #     messages=[
    #         {"role": "system", "content": "You are an Urdu chatbot."},
    #         {"role": "user", "content": prompt},
    #     ],
    #     stream=True,
    # )

    # Accumulate streamed chunks and yield the running text for live display.
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
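
# Optional console check (illustrative only, hence commented out): the streaming
# generator can be exercised without Gradio; each yield is the full response so far.
# final_text = ""
# for partial in chat_with_ai(prompt):
#     final_text = partial
# print(final_text)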


# Gradio UI setup
demo = gr.Interface(
    fn=chat_with_ai,  # Streaming function
    inputs="text",  # User input
    outputs="text",  # Model response
    title="💬 Streaming AI Chatbot",
    description="Enter a prompt and get a streamed response from Llama.cpp (GGUF)."
)

# Launch the Gradio app
demo.launch(share=True)
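
# Note: if streamed output does not appear incrementally on older Gradio 3.x
# releases, enabling the request queue may help (an assumption about your
# installed Gradio version; 4.x enables it by default):
# demo.queue().launch(share=True)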