# import torch
# print(torch.cuda.is_available())           # Should return True
# print(torch.cuda.get_device_name(0))       # Should return 'Tesla T4'
# print(torch.cuda.get_device_capability(0))
import llama_cpp
from llama_cpp import Llama
# import llama_cpp.llama_tokenizer
import gradio as gr
from huggingface_hub import hf_hub_download
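
# Optional sanity check (assumption: recent llama-cpp-python versions expose this low-level
# binding): returns False if the installed wheel was built without GPU/CUDA offload support.
# print(llama_cpp.llama_supports_gpu_offload())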
model_name = "large-traversaal/Alif-1.0-8B-Instruct"
model_file = "model-Q8_0.gguf"

# Download the GGUF file from the Hugging Face Hub (cached locally after the first run)
model_path_file = hf_hub_download(model_name, filename=model_file)
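
# Alternative download target (illustrative; assumes a huggingface_hub version that supports
# the local_dir argument): place the GGUF file next to the script instead of in the HF cache.
# model_path_file = hf_hub_download(model_name, filename=model_file, local_dir=".")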
# llama = llama_cpp.Llama.from_pretrained(
#     repo_id="large-traversaal/Alif-1.0-8B-Instruct",
#     filename="*model-Q6_K.gguf",
#     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
#         "large-traversaal/Alif-1.0-8B-Instruct"
#     ),
#     verbose=False,
# )
# llama = Llama(model_path="./model-Q8_0.gguf", verbose=False)
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,   # Number of layers to offload to the GPU; adjust based on available VRAM
    n_threads=8,       # Match the number of physical CPU cores
    n_batch=512,       # Prompt-processing batch size; larger values use more VRAM
    n_ctx=4096,        # Context window size in tokens
    verbose=True       # Enable debug logging
)
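
# One-off smoke test (hypothetical, not part of the app): run a short non-streaming completion
# to confirm the model loads and the offloaded layers show up in the verbose log above.
# print(llama("Hello", max_tokens=8, echo=False)["choices"][0]["text"])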
chat_prompt = """You are an Urdu Chatbot. Write an appropriate response for the given instruction: {inp} Response:"""

# prompt = "قابل تجدید توانائی کیا ہے؟"  # "What is renewable energy?"
prompt = "شہر کراچی کے بارے میں بتاؤ"  # "Tell me about the city of Karachi"

# prompt = chat_prompt.format(inp=prompt)
# response = llama(prompt, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
# stop_tokens = ["\n\n", "<|end_of_text|>"]  # Stop after natural pauses or the end-of-text token
# Function to generate text with streaming output
def chat_with_ai(prompt):
    query = chat_prompt.format(inp=prompt)
    # response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)  # Enable streaming
    response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
    # response = llama.create_chat_completion(
    #     messages=[
    #         {"role": "system", "content": "You are an Urdu Chatbot."},
    #         {"role": "user", "content": prompt},
    #     ],
    #     stream=True,
    # )
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
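
# Local test without the UI (illustrative, assuming the model above is loaded): chat_with_ai
# is a generator, so iterating it prints the progressively longer response as tokens stream in.
# for partial in chat_with_ai("شہر کراچی کے بارے میں بتاؤ"):  # "Tell me about the city of Karachi"
#     print(partial)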
# Gradio UI setup
demo = gr.Interface(
    fn=chat_with_ai,   # Streaming generator function
    inputs="text",     # User input
    outputs="text",    # Model response
    title="💬 Streaming AI Chatbot",
    description="Enter a prompt and get a streamed response from Llama.cpp (GGUF)."
)
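
# On older Gradio 3.x releases (assumption), generator-based streaming needs the request
# queue enabled explicitly before launch; Gradio 4+ enables it by default.
# demo.queue()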
# Launch the Gradio app
demo.launch(share=True)