import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Load model from Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"

model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
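# hf_hub_download stores the GGUF file in the local Hugging Face cache and
# returns the cached path, so repeated launches skip the download.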

# Initialize Llama model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Adjust based on VRAM
    n_threads=8,  # Match CPU cores
    n_batch=512,  # Optimize for better VRAM usage
    n_ctx=4096,  # Context window size
    verbose=True  # Enable debug logging
)
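# Note: n_gpu_layers only takes effect when llama-cpp-python is built with GPU
# support; on a CPU-only build the model runs entirely on the CPU.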

CHAT_TEMPLATE = "Alif Chat"
CONTEXT_LENGTH = 4096
COLOR = "blue"
EMOJI = "💬"
DESCRIPTION = "Urdu AI Chatbot powered by Llama.cpp"

# Generate a streaming response, wiring the UI sampling controls into llama.cpp
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # Fall back to the default Urdu instruction prompt when no system prompt is set
    if not system_prompt:
        system_prompt = "You are an Urdu Chatbot. Write an appropriate response for the given instruction:"
    chat_prompt = f"{system_prompt} {message} Response:"
    response = llama(
        chat_prompt, max_tokens=max_new_tokens, temperature=temperature,
        top_k=top_k, top_p=top_p, repeat_penalty=repetition_penalty,
        stop=["Q:", "\n"], echo=False, stream=True
    )

    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
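
# gr.ChatInterface passes the values of additional_inputs, in order, as extra
# positional arguments after (message, history), matching the signature above.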

# Create Gradio interface
with gr.Blocks() as demo:
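    # render=False defers rendering so gr.ChatInterface can place the component itself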
    chatbot = gr.Chatbot(label="Urdu Chatbot", likeable=True, render=False)
    chat = gr.ChatInterface(
        generate_response,
        chatbot=chatbot,
        title=EMOJI + " " + "Alif-1.0 Chatbot",
        description=DESCRIPTION,
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],
            ["قابل تجدید توانائی کیا ہے؟"],
            ["پاکستان کی تاریخ کے بارے میں بتائیں۔"]
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox("", label="System prompt", render=False),
            gr.Slider(0, 1, 0.6, label="Temperature", render=False),
            gr.Slider(128, CONTEXT_LENGTH, 1024, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
        theme=gr.themes.Soft(primary_hue=COLOR),
    )

demo.queue(max_size=20).launch(share=True)


# import llama_cpp
# from llama_cpp import Llama
# # import llama_cpp.llama_tokenizer
# import gradio as gr

# from huggingface_hub import hf_hub_download

# model_name = "large-traversaal/Alif-1.0-8B-Instruct"
# model_file = "model-Q8_0.gguf"
# model_path_file = hf_hub_download(model_name,
#                              filename=model_file,)


# llama = Llama(
#     model_path=model_path_file,
#     n_gpu_layers=40,  # Adjust based on VRAM
#     n_threads=8,  # Match CPU cores
#     n_batch=512,  # Optimize for better VRAM usage
#     n_ctx=4096,  # Context window size
#     verbose=True  # Enable debug logging
# )

# chat_prompt = """You are an Urdu Chatbot. Write an appropriate response for the given instruction:{inp} Response:"""

# # Function to generate text with streaming output
# def chat_with_ai(prompt):
#     query = chat_prompt.format(inp=prompt)
    
#     #response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)  # Enable streaming
#     response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming

#     text = ""
#     for chunk in response:
#         content = chunk["choices"][0]["text"]
#         if content:
#             text += content
#             yield text


# # Gradio UI setup
# demo = gr.Interface(
#     fn=chat_with_ai,  # Streaming function
#     inputs="text",  # User input
#     outputs="text",  # Model response
#     title="Streaming Alif-1.0-8B-Instruct Chatbot 🚀",
#     description="Enter a prompt and get a streamed response."
# )

# # Launch the Gradio app
# demo.launch(share=True)