alishafique committed · Commit 62c7530 · verified · 1 Parent(s): 8315568

Update app.py

Files changed (1):
  1. app.py (+85 -68)
app.py CHANGED
@@ -1,34 +1,19 @@
-# import torch
-# print(torch.cuda.is_available())  # Should return True
-# print(torch.cuda.get_device_name(0))  # Should return 'Tesla T4'
-# print(torch.cuda.get_device_capability(0)
-
-
-
-import llama_cpp
-from llama_cpp import Llama
-# import llama_cpp.llama_tokenizer
+import os
+import json
+import subprocess
 import gradio as gr
-
+from threading import Thread
 from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+from datetime import datetime
 
-model_name = "large-traversaal/Alif-1.0-8B-Instruct"
-model_file = "model-Q8_0.gguf"
-model_path_file = hf_hub_download(model_name,
-                                  filename=model_file,)
-
-# llama = llama_cpp.Llama.from_pretrained(
-#     repo_id="large-traversaal/Alif-1.0-8B-Instruct",
-#     filename="*model-Q6_K.gguf",
-#     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
-#         "large-traversaal/Alif-1.0-8B-Instruct"
-#     ),
-#     verbose=False,
-# )
-
-
-# llama = Llama(model_path="./model-Q8_0.gguf", verbose=False)
-
+# Load model from Hugging Face Hub
+MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
+MODEL_FILE = "model-Q8_0.gguf"
+
+model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
+
+# Initialize Llama model
 llama = Llama(
     model_path=model_path_file,
     n_gpu_layers=40,  # Adjust based on VRAM
@@ -38,38 +23,13 @@ llama = Llama(
     verbose=True  # Enable debug logging
 )
 
-chat_prompt = """You are Urdu Chatbot. Write approriate response for given instruction:{inp} Response:"""
-
-# prompt = "قابل تجدید توانائی کیا ہے؟"
-prompt = "شہر کراچی کے بارے میں بتاؤ"
-
-# prompt = chat_prompt.format(inp=prompt)
 
-# response = llama(prompt, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
-
-
-# # prompt = "قابل تجدید توانائی کیا ہے؟"
-# stop_tokens = ["\n\n", "<|end_of_text|>"]  # Stops after natural pauses or end-of-text token
-
-
-# Function to generate text with streaming output
-def chat_with_ai(prompt):
-    query = chat_prompt.format(inp=prompt)
+# Function to generate responses
+def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+    # chat_prompt = f"You are an Urdu Chatbot. Write an appropriate response for the given instruction: {message} Response:"
+    chat_prompt = f"{system_prompt}\n ### Instruction: {message}\n ### Response:"
+    response = llama(chat_prompt, temperature=temperature, max_tokens=max_new_tokens, top_k=top_k, repeat_penalty=repetition_penalty, top_p=top_p, stop=["Q:", "\n"], echo=False, stream=True)
 
-    # response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)  # Enable streaming
-    response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
-
-    # response = llama.create_chat_completion(
-    #     messages = [
-    #         {"role": "system", "content": "You are a Urdu Chatbot."},
-    #         {
-    #             "role": "user",
-    #             "content": prompt
-    #         }
-    #     ],
-    #     stream=True
-    # )
-
     text = ""
     for chunk in response:
        content = chunk["choices"][0]["text"]
@@ -77,15 +37,72 @@ def chat_with_ai(prompt):
        text += content
        yield text
 
-
-# Gradio UI setup
-demo = gr.Interface(
-    fn=chat_with_ai,  # Streaming function
-    inputs="text",  # User input
-    outputs="text",  # Model response
-    title="💬 Streaming AI Chatbot",
-    description="Enter a prompt and get a streamed response from Llama.cpp (GGUF)."
-)
-
-# Launch the Gradio app
-demo.launch(share=True)
+# def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+#     """Generates a streaming response from the Llama model."""
+#     messages = [
+#         {"role": "system", "content": "You are an Urdu Chatbot. Write an appropriate response for the given instruction."},
+#     ]
+
+#     # Add history and the current message
+#     # for user, bot in history:
+#     #     messages.append({"role": "user", "content": user})
+#     #     messages.append({"role": "assistant", "content": bot})
+
+#     messages.append({"role": "user", "content": message})
+
+#     response = llama.create_chat_completion(
+#         messages=messages,
+#         stream=True,
+#     )
+
+#     partial_message = ""
+#     for part in response:
+#         content = part["choices"][0]["delta"].get("content", "")
+#         partial_message += content
+#         yield partial_message
+
+
+# JavaScript function for `on_load`
+on_load = """
+async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); }
+"""
+
+placeholder = """
+<center><h1>10 Questions</h1><br>Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
+</center>
+"""
+
+# Create custom chat UI using `gr.Blocks`
+with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
+    with gr.Column(scale=1, elem_id="center-content"):
+        gr.Markdown(
+            """
+            <div style="text-align: center;">
+            <h1>Alif 1.0 Urdu & English Chatbot 🚀</h1>
+            <p>Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It utilizes human-refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in English and Urdu.</p>
+            </div>
+            """,
+        )
+
+    chat = gr.ChatInterface(
+        generate_response,
+        # chatbot=gr.Chatbot(placeholder=placeholder),
+        # title="🚀" + " " + "Alif-1.0 Chatbot",
+        # description="Urdu AI Chatbot powered by Llama.cpp",
+        examples=[
+            ["شہر کراچی کے بارے میں بتاؤ"],
+            ["قابل تجدید توانائی کیا ہے؟"],
+            ["پاکستان کے بارے میں بتائیں"]
+        ],
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+        additional_inputs=[
+            gr.Textbox(value="You are an Urdu Chatbot. Write an appropriate response for the given instruction in Urdu.", label="System prompt", render=False),
+            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
+            gr.Slider(128, 4096, 512, label="Max new tokens", render=False),
+            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
+            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
+            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
+        ],
+    )
+
+demo.queue(max_size=10).launch(share=True)
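
The new generate_response path can be smoke-tested outside the Gradio UI with a minimal sketch along these lines. This is not part of the commit: it assumes llama-cpp-python and huggingface_hub are installed, reuses the slider defaults from app.py, and runs CPU-only (n_gpu_layers=0 is an assumption for portability; app.py itself offloads 40 layers to the GPU).

# Minimal smoke test for this commit's prompt format and streaming loop (sketch, not in the repo).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Same model repo and GGUF file as app.py.
model_path = hf_hub_download("large-traversaal/Alif-1.0-8B-Instruct",
                             filename="model-Q8_0.gguf")
llama = Llama(model_path=model_path, n_gpu_layers=0, verbose=False)  # assumption: CPU-only check

system_prompt = ("You are an Urdu Chatbot. Write an appropriate response "
                 "for the given instruction in Urdu.")
message = "شہر کراچی کے بارے میں بتاؤ"  # one of the UI example prompts
prompt = f"{system_prompt}\n ### Instruction: {message}\n ### Response:"

# Same call and sampling defaults as generate_response(); stream tokens as they arrive.
for chunk in llama(prompt, temperature=0.8, max_tokens=512, top_k=40,
                   repeat_penalty=1.1, top_p=0.95,
                   stop=["Q:", "\n"], echo=False, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()

Note that the stop list carried over from the old code includes "\n", so generation ends at the first newline and responses stay single-paragraph.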