alishafique committed
Commit 930a613 · verified · 1 parent: 5bbb4d1

Upload app (6).py

Files changed (1)
  1. app (6).py +91 -0
app (6).py ADDED
@@ -0,0 +1,91 @@
+ # import torch
+ # print(torch.cuda.is_available())           # Should return True
+ # print(torch.cuda.get_device_name(0))       # Should return 'Tesla T4'
+ # print(torch.cuda.get_device_capability(0)) # Should return (7, 5)
+
+
+
+ import llama_cpp
+ from llama_cpp import Llama
+ # import llama_cpp.llama_tokenizer
+ import gradio as gr
+
+ from huggingface_hub import hf_hub_download
+
+ # Download the GGUF model file from the Hugging Face Hub
+ model_name = "large-traversaal/Alif-1.0-8B-Instruct"
+ model_file = "model-Q8_0.gguf"
+ model_path_file = hf_hub_download(model_name, filename=model_file)
+
+ # llama = llama_cpp.Llama.from_pretrained(
+ #     repo_id="large-traversaal/Alif-1.0-8B-Instruct",
+ #     filename="*model-Q6_K.gguf",
+ #     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+ #         "large-traversaal/Alif-1.0-8B-Instruct"
+ #     ),
+ #     verbose=False,
+ # )
+
+
+ # llama = Llama(model_path="./model-Q8_0.gguf", verbose=False)
+
+ # Load the downloaded GGUF model with llama.cpp
+ llama = Llama(
+     model_path=model_path_file,
+     n_gpu_layers=40,  # Adjust based on available VRAM
+     n_threads=8,      # Match the number of CPU cores
+     n_batch=512,      # Larger batches use more VRAM but improve throughput
+     n_ctx=4096,       # Context window size
+     verbose=True,     # Enable debug logging
+ )
+
+ chat_prompt = """You are an Urdu chatbot. Write an appropriate response for the given instruction: {inp} Response:"""
+
+ # prompt = "قابل تجدید توانائی کیا ہے؟"  # "What is renewable energy?"
+ prompt = "شہر کراچی کے بارے میں بتاؤ"  # "Tell me about the city of Karachi"
+
+ # prompt = chat_prompt.format(inp=prompt)
+
+ # response = llama(prompt, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
+
+
+ # # prompt = "قابل تجدید توانائی کیا ہے؟"
+ # stop_tokens = ["\n\n", "<|end_of_text|>"]  # Stop after natural pauses or the end-of-text token
+
+
+ # Function to generate text with streaming output
+ def chat_with_ai(prompt):
+     query = chat_prompt.format(inp=prompt)
+
+     # response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)
+     response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
+
+     # response = llama.create_chat_completion(
+     #     messages=[
+     #         {"role": "system", "content": "You are an Urdu chatbot."},
+     #         {"role": "user", "content": prompt},
+     #     ],
+     #     stream=True,
+     # )
+
+     text = ""
+     for chunk in response:
+         content = chunk["choices"][0]["text"]
+         if content:
+             text += content
+             yield text
+
+
+ # Gradio UI setup
+ demo = gr.Interface(
+     fn=chat_with_ai,  # Streaming generator function
+     inputs="text",    # User input
+     outputs="text",   # Model response
+     title="💬 Streaming AI Chatbot",
+     description="Enter a prompt and get a streamed response from Llama.cpp (GGUF).",
+ )
+
+ # Launch the Gradio app
+ demo.launch(share=True)