Spaces:

traversaal-internal
/

Alif-1.0-8B-Instruct

Sleeping

App Files Files Community

alishafique committed on Feb 21

Commit

930a613

verified ·

1 Parent(s): 5bbb4d1

Upload app (6).py

Browse files

Files changed (1) hide show

app (6).py +91 -0

app (6).py ADDED Viewed

	@@ -0,0 +1,91 @@

+# import torch
+# print(torch.cuda.is_available())  # Should return True
+# print(torch.cuda.get_device_name(0))  # Should return 'Tesla T4'
+# print(torch.cuda.get_device_capability(0))  # Should return (7, 5)
+import llama_cpp
+from llama_cpp import Llama
+# import llama_cpp.llama_tokenizer
+import gradio as gr
+from huggingface_hub import hf_hub_download
+model_name = "large-traversaal/Alif-1.0-8B-Instruct"
+model_file = "model-Q8_0.gguf"
+model_path_file = hf_hub_download(model_name,
+                             filename=model_file,)
+# llama = llama_cpp.Llama.from_pretrained(
+#     repo_id="large-traversaal/Alif-1.0-8B-Instruct",
+#     filename="*model-Q6_K.gguf",
+#     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+#         "large-traversaal/Alif-1.0-8B-Instruct"
+#     ),
+#     verbose=False,
+# )
+# llama = Llama(model_path="./model-Q8_0.gguf", verbose=False)
+llama = Llama(
+    model_path=model_path_file,
+    n_gpu_layers=40,  # Adjust based on VRAM
+    n_threads=8,  # Match CPU cores
+    n_batch=512,  # Optimize for better VRAM usage
+    n_ctx=4096,  # Context window size
+    verbose=True  # Enable debug logging
+)
+chat_prompt = """You are Urdu Chatbot. Write approriate response for given instruction:{inp} Response:"""
+# prompt = "قابل تجدید توانائی کیا ہے؟"
+prompt = "شہر کراچی کے بارے میں بتاؤ"
+# prompt = chat_prompt.format(inp=prompt)
+# response = llama(prompt, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
+# # prompt = "قابل تجدید توانائی کیا ہے؟"
+# stop_tokens = ["\n\n", "<|end_of_text|>"]  # Stops after natural pauses or end-of-text token
+# Function to generate text with streaming output
+def chat_with_ai(prompt):
+    query = chat_prompt.format(inp=prompt)
+    #response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)  # Enable streaming
+    response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
+    # response = llama.create_chat_completion(
+    #     messages = [
+    #         {"role": "system", "content": "You are a Urdu Chatbot."},
+    #         {
+    #             "role": "user",
+    #             "content": prompt
+    #         }
+    #     ],
+    #     stream=True
+    # )
+    text = ""
+    for chunk in response:
+        content = chunk["choices"][0]["text"]
+        if content:
+            text += content
+            yield text
+# Gradio UI setup
+demo = gr.Interface(
+    fn=chat_with_ai,  # Streaming function
+    inputs="text",  # User input
+    outputs="text",  # Model response
+    title="💬 Streaming AI Chatbot",
+    description="Enter a prompt and get a streamed response from Llama.cpp (GGUF)."
+)
+# Launch the Gradio app
+demo.launch(share=True)