alishafique committed
Commit c4ef80f · verified · 1 Parent(s): 6ca575f

Upload 2 files

Files changed (2):
  1. Dockerfile   +48 -0
  2. app (9).py   +91 -0
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+ FROM nvidia/cuda:${CUDA_IMAGE}
+
+ # We need to set the host to 0.0.0.0 to allow outside access
+ ENV HOST=0.0.0.0
+
+ RUN apt-get update && apt-get upgrade -y \
+     && apt-get install -y git build-essential \
+     python3 python3-pip gcc wget \
+     ocl-icd-opencl-dev opencl-headers clinfo \
+     libclblast-dev libopenblas-dev \
+     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+ COPY . .
+
+ # Build-related env (uncomment when building llama-cpp-python against CUDA)
+ # ENV CUDA_DOCKER_ARCH=all
+ # ENV LLAMA_CUBLAS=1
+
+ # Install dependencies
+ RUN python3 -m pip install --upgrade pip pytest cmake \
+     scikit-build setuptools fastapi uvicorn sse-starlette \
+     pydantic-settings starlette-context gradio huggingface_hub hf_transfer
+
+ # Install llama-cpp-python (uncomment the CMAKE_ARGS line to build with CUDA)
+ # RUN CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=75" FORCE_CMAKE=1 python3 -m pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose
+ RUN python3 -m pip install llama-cpp-python
+
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python3", "app.py"]
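Note that the active pip install above pulls the default llama-cpp-python wheel, since the CUDA build line is commented out, so GPU offload may not actually be compiled in. A minimal sketch for checking this inside the running container, assuming the installed llama-cpp-python version exposes llama_supports_gpu_offload (this helper is illustrative, not part of the commit):

# check_gpu_offload.py - hypothetical helper, not part of this commit
import llama_cpp

print("llama-cpp-python version:", llama_cpp.__version__)

# llama_supports_gpu_offload() returns True only when the wheel was built
# with a GPU backend (e.g. CUDA); with the plain CPU wheel, n_gpu_layers
# in app.py has no effect.
try:
    print("GPU offload available:", llama_cpp.llama_supports_gpu_offload())
except AttributeError:
    print("llama_supports_gpu_offload not exposed by this llama-cpp-python version")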
app (9).py ADDED
@@ -0,0 +1,91 @@
+ # import torch
+ # print(torch.cuda.is_available())         # Should return True
+ # print(torch.cuda.get_device_name(0))     # Should return 'Tesla T4'
+ # print(torch.cuda.get_device_capability(0))
+
+ import llama_cpp
+ from llama_cpp import Llama
+ # import llama_cpp.llama_tokenizer
+ import gradio as gr
+
+ from huggingface_hub import hf_hub_download
+
+ # Download the GGUF weights from the Hugging Face Hub
+ model_name = "large-traversaal/Alif-1.0-8B-Instruct"
+ model_file = "model-Q8_0.gguf"
+ model_path_file = hf_hub_download(model_name, filename=model_file)
+
+ # llama = llama_cpp.Llama.from_pretrained(
+ #     repo_id="large-traversaal/Alif-1.0-8B-Instruct",
+ #     filename="*model-Q6_K.gguf",
+ #     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+ #         "large-traversaal/Alif-1.0-8B-Instruct"
+ #     ),
+ #     verbose=False,
+ # )
+
+ # llama = Llama(model_path="./model-Q8_0.gguf", verbose=False)
+
+ llama = Llama(
+     model_path=model_path_file,
+     n_gpu_layers=40,   # Adjust based on VRAM
+     n_threads=8,       # Match CPU cores
+     n_batch=512,       # Optimize for better VRAM usage
+     n_ctx=4096,        # Context window size
+     verbose=True       # Enable debug logging
+ )
+
+ chat_prompt = """You are Urdu Chatbot. Write appropriate response for given instruction:{inp} Response:"""
+
+ # prompt = "قابل تجدید توانائی کیا ہے؟"  # "What is renewable energy?"
+ prompt = "شہر کراچی کے بارے میں بتاؤ"  # "Tell me about the city of Karachi"
+
+ # prompt = chat_prompt.format(inp=prompt)
+ # response = llama(prompt, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
+
+ # stop_tokens = ["\n\n", "<|end_of_text|>"]  # Stop after natural pauses or the end-of-text token
+
+ # Function to generate text with streaming output
+ def chat_with_ai(prompt):
+     query = chat_prompt.format(inp=prompt)
+
+     # response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)
+     response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
+
+     # response = llama.create_chat_completion(
+     #     messages=[
+     #         {"role": "system", "content": "You are an Urdu Chatbot."},
+     #         {"role": "user", "content": prompt},
+     #     ],
+     #     stream=True,
+     # )
+
+     # Accumulate streamed tokens and yield the growing text so Gradio updates live
+     text = ""
+     for chunk in response:
+         content = chunk["choices"][0]["text"]
+         if content:
+             text += content
+             yield text
+
+
+ # Gradio UI setup
+ demo = gr.Interface(
+     fn=chat_with_ai,   # Streaming function
+     inputs="text",     # User input
+     outputs="text",    # Model response
+     title="💬 Streaming AI Chatbot",
+     description="Enter a prompt and get a streamed response from Llama.cpp (GGUF)."
+ )
+
+ # Launch the Gradio app
+ demo.launch(share=True)
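Because chat_with_ai is a generator, the Gradio endpoint streams partial text. A minimal sketch for exercising it outside the browser with gradio_client; the Space id and the default "/predict" api_name are assumptions for illustration, not part of this commit:

# query_space.py - hypothetical client, not part of this commit
from gradio_client import Client

# The Space id below is a placeholder; replace with the actual owner/space-name.
client = Client("alishafique/alif-urdu-chatbot")

# submit() returns a job whose intermediate outputs can be iterated,
# mirroring the generator-based streaming of chat_with_ai.
job = client.submit("شہر کراچی کے بارے میں بتاؤ", api_name="/predict")
for partial in job:
    print(partial)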