alishafique committed
Commit c4ef80f · verified · 1 Parent(s): 6ca575f

Upload 2 files

Files changed (2):
  1. Dockerfile   +48 -0
  2. app (9).py   +91 -0
Dockerfile ADDED
@@ -0,0 +1,48 @@
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+ FROM nvidia/cuda:${CUDA_IMAGE}
+
+ # We need to set the host to 0.0.0.0 to allow outside access
+ ENV HOST=0.0.0.0
+
+ RUN apt-get update && apt-get upgrade -y \
+     && apt-get install -y git build-essential \
+     python3 python3-pip gcc wget \
+     ocl-icd-opencl-dev opencl-headers clinfo \
+     libclblast-dev libopenblas-dev \
+     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+ COPY . .
+
+ # Build-related env (uncomment when building llama-cpp-python against CUDA)
+ # ENV CUDA_DOCKER_ARCH=all
+ # ENV LLAMA_CUBLAS=1
+
+ # Install dependencies
+ RUN python3 -m pip install --upgrade pip pytest cmake \
+     scikit-build setuptools fastapi uvicorn sse-starlette \
+     pydantic-settings starlette-context gradio huggingface_hub hf_transfer
+
+ # Install llama-cpp-python (uncomment the CMAKE_ARGS line to build with CUDA)
+ # RUN CMAKE_ARGS="-DGGML_CUDA=on -DCMAKE_CUDA_ARCHITECTURES=75" FORCE_CMAKE=1 python3 -m pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir --verbose
+ RUN python3 -m pip install llama-cpp-python
+
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ WORKDIR $HOME/app
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . $HOME/app
+
+ CMD ["python3", "app.py"]
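Note that the active pip install above pulls the default llama-cpp-python wheel, since the CUDA build line is commented out, so GPU offload may not actually be compiled in. A minimal sketch for checking this inside the running container, assuming the installed llama-cpp-python version exposes llama_supports_gpu_offload (this helper is illustrative, not part of the commit):

# check_gpu_offload.py - hypothetical helper, not part of this commit
import llama_cpp

print("llama-cpp-python version:", llama_cpp.__version__)

# llama_supports_gpu_offload() returns True only when the wheel was built
# with a GPU backend (e.g. CUDA); with the plain CPU wheel, n_gpu_layers
# in app.py has no effect.
try:
    print("GPU offload available:", llama_cpp.llama_supports_gpu_offload())
except AttributeError:
    print("llama_supports_gpu_offload not exposed by this llama-cpp-python version")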
app (9).py ADDED
@@ -0,0 +1,91 @@
+ # import torch
+ # print(torch.cuda.is_available())         # Should return True
+ # print(torch.cuda.get_device_name(0))     # Should return 'Tesla T4'
+ # print(torch.cuda.get_device_capability(0))
+
+ import llama_cpp
+ from llama_cpp import Llama
+ # import llama_cpp.llama_tokenizer
+ import gradio as gr
+
+ from huggingface_hub import hf_hub_download
+
+ # Download the GGUF weights from the Hugging Face Hub
+ model_name = "large-traversaal/Alif-1.0-8B-Instruct"
+ model_file = "model-Q8_0.gguf"
+ model_path_file = hf_hub_download(model_name, filename=model_file)
+
+ # llama = llama_cpp.Llama.from_pretrained(
+ #     repo_id="large-traversaal/Alif-1.0-8B-Instruct",
+ #     filename="*model-Q6_K.gguf",
+ #     tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
+ #         "large-traversaal/Alif-1.0-8B-Instruct"
+ #     ),
+ #     verbose=False,
+ # )
+
+ # llama = Llama(model_path="./model-Q8_0.gguf", verbose=False)
+
+ llama = Llama(
+     model_path=model_path_file,
+     n_gpu_layers=40,   # Adjust based on VRAM
+     n_threads=8,       # Match CPU cores
+     n_batch=512,       # Optimize for better VRAM usage
+     n_ctx=4096,        # Context window size
+     verbose=True       # Enable debug logging
+ )
+
+ chat_prompt = """You are Urdu Chatbot. Write appropriate response for given instruction:{inp} Response:"""
+
+ # prompt = "قابل تجدید توانائی کیا ہے؟"  # "What is renewable energy?"
+ prompt = "شہر کراچی کے بارے میں بتاؤ"  # "Tell me about the city of Karachi"
+
+ # prompt = chat_prompt.format(inp=prompt)
+ # response = llama(prompt, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
+
+ # stop_tokens = ["\n\n", "<|end_of_text|>"]  # Stop after natural pauses or the end-of-text token
+
+ # Function to generate text with streaming output
+ def chat_with_ai(prompt):
+     query = chat_prompt.format(inp=prompt)
+
+     # response = llama(prompt, max_tokens=1024, stop=stop_tokens, echo=False, stream=True)
+     response = llama(query, max_tokens=256, stop=["Q:", "\n"], echo=False, stream=True)  # Enable streaming
+
+     # response = llama.create_chat_completion(
+     #     messages=[
+     #         {"role": "system", "content": "You are an Urdu Chatbot."},
+     #         {"role": "user", "content": prompt},
+     #     ],
+     #     stream=True,
+     # )
+
+     # Accumulate streamed tokens and yield the growing text so Gradio updates live
+     text = ""
+     for chunk in response:
+         content = chunk["choices"][0]["text"]
+         if content:
+             text += content
+             yield text
+
+
+ # Gradio UI setup
+ demo = gr.Interface(
+     fn=chat_with_ai,   # Streaming function
+     inputs="text",     # User input
+     outputs="text",    # Model response
+     title="💬 Streaming AI Chatbot",
+     description="Enter a prompt and get a streamed response from Llama.cpp (GGUF)."
+ )
+
+ # Launch the Gradio app
+ demo.launch(share=True)
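Because chat_with_ai is a generator, the Gradio endpoint streams partial text. A minimal sketch for exercising it outside the browser with gradio_client; the Space id and the default "/predict" api_name are assumptions for illustration, not part of this commit:

# query_space.py - hypothetical client, not part of this commit
from gradio_client import Client

# The Space id below is a placeholder; replace with the actual owner/space-name.
client = Client("alishafique/alif-urdu-chatbot")

# submit() returns a job whose intermediate outputs can be iterated,
# mirroring the generator-based streaming of chat_with_ai.
job = client.submit("شہر کراچی کے بارے میں بتاؤ", api_name="/predict")
for partial in job:
    print(partial)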