Shawn732 committed on
Commit
7e9c0cb
·
1 Parent(s): 8aa15e7
Files changed (2)
  1. Dockerfile.backup +61 -0
  2. app.py +91 -0
Dockerfile.backup ADDED
@@ -0,0 +1,61 @@
+ # Use an NVIDIA CUDA base image
+ ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
+ FROM nvidia/cuda:${CUDA_IMAGE}
+
+
+ ENV HOST 0.0.0.0
+
+ # Set the working directory in the container to /app
+ #WORKDIR /app
+
+ RUN mkdir -p /app/cache && chmod -R 777 /app/cache
+
+ ENV HF_HOME=/app/cache
+
+ # Install Python and pip
+ RUN apt-get update && apt-get upgrade -y \
+     && apt-get install -y git build-essential \
+     python3 python3-pip gcc wget \
+     ocl-icd-opencl-dev opencl-headers clinfo \
+     libclblast-dev libopenblas-dev \
+     && mkdir -p /etc/OpenCL/vendors && echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd
+
+ ENV CUDA_DOCKER_ARCH=all
+ ENV LLAMA_CUBLAS=1
+
+ # Copy the current directory contents into the container at /app
+ COPY . /app
+
+ # Install required packages from requirements.txt
+ COPY ./requirements.txt /app/requirements.txt
+ RUN pip3 install --no-cache-dir -r /app/requirements.txt
+
+ # Expose the ports for FastAPI and Streamlit
+ EXPOSE 8000
+ EXPOSE 8501
+
+ # Copy and give execute permissions to the start script
+ COPY start_server.sh /app/start_server.sh
+ RUN chmod +x /app/start_server.sh
+
+ RUN useradd -m -u 1000 user
+ # Switch to the "user" user
+ USER user
+ WORKDIR /home/user/app
+ # Set home to the user's home directory
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH \
+     PYTHONPATH=$HOME/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+
+ # Copy the current directory contents into the container at $HOME/app, setting the owner to the user
+ COPY --chown=user . /home/user/app
+
+ # Run the start script
+ #CMD ["/app/start_server.sh"]
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
app.py ADDED
@@ -0,0 +1,91 @@
+ import os
+ #import gradio as gr
+ import copy
+ import time
+ import llama_cpp
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ from fastapi import FastAPI, Request
+ from fastapi.middleware.cors import CORSMiddleware
+ import nest_asyncio
+ import uvicorn
+
+ app = FastAPI()
+ MODEL_NAME = "TheBloke/CodeLlama-7B-GGUF"
+ PDF_PATH = "/opt/docs"
+ CLASSIFIER_MODEL_NAME = "roberta-large-mnli"
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=['*'],
+     allow_credentials=True,
+     allow_methods=['*'],
+     allow_headers=['*'],
+ )
+
+ llm = Llama(
+     model_path=hf_hub_download(
+         repo_id=os.environ.get("REPO_ID", MODEL_NAME),
+         filename=os.environ.get("MODEL_FILE", "codellama-7b.Q8_0.gguf"),
+     ),
+     n_ctx=2048,
+     n_gpu_layers=50,  # change n_gpu_layers if you have more or less VRAM
+ )
+
+ history = []
+
+ system_message = """
+ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
+ """
+
+ @app.post("/predict")
+ def generate_text(message, history):
+     temp = ""
+     input_prompt = f"[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n "
+     for interaction in history:
+         input_prompt = input_prompt + str(interaction[0]) + " [/INST] " + str(interaction[1]) + " </s><s> [INST] "
+
+     input_prompt = input_prompt + str(message) + " [/INST] "
+
+     output = llm(
+         input_prompt,
+         temperature=0.15,
+         top_p=0.1,
+         top_k=40,
+         repeat_penalty=1.1,
+         max_tokens=1024,
+         stop=[
+             "<|prompter|>",
+             "<|endoftext|>",
+             "<|endoftext|> \n",
+             "ASSISTANT:",
+             "USER:",
+             "SYSTEM:",
+         ],
+         stream=True,
+     )
+     for out in output:
+         stream = copy.deepcopy(out)
+         temp += stream["choices"][0]["text"]
+         yield temp
+
+     history = ["init", input_prompt]
+
+
+ # demo = gr.ChatInterface(
+ #     generate_text,
+ #     title="llama-cpp-python on GPU",
+ #     description="Running LLM with https://github.com/abetlen/llama-cpp-python",
+ #     examples=["tell me everything about llamas"],
+ #     cache_examples=True,
+ #     retry_btn=None,
+ #     undo_btn="Delete Previous",
+ #     clear_btn="Clear",
+ # )
+ # demo.queue(concurrency_count=1, max_size=5)
+ # demo.launch()
+
+ nest_asyncio.apply()
+ uvicorn.run(app, port=8000)