Christoph Holthaus committed
Commit 2ae746c · Parent(s): ab2d9f0

took a lot from https://huggingface.co/spaces/imperialwool/llama-cpp-api/blob/main

Files changed:
- .gitignore +162 -0
- Dockerfile +26 -65
- gradio_app.py +86 -0
- requirements.txt +2 -0
.gitignore
ADDED
@@ -0,0 +1,162 @@
+ # BASIC PYTHON .GITIGNORE + some for testing
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
Dockerfile
CHANGED
@@ -1,73 +1,34 @@
- #

- #

-
-

-
-
-
-
-
-     build-essential \
-     dpkg-dev \
-     wget \
-     openssh-server \
-     nano \
-     && rm -rf /var/lib/apt/lists/*

- # Setting up locales

-
-

- #

-
- RUN

- #
-
- RUN groupadd --gid 1020 llama-cpp-group
- RUN useradd -rm -d /home/llama-cpp-user -s /bin/bash -G users,sudo,llama-cpp-group -u 1000 llama-cpp-user
-
- # Update user password
- RUN echo 'llama-cpp-user:admin' | chpasswd
-
- # Updating conda to the latest version
- RUN conda update conda -y
-
- # Create virtalenv
- RUN conda create -n llamacpp -y python=3.10.6
-
- # Adding ownership of /opt/conda to $user
- RUN chown -R llama-cpp-user:users /opt/conda
-
- # conda init bash for $user
- RUN su - llama-cpp-user -c "conda init bash"
-
- # Download latest github/llama-cpp in llama.cpp directory and compile it
- RUN su - llama-cpp-user -c "git clone https://github.com/ggerganov/llama.cpp.git ~/llama.cpp \
-     && cd ~/llama.cpp \
-     && make "
-
- # Install Requirements for python virtualenv
- RUN su - llama-cpp-user -c "cd ~/llama.cpp \
-     && conda activate llamacpp \
-     && python3 -m pip install -r requirements.txt "
-
- # Download model
- RUN su - llama-cpp-user -c "https://github.com/facebookresearch/llama.git ~/llama \
-     && cd ~/llama \
-     && ./download.sh "
-
- # COPY entrypoint.sh /usr/bin/entrypoint
- # RUN chmod 755 /usr/bin/entrypoint
- # ENTRYPOINT ["/usr/bin/entrypoint"]
-
- # Preparing for login
- ENV HOME /home/llama-cpp-user
- WORKDIR ${HOME}/llama.cpp
- USER llama-cpp-user
- CMD ["/bin/bash"]
+ # Loading the base image. I'm using Debian here; you can use whatever you want.
+ FROM python:3.11.5-slim-bookworm

+ # Just to be sure, run everything as root.
+ USER root

+ # Installing the compiler toolchain and the main library.
+ RUN apt update && apt install gcc cmake build-essential -y
+ RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python

+ # Copying files into the image and making /app the working dir.
+ RUN mkdir app
+ COPY . /app
+ RUN chmod -R 777 /app
+ WORKDIR /app


+ # Downloading the model into the image.
+ ADD https://huggingface.co/TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF/resolve/main/dolphin-2.2.1-ashhlimarp-mistral-7b.Q4_K_M.gguf /app/model.bin
+ RUN chmod -R 777 /app/model.bin
+ # You can use other models! Or comment out these two lines and include your own model named "model.bin" in the Space/repo/Docker image.

+ # Fixing warnings from Transformers and Matplotlib
+ RUN mkdir -p /.cache/huggingface/hub -m 777
+ RUN mkdir -p /.config/matplotlib -m 777
+ RUN chmod -R 777 /.cache
+ RUN chmod -R 777 /.config

+ # Updating pip and installing everything from requirements.txt
+ RUN python3 -m pip install -U pip setuptools wheel
+ RUN pip install --upgrade -r /app/requirements.txt

+ # Now it's time to run the Gradio app!
+ CMD ["python", "gradio_app.py"]
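A minimal sketch (not part of this commit) for smoke-testing what the Dockerfile puts together, assuming llama-cpp-python was installed as above and the GGUF file was saved locally as ./model.bin:

# Hypothetical smoke test for the llama-cpp-python install and the downloaded GGUF model.
# Assumes the model file sits in the current directory as ./model.bin (the image uses /app/model.bin).
from llama_cpp import Llama

llm = Llama(model_path="./model.bin")  # load the GGUF model on CPU
out = llm("Q: What does llama.cpp do?\nA:", max_tokens=64, echo=False)
print(out["choices"][0]["text"])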
gradio_app.py
ADDED
@@ -0,0 +1,86 @@
+ # Importing libraries
+ from llama_cpp import Llama
+ from time import time
+ import gradio as gr
+ import psutil
+
+ # Initializing the model
+ print("! INITING LLAMA MODEL !")
+ llm = Llama(model_path="./model.bin")  # LLaMa model
+ llama_model_name = "TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF"
+ print("! INITING DONE !")
+
+ # Preparing things to work
+ title = "llama.cpp API"
+ desc = '''<h1>Hello, world!</h1>
+ This is a showcase of how to run your own server with a Llama 2 model.<br>
+ I'm using a 7B model here just as an example, and only CPU power.<br>
+ But you can use GPU power as well!<br><br>
+ <h1>How to use a GPU?</h1>
+ Change <code>`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"`</code> in the Dockerfile to <code>`CMAKE_ARGS="-DLLAMA_CUBLAS=on"`</code>. You can also try <code>`DLLAMA_CLBLAST`</code> or <code>`DLLAMA_METAL`</code>.<br><br>
+ <h1>How to test it on your own machine?</h1>
+ You can install Docker, build the image and run it. I made <code>`run-docker.sh`</code> for you. To stop the container, run <code>`docker ps`</code>, find the name of the container and run <code>`docker stop _dockerContainerName_`</code>.<br>
+ Or you can follow the steps in the Dockerfile once and try it on your machine directly, not in Docker.<br>
+ <br>''' + f"Memory used: {psutil.virtual_memory()[2]}<br>" + '''
+ Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a> and <a href="https://www.gradio.app/">Gradio</a>.<br><br>'''
+
+ # Loading prompt
+ prompt = ""
+ system_message = ""
+
+ def generate_answer(request: str, max_tokens: int = 256, custom_prompt: str = None):
+     t0 = time()
+     logs = f"Request: {request}\nMax tokens: {max_tokens}\nCustom prompt: {custom_prompt}\n"
+     try:
+         maxTokens = max_tokens if 16 <= max_tokens <= 256 else 64
+         userPrompt = prompt.replace("{prompt}", request)
+         userPrompt = userPrompt.replace(
+             "{system_message}",
+             custom_prompt if isinstance(custom_prompt, str) and len(custom_prompt.strip()) > 1 and custom_prompt.strip() not in ['', None, ' '] else system_message
+         )
+         logs += f"\nFinal prompt: {userPrompt}\n"
+     except:
+         return "Not enough data! Check that you passed all needed data.", logs
+
+     try:
+         # Temporary workaround: retry a few times, because the model occasionally returns an empty output (root cause not found yet)
+         counter = 1
+         while counter <= 3:
+             logs += f"Attempt {counter} to generate answer...\n"
+             output = llm(userPrompt, max_tokens=maxTokens, stop=["<|im_end|>"], echo=False)
+             text = output["choices"][0]["text"]
+             if len(text.strip()) > 1 and text.strip() not in ['', None, ' ']:
+                 break
+             counter += 1
+         logs += f"Final attempt: {counter}\n"
+         if len(text.strip()) <= 1 or text.strip() in ['', None, ' ']:
+             logs += f"Generated and aborted: {text}"
+             text = "Sorry, but something went wrong while generating the answer. Try again or fix the code. If you are the maintainer of this space, look into the logs."
+
+         logs += f"\nFinal: '''{text}'''"
+         logs += f"\n\nTime spent: {time()-t0}"
+         return text, logs
+     except Exception as e:
+         logs += str(e)
+         logs += f"\n\nTime spent: {time()-t0}"
+         return "Oops! Internal server error. Check the logs of the space/instance.", logs
+
+ print("! LOAD GRADIO INTERFACE !")
+ demo = gr.Interface(
+     fn=generate_answer,
+     inputs=[
+         gr.components.Textbox(label="Input"),
+         gr.components.Number(value=256),
+         gr.components.Textbox(label="Custom system prompt"),
+     ],
+     outputs=[
+         gr.components.Textbox(label="Output"),
+         gr.components.Textbox(label="Logs")
+     ],
+     title=title,
+     description=desc,
+     allow_flagging='never'
+ )
+ demo.queue()
+ print("! LAUNCHING GRADIO !")
+ demo.launch(server_name="0.0.0.0")
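Once the app is running, the endpoint can also be called programmatically. A minimal sketch (not part of this commit) using the gradio_client package; the host/port and the "/predict" endpoint name are assumptions based on Gradio's defaults for a single gr.Interface:

# Hypothetical client-side call to the running app (not part of this commit).
# Assumes the app is reachable at http://localhost:7860 and exposes Gradio's default /predict endpoint.
from gradio_client import Client

client = Client("http://localhost:7860")
answer, logs = client.predict(
    "Write a haiku about llamas.",   # Input textbox
    64,                              # max tokens (16-256, otherwise the app falls back to 64)
    "You are a helpful assistant.",  # custom system prompt
    api_name="/predict",
)
print(answer)
print(logs)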
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+ psutil
+ gradio