Christoph Holthaus committed
Commit 2ae746c · Parent(s): ab2d9f0

took a lot from https://huggingface.co/spaces/imperialwool/llama-cpp-api/blob/main

Files changed:
- .gitignore +162 -0
- Dockerfile +26 -65
- gradio_app.py +86 -0
- requirements.txt +2 -0
.gitignore
ADDED
@@ -0,0 +1,162 @@
+ # BASIC PYTHON .GITIGNORE + some for testing
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
Dockerfile
CHANGED
@@ -1,73 +1,34 @@
- #

- #

-
-

-
-
-
-
-
-     build-essential \
-     dpkg-dev \
-     wget \
-     openssh-server \
-     nano \
-     && rm -rf /var/lib/apt/lists/*

- # Setting up locales

-
-

- #

-
- RUN

- #
-
- RUN groupadd --gid 1020 llama-cpp-group
- RUN useradd -rm -d /home/llama-cpp-user -s /bin/bash -G users,sudo,llama-cpp-group -u 1000 llama-cpp-user
-
- # Update user password
- RUN echo 'llama-cpp-user:admin' | chpasswd
-
- # Updating conda to the latest version
- RUN conda update conda -y
-
- # Create virtalenv
- RUN conda create -n llamacpp -y python=3.10.6
-
- # Adding ownership of /opt/conda to $user
- RUN chown -R llama-cpp-user:users /opt/conda
-
- # conda init bash for $user
- RUN su - llama-cpp-user -c "conda init bash"
-
- # Download latest github/llama-cpp in llama.cpp directory and compile it
- RUN su - llama-cpp-user -c "git clone https://github.com/ggerganov/llama.cpp.git ~/llama.cpp \
-     && cd ~/llama.cpp \
-     && make "
-
- # Install Requirements for python virtualenv
- RUN su - llama-cpp-user -c "cd ~/llama.cpp \
-     && conda activate llamacpp \
-     && python3 -m pip install -r requirements.txt "
-
- # Download model
- RUN su - llama-cpp-user -c "https://github.com/facebookresearch/llama.git ~/llama \
-     && cd ~/llama \
-     && ./download.sh "
-
- # COPY entrypoint.sh /usr/bin/entrypoint
- # RUN chmod 755 /usr/bin/entrypoint
- # ENTRYPOINT ["/usr/bin/entrypoint"]
-
- # Preparing for login
- ENV HOME /home/llama-cpp-user
- WORKDIR ${HOME}/llama.cpp
- USER llama-cpp-user
- CMD ["/bin/bash"]
+ # Loading the base image. I'm using Debian here; you can use whatever you want.
+ FROM python:3.11.5-slim-bookworm

+ # Just to be sure, run everything as root.
+ USER root

+ # Installing the compiler toolchain and the main library.
+ RUN apt update && apt install gcc cmake build-essential -y
+ RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python

+ # Copying files into the image and making /app the working dir.
+ RUN mkdir app
+ COPY . /app
+ RUN chmod -R 777 /app
+ WORKDIR /app


+ # Downloading the model into the image.
+ ADD https://huggingface.co/TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF/resolve/main/dolphin-2.2.1-ashhlimarp-mistral-7b.Q4_K_M.gguf /app/model.bin
+ RUN chmod -R 777 /app/model.bin
+ # You can use other models! Or comment out these two lines and include your own model named "model.bin" in the Space/repo/Docker image.

+ # Fixing warnings from Transformers and Matplotlib
+ RUN mkdir -p /.cache/huggingface/hub -m 777
+ RUN mkdir -p /.config/matplotlib -m 777
+ RUN chmod -R 777 /.cache
+ RUN chmod -R 777 /.config

+ # Updating pip and installing everything from requirements.txt
+ RUN python3 -m pip install -U pip setuptools wheel
+ RUN pip install --upgrade -r /app/requirements.txt

+ # Now it's time to run the Gradio app!
+ CMD ["python", "gradio_app.py"]
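A minimal sketch (not part of this commit) for smoke-testing what the Dockerfile puts together, assuming llama-cpp-python was installed as above and the GGUF file was saved locally as ./model.bin:

# Hypothetical smoke test for the llama-cpp-python install and the downloaded GGUF model.
# Assumes the model file sits in the current directory as ./model.bin (the image uses /app/model.bin).
from llama_cpp import Llama

llm = Llama(model_path="./model.bin")  # load the GGUF model on CPU
out = llm("Q: What does llama.cpp do?\nA:", max_tokens=64, echo=False)
print(out["choices"][0]["text"])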
gradio_app.py
ADDED
@@ -0,0 +1,86 @@
+ # Importing libraries
+ from llama_cpp import Llama
+ from time import time
+ import gradio as gr
+ import psutil
+
+ # Initializing the model
+ print("! INITING LLAMA MODEL !")
+ llm = Llama(model_path="./model.bin")  # LLaMa model
+ llama_model_name = "TheBloke/dolphin-2.2.1-AshhLimaRP-Mistral-7B-GGUF"
+ print("! INITING DONE !")
+
+ # Preparing things to work
+ title = "llama.cpp API"
+ desc = '''<h1>Hello, world!</h1>
+ This is a showcase of how to run your own server with a Llama 2 model.<br>
+ I'm using a 7B model here just as an example, and only CPU power.<br>
+ But you can use GPU power as well!<br><br>
+ <h1>How to use a GPU?</h1>
+ Change <code>`CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS"`</code> in the Dockerfile to <code>`CMAKE_ARGS="-DLLAMA_CUBLAS=on"`</code>. You can also try <code>`DLLAMA_CLBLAST`</code> or <code>`DLLAMA_METAL`</code>.<br><br>
+ <h1>How to test it on your own machine?</h1>
+ You can install Docker, build the image and run it. I made <code>`run-docker.sh`</code> for you. To stop the container, run <code>`docker ps`</code>, find the name of the container and run <code>`docker stop _dockerContainerName_`</code>.<br>
+ Or you can follow the steps in the Dockerfile once and try it on your machine directly, not in Docker.<br>
+ <br>''' + f"Memory used: {psutil.virtual_memory()[2]}<br>" + '''
+ Powered by <a href="https://github.com/abetlen/llama-cpp-python">llama-cpp-python</a> and <a href="https://www.gradio.app/">Gradio</a>.<br><br>'''
+
+ # Loading prompt
+ prompt = ""
+ system_message = ""
+
+ def generate_answer(request: str, max_tokens: int = 256, custom_prompt: str = None):
+     t0 = time()
+     logs = f"Request: {request}\nMax tokens: {max_tokens}\nCustom prompt: {custom_prompt}\n"
+     try:
+         maxTokens = max_tokens if 16 <= max_tokens <= 256 else 64
+         userPrompt = prompt.replace("{prompt}", request)
+         userPrompt = userPrompt.replace(
+             "{system_message}",
+             custom_prompt if isinstance(custom_prompt, str) and len(custom_prompt.strip()) > 1 and custom_prompt.strip() not in ['', None, ' '] else system_message
+         )
+         logs += f"\nFinal prompt: {userPrompt}\n"
+     except:
+         return "Not enough data! Check that you passed all needed data.", logs
+
+     try:
+         # Temporary workaround: retry a few times, because the model occasionally returns an empty output (root cause not found yet)
+         counter = 1
+         while counter <= 3:
+             logs += f"Attempt {counter} to generate answer...\n"
+             output = llm(userPrompt, max_tokens=maxTokens, stop=["<|im_end|>"], echo=False)
+             text = output["choices"][0]["text"]
+             if len(text.strip()) > 1 and text.strip() not in ['', None, ' ']:
+                 break
+             counter += 1
+         logs += f"Final attempt: {counter}\n"
+         if len(text.strip()) <= 1 or text.strip() in ['', None, ' ']:
+             logs += f"Generated and aborted: {text}"
+             text = "Sorry, but something went wrong while generating the answer. Try again or fix the code. If you are the maintainer of this space, look into the logs."
+
+         logs += f"\nFinal: '''{text}'''"
+         logs += f"\n\nTime spent: {time()-t0}"
+         return text, logs
+     except Exception as e:
+         logs += str(e)
+         logs += f"\n\nTime spent: {time()-t0}"
+         return "Oops! Internal server error. Check the logs of the space/instance.", logs
+
+ print("! LOAD GRADIO INTERFACE !")
+ demo = gr.Interface(
+     fn=generate_answer,
+     inputs=[
+         gr.components.Textbox(label="Input"),
+         gr.components.Number(value=256),
+         gr.components.Textbox(label="Custom system prompt"),
+     ],
+     outputs=[
+         gr.components.Textbox(label="Output"),
+         gr.components.Textbox(label="Logs")
+     ],
+     title=title,
+     description=desc,
+     allow_flagging='never'
+ )
+ demo.queue()
+ print("! LAUNCHING GRADIO !")
+ demo.launch(server_name="0.0.0.0")
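Once the app is running, the endpoint can also be called programmatically. A minimal sketch (not part of this commit) using the gradio_client package; the host/port and the "/predict" endpoint name are assumptions based on Gradio's defaults for a single gr.Interface:

# Hypothetical client-side call to the running app (not part of this commit).
# Assumes the app is reachable at http://localhost:7860 and exposes Gradio's default /predict endpoint.
from gradio_client import Client

client = Client("http://localhost:7860")
answer, logs = client.predict(
    "Write a haiku about llamas.",   # Input textbox
    64,                              # max tokens (16-256, otherwise the app falls back to 64)
    "You are a helpful assistant.",  # custom system prompt
    api_name="/predict",
)
print(answer)
print(logs)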
requirements.txt
ADDED
@@ -0,0 +1,2 @@
+ psutil
+ gradio