llama-cpp-server

Paused

llama-cpp-server / Dockerfile

Update Dockerfile

572e6c5 verified over 1 year ago

1.53 kB

	# Global build arguments. They are declared ahead of the first FROM so that
	# they can be substituted into the FROM lines of the stages below.
	# Ubuntu release used in the base image tags.
	ARG UBUNTU_VERSION=22.04
	# CUDA release used in the base image tags.
	# This needs to generally match the container host's environment.
	ARG CUDA_VERSION=11.7.1
	# Target the CUDA build image (the "devel" variant; base of the build stage).
	ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
	# Target the CUDA runtime image (the "runtime" variant; base of the final stage).
	ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

	# ---- Build stage ---------------------------------------------------------
	# Compiles the sources from the build context on top of the CUDA devel image.
	FROM ${BASE_CUDA_DEV_CONTAINER} AS build

	# nvcc target architecture; the default "all" yields a fat build.
	ARG CUDA_DOCKER_ARCH=all

	# Compiler toolchain and git for the build.
	RUN apt-get update \
		&& apt-get install -y build-essential git

	# Copy the entire build context into /app and work from there.
	WORKDIR /app
	COPY . .

	# Export the nvcc architecture and enable cuBLAS for the make step.
	ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} \
		LLAMA_CUBLAS=1

	RUN make

	# ---- Runtime stage -------------------------------------------------------
	# Final image: CUDA runtime base plus the llama-cpp-python server, the GGUF
	# model and the launcher files from the build context.
	# Layers are ordered from least to most frequently changing, so that a source
	# change does not force the packages or the model to be fetched again.
	FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

	# Install build and runtime dependencies.
	# - python3 / python3-dev / python3-pip are listed explicitly: the pip step
	#   below needs them and nothing else in this stage installs them.
	# - ca-certificates is listed explicitly because the HTTPS model download
	#   needs it and recommended packages are no longer pulled in.
	# The apt lists are removed in the same layer to keep the image smaller.
	RUN apt-get update && \
		apt-get install -y --no-install-recommends \
			build-essential \
			ca-certificates \
			curl \
			libopenblas-dev \
			ninja-build \
			pkg-config \
			python3 \
			python3-dev \
			python3-pip && \
		rm -rf /var/lib/apt/lists/*

	# Install the llama-cpp-python server. "python3 -m pip" guarantees that the
	# pip belonging to the interpreter installed above is used; --no-cache-dir
	# keeps pip's download cache out of the layer.
	RUN python3 -m pip install --no-cache-dir -U pip setuptools wheel && \
		python3 -m pip install --no-cache-dir --verbose "llama-cpp-python[server]"

	# Download model.
	# -f makes curl exit non-zero on an HTTP error, so a failed download aborts
	# the build instead of saving the error page as the model file.
	RUN mkdir model && \
		curl -fL https://huggingface.co/matthoffner/Magicoder-S-DS-6.7B-GGUF/resolve/main/Magicoder-S-DS-6.7B_Q4_K_M.gguf -o model/gguf-model.gguf

	# Binary produced by the build stage. Copied after the expensive layers
	# above so that rebuilding it does not invalidate them.
	COPY --from=build /app/main /main

	# Launcher script, Python entry point and static page from the build context.
	# NOTE(review): files are kept in the image's default working directory so
	# that any paths start_server.sh relies on stay valid — confirm before
	# introducing a WORKDIR.
	COPY ./start_server.sh ./
	COPY ./main.py ./
	COPY ./index.html ./

	# Make the server start script executable
	RUN chmod +x ./start_server.sh

	# Host and port exposed to the container environment.
	ENV HOST=0.0.0.0 \
		PORT=7860

	# Expose a port for the server
	EXPOSE ${PORT}

	# Run the server start script
	CMD ["/bin/sh", "./start_server.sh"]