Spaces: lhhj — Space status: Runtime error

Commit 57e3690 ("first"), committed by lhhj · 1 parent: 4aac1ef

This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
- .clang-tidy +24 -0
- .devops/cloud-v-pipeline +22 -0
- .devops/full-cuda.Dockerfile +33 -0
- .devops/full-musa.Dockerfile +26 -0
- .devops/full-rocm.Dockerfile +50 -0
- .devops/full.Dockerfile +25 -0
- .devops/llama-cli-cann.Dockerfile +44 -0
- .devops/llama-cli-cuda.Dockerfile +37 -0
- .devops/llama-cli-intel.Dockerfile +28 -0
- .devops/llama-cli-musa.Dockerfile +30 -0
- .devops/llama-cli-rocm.Dockerfile +45 -0
- .devops/llama-cli-vulkan.Dockerfile +27 -0
- .devops/llama-cli.Dockerfile +23 -0
- .devops/llama-cpp-cuda.srpm.spec +83 -0
- .devops/llama-cpp.srpm.spec +85 -0
- .devops/llama-server-cuda.Dockerfile +42 -0
- .devops/llama-server-intel.Dockerfile +34 -0
- .devops/llama-server-musa.Dockerfile +35 -0
- .devops/llama-server-rocm.Dockerfile +54 -0
- .devops/llama-server-vulkan.Dockerfile +31 -0
- .devops/llama-server.Dockerfile +29 -0
- .devops/nix/apps.nix +21 -0
- .devops/nix/devshells.nix +52 -0
- .devops/nix/docker.nix +37 -0
- .devops/nix/jetson-support.nix +39 -0
- .devops/nix/nixpkgs-instances.nix +45 -0
- .devops/nix/package-gguf-py.nix +36 -0
- .devops/nix/package.nix +246 -0
- .devops/nix/python-scripts.nix +66 -0
- .devops/nix/scope.nix +41 -0
- .devops/nix/sif.nix +27 -0
- .devops/tools.sh +41 -0
- .dockerignore +20 -0
- .ecrc +6 -0
- .editorconfig +32 -0
- .flake8 +17 -0
- .gitattributes +1 -0
- .gitignore +135 -0
- .gitmodules +3 -0
- .pre-commit-config.yaml +16 -0
- AUTHORS +782 -0
- CMakeLists.txt +216 -0
- CMakePresets.json +81 -0
- CONTRIBUTING.md +33 -0
- LICENSE +21 -0
- Makefile +1702 -0
- Package.swift +80 -0
- SECURITY.md +67 -0
- ci/README.md +29 -0
- ci/run.sh +851 -0
.clang-tidy
ADDED
@@ -0,0 +1,24 @@
+---
+Checks: >
+    bugprone-*,
+    -bugprone-easily-swappable-parameters,
+    -bugprone-implicit-widening-of-multiplication-result,
+    -bugprone-misplaced-widening-cast,
+    -bugprone-narrowing-conversions,
+    readability-*,
+    -readability-avoid-unconditional-preprocessor-if,
+    -readability-function-cognitive-complexity,
+    -readability-identifier-length,
+    -readability-implicit-bool-conversion,
+    -readability-magic-numbers,
+    -readability-uppercase-literal-suffix,
+    -readability-simplify-boolean-expr,
+    clang-analyzer-*,
+    -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
+    performance-*,
+    portability-*,
+    misc-*,
+    -misc-const-correctness,
+    -misc-non-private-member-variables-in-classes,
+    -misc-no-recursion,
+FormatStyle: none
.devops/cloud-v-pipeline
ADDED
@@ -0,0 +1,22 @@
+node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
+    stage('Cleanup'){
+        cleanWs() // Cleaning previous CI build in workspace
+    }
+    stage('checkout repo'){
+        retry(5){ // Retry if the cloning fails due to some reason
+            checkout scm // Clone the repo on Runner
+        }
+    }
+    stage('Compiling llama.cpp'){
+        sh'''#!/bin/bash
+        make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
+        '''
+    }
+    stage('Running llama.cpp'){
+        sh'''#!/bin/bash
+        module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
+        qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+        cat llama_log.txt # Printing results
+        '''
+    }
+}
.devops/full-cuda.Dockerfile
ADDED
@@ -0,0 +1,33 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
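
The resulting image wraps tools.sh as its entrypoint, so conversion, quantization, and inference share one container. A minimal usage sketch (not part of the diff; the image tag and model path are placeholders):

    # hypothetical tag and paths; --run is forwarded to llama-cli by tools.sh
    docker build -t llama-cpp:full-cuda -f .devops/full-cuda.Dockerfile .
    docker run --gpus all -v /path/to/models:/models llama-cpp:full-cuda \
        --run -m /models/model.gguf -p "Hello" -n 64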
.devops/full-musa.Dockerfile
ADDED
@@ -0,0 +1,26 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release -j$(nproc) && \
+    cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
.devops/full-rocm.Dockerfile
ADDED
@@ -0,0 +1,50 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH="\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102"
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU architectures to build for
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
+RUN make -j$(nproc)
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
.devops/full.Dockerfile
ADDED
@@ -0,0 +1,25 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+ENV LLAMA_CURL=1
+
+RUN make -j$(nproc)
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
.devops/llama-cli-cann.Dockerfile
ADDED
@@ -0,0 +1,44 @@
+ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
+
+FROM cosdt/cann:$ASCEND_VERSION AS build
+
+WORKDIR /app
+
+COPY . .
+
+RUN yum install -y gcc g++ cmake make
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+# find libascend_hal.so, because the driver hasn't been mounted.
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH
+
+RUN echo "Building with static libs" && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
+    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
+    cmake --build build --config Release --target llama-cli
+
+# TODO: use image with NNRT
+FROM cosdt/cann:$ASCEND_VERSION AS runtime
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
+ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
+ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
+ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
+ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
+ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
+ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
+ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}
+
+ENTRYPOINT ["/llama-cli" ]
.devops/llama-cli-cuda.Dockerfile
ADDED
@@ -0,0 +1,37 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
.devops/llama-cli-intel.Dockerfile
ADDED
@@ -0,0 +1,28 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    echo "Building with static libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
+    cmake --build build --config Release --target llama-cli
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
+
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
.devops/llama-cli-musa.Dockerfile
ADDED
@@ -0,0 +1,30 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
.devops/llama-cli-rocm.Dockerfile
ADDED
@@ -0,0 +1,45 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH="\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102"
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU architectures to build for
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+
+RUN make -j$(nproc) llama-cli
+
+ENTRYPOINT [ "/app/llama-cli" ]
.devops/llama-cli-vulkan.Dockerfile
ADDED
@@ -0,0 +1,27 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget libgomp1
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN cmake -B build -DGGML_VULKAN=1 && \
+    cmake --build build --config Release --target llama-cli
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/llama-cli /llama-cli && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
.devops/llama-cli.Dockerfile
ADDED
@@ -0,0 +1,23 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git
+
+WORKDIR /app
+
+COPY . .
+
+RUN make -j$(nproc) llama-cli
+
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libgomp1
+
+COPY --from=build /app/llama-cli /llama-cli
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/llama-cli" ]
.devops/llama-cpp-cuda.srpm.spec
ADDED
@@ -0,0 +1,83 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# Built and maintained by John Boero - [email protected]
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp-cuda
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        CUDA inference of LLaMA model in pure C/C++
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
+Requires:       cuda-toolkit
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CUDA inference for Meta's Llama2 models using default options.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j GGML_CUDA=1
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
+[Unit]
+Description=Llama.cpp server (CUDA build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llama-cuda-cli
+%{_bindir}/llama-cuda-server
+%{_bindir}/llama-cuda-simple
+/usr/lib/systemd/system/llamacuda.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
.devops/llama-cpp.srpm.spec
ADDED
@@ -0,0 +1,85 @@
+# SRPM for building from source and packaging an RPM for RPM-based distros.
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
+# Built and maintained by John Boero - [email protected]
+# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
+
+# Notes for llama.cpp:
+# 1. Tags are currently based on hash - which will not sort asciibetically.
+#    We need to declare standard versioning if people want to sort latest releases.
+#    In the meantime, YYYYMMDD format will be used.
+# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
+# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
+#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
+# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
+#    It is up to the user to install the correct vendor-specific support.
+
+Name:           llama.cpp
+Version:        %( date "+%%Y%%m%%d" )
+Release:        1%{?dist}
+Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
+License:        MIT
+Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
+Requires:       libstdc++
+URL:            https://github.com/ggerganov/llama.cpp
+
+%define debug_package %{nil}
+%define source_date_epoch_from_changelog 0
+
+%description
+CPU inference for Meta's Llama2 models using default options.
+Models are not included in this package and must be downloaded separately.
+
+%prep
+%setup -n llama.cpp-master
+
+%build
+make -j
+
+%install
+mkdir -p %{buildroot}%{_bindir}/
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple
+
+mkdir -p %{buildroot}/usr/lib/systemd/system
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service
+[Unit]
+Description=Llama.cpp server, CPU only (no GPU support in this build).
+After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
+
+[Service]
+Type=simple
+EnvironmentFile=/etc/sysconfig/llama
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
+ExecReload=/bin/kill -s HUP $MAINPID
+Restart=never
+
+[Install]
+WantedBy=default.target
+EOF
+
+mkdir -p %{buildroot}/etc/sysconfig
+%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
+LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
+EOF
+
+%clean
+rm -rf %{buildroot}
+rm -rf %{_builddir}/*
+
+%files
+%{_bindir}/llama-cli
+%{_bindir}/llama-server
+%{_bindir}/llama-simple
+/usr/lib/systemd/system/llama.service
+%config /etc/sysconfig/llama
+
+%pre
+
+%post
+
+%preun
+%postun
+
+%changelog
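
Both specs follow the standard rpmbuild workflow; a hedged sketch (the tarball placement is an assumption, not stated in the diff):

    # assumes master.tar.gz has been placed under ~/rpmbuild/SOURCES
    rpmbuild -ba .devops/llama-cpp.srpm.spec        # CPU build
    rpmbuild -ba .devops/llama-cpp-cuda.srpm.spec   # CUDA build (needs NVidia's repo)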
.devops/llama-server-cuda.Dockerfile
ADDED
@@ -0,0 +1,42 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.6.0
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
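
Since LLAMA_ARG_HOST is forced to 0.0.0.0 and the health check polls port 8080, a run sketch looks roughly like this (tag and model path are illustrative, not from the diff):

    docker build -t llama-server:cuda -f .devops/llama-server-cuda.Dockerfile .
    docker run --gpus all -p 8080:8080 -v /path/to/models:/models \
        llama-server:cuda -m /models/model.gguf
    curl http://localhost:8080/health   # same endpoint the HEALTHCHECK uses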
.devops/llama-server-intel.Dockerfile
ADDED
@@ -0,0 +1,34 @@
+ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+
+ARG GGML_SYCL_F16=OFF
+RUN apt-get update && \
+    apt-get install -y git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    echo "Building with dynamic libs" && \
+    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake --build build --config Release --target llama-server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev curl
+
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
.devops/llama-server-musa.Dockerfile
ADDED
@@ -0,0 +1,35 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
.devops/llama-server-rocm.Dockerfile
ADDED
@@ -0,0 +1,54 @@
+ARG UBUNTU_VERSION=22.04
+
+# This needs to generally match the container host's environment.
+ARG ROCM_VERSION=5.6
+
+# Target the ROCm build image
+ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
+
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build
+
+# Unless otherwise specified, we make a fat build.
+# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# This is mostly tied to rocBLAS supported archs.
+ARG ROCM_DOCKER_ARCH="\
+    gfx803 \
+    gfx900 \
+    gfx906 \
+    gfx908 \
+    gfx90a \
+    gfx1010 \
+    gfx1030 \
+    gfx1100 \
+    gfx1101 \
+    gfx1102"
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+# Set the AMD GPU architectures to build for
+ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
+# Enable ROCm
+ENV GGML_HIPBLAS=1
+ENV CC=/opt/rocm/llvm/bin/clang
+ENV CXX=/opt/rocm/llvm/bin/clang++
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev curl
+
+RUN make -j$(nproc) llama-server
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/app/llama-server" ]
.devops/llama-server-vulkan.Dockerfile
ADDED
@@ -0,0 +1,31 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK and cURL
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+    cmake --build build --config Release --target llama-server
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/llama-server /llama-server && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
.devops/llama-server.Dockerfile
ADDED
@@ -0,0 +1,29 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION AS build
+
+RUN apt-get update && \
+    apt-get install -y build-essential git libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+ENV LLAMA_CURL=1
+
+RUN make -j$(nproc) llama-server
+
+FROM ubuntu:$UBUNTU_VERSION AS runtime
+
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/llama-server /llama-server
+
+ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
.devops/nix/apps.nix
ADDED
@@ -0,0 +1,21 @@
+{
+  perSystem =
+    { config, lib, ... }:
+    {
+      apps =
+        let
+          inherit (config.packages) default;
+          binaries = [
+            "llama-cli"
+            "llama-embedding"
+            "llama-server"
+            "llama-quantize"
+          ];
+          mkApp = name: {
+            type = "app";
+            program = "${default}/bin/${name}";
+          };
+        in
+        lib.genAttrs binaries mkApp;
+    };
+}
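
Because each of those binaries is exposed as a flake app, it can be invoked directly with `nix run`; a sketch (the model path is illustrative):

    nix run .#llama-cli -- -m /path/to/model.gguf -p "Hello" -n 32
    nix run .#llama-server -- -m /path/to/model.gguf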
.devops/nix/devshells.nix
ADDED
@@ -0,0 +1,52 @@
+{ inputs, ... }:
+
+{
+  perSystem =
+    {
+      config,
+      lib,
+      system,
+      ...
+    }:
+    {
+      devShells =
+        let
+          pkgs = import inputs.nixpkgs { inherit system; };
+          stdenv = pkgs.stdenv;
+          scripts = config.packages.python-scripts;
+        in
+        lib.pipe (config.packages) [
+          (lib.concatMapAttrs (
+            name: package: {
+              ${name} = pkgs.mkShell {
+                name = "${name}";
+                inputsFrom = [ package ];
+                shellHook = ''
+                  echo "Entering ${name} devShell"
+                '';
+              };
+              "${name}-extra" =
+                if (name == "python-scripts") then
+                  null
+                else
+                  pkgs.mkShell {
+                    name = "${name}-extra";
+                    inputsFrom = [
+                      package
+                      scripts
+                    ];
+                    # Extra packages that *may* be used by some scripts
+                    packages = [
+                      pkgs.python3Packages.tiktoken
+                    ];
+                    shellHook = ''
+                      echo "Entering ${name} devShell"
+                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+                    '';
+                  };
+            }
+          ))
+          (lib.filterAttrs (name: value: value != null))
+        ];
+    };
+}
.devops/nix/docker.nix
ADDED
@@ -0,0 +1,37 @@
+{
+  lib,
+  dockerTools,
+  buildEnv,
+  llama-cpp,
+  interactive ? true,
+  coreutils,
+}:
+
+# A tar that can be fed into `docker load`:
+#
+# $ nix build .#llamaPackages.docker
+# $ docker load < result
+
+# For details and variations cf.
+# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
+# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
+# - https://nixery.dev/
+
+# Approximate (compressed) sizes, at the time of writing, are:
+#
+# .#llamaPackages.docker: 125M;
+# .#llamaPackagesCuda.docker: 537M;
+# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+
+dockerTools.buildLayeredImage {
+  name = llama-cpp.pname;
+  tag = "latest";
+
+  contents =
+    [ llama-cpp ]
+    ++ lib.optionals interactive [
+      coreutils
+      dockerTools.binSh
+      dockerTools.caCertificates
+    ];
+}
.devops/nix/jetson-support.nix
ADDED
@@ -0,0 +1,39 @@
+{ inputs, ... }:
+{
+  perSystem =
+    {
+      config,
+      system,
+      lib,
+      pkgsCuda,
+      ...
+    }:
+    {
+      legacyPackages =
+        let
+          caps.llamaPackagesXavier = "7.2";
+          caps.llamaPackagesOrin = "8.7";
+          caps.llamaPackagesTX2 = "6.2";
+          caps.llamaPackagesNano = "5.3";
+
+          pkgsFor =
+            cap:
+            import inputs.nixpkgs {
+              inherit system;
+              config = {
+                cudaSupport = true;
+                cudaCapabilities = [ cap ];
+                cudaEnableForwardCompat = false;
+                inherit (pkgsCuda.config) allowUnfreePredicate;
+              };
+            };
+        in
+        builtins.mapAttrs (name: cap: (pkgsFor cap).callPackage ./scope.nix { }) caps;
+
+      packages = lib.optionalAttrs (system == "aarch64-linux") {
+        jetson-xavier = config.legacyPackages.llamaPackagesXavier.llama-cpp;
+        jetson-orin = config.legacyPackages.llamaPackagesOrin.llama-cpp;
+        jetson-nano = config.legacyPackages.llamaPackagesNano.llama-cpp;
+      };
+    };
+}
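
On an aarch64-linux machine these surface as ordinary package outputs; a hedged build sketch:

    # builds llama.cpp pinned to the Xavier (CUDA capability 7.2) nixpkgs instance
    nix build .#jetson-xavier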
.devops/nix/nixpkgs-instances.nix
ADDED
@@ -0,0 +1,45 @@
+{ inputs, ... }:
+{
+  # The _module.args definitions are passed on to modules as arguments. E.g.
+  # the module `{ pkgs ... }: { /* config */ }` implicitly uses
+  # `_module.args.pkgs` (defined in this case by flake-parts).
+  perSystem =
+    { system, ... }:
+    {
+      _module.args = {
+        # Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
+        # again, the below creates several nixpkgs instances which the
+        # flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
+        #
+        # This is currently "slow" and "expensive", on a certain scale.
+        # This also isn't "right" in that this hinders dependency injection at
+        # the level of flake inputs. This might get removed in the foreseeable
+        # future.
+        #
+        # Note that you can use these expressions without Nix
+        # (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
+
+        pkgsCuda = import inputs.nixpkgs {
+          inherit system;
+          # Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
+          # and ucx are built with CUDA support)
+          config.cudaSupport = true;
+          config.allowUnfreePredicate =
+            p:
+            builtins.all (
+              license:
+              license.free
+              || builtins.elem license.shortName [
+                "CUDA EULA"
+                "cuDNN EULA"
+              ]
+            ) (p.meta.licenses or [ p.meta.license ]);
+        };
+        # Ensure dependencies use ROCm consistently
+        pkgsRocm = import inputs.nixpkgs {
+          inherit system;
+          config.rocmSupport = true;
+        };
+      };
+    };
+}
.devops/nix/package-gguf-py.nix
ADDED
@@ -0,0 +1,36 @@
+{
+  lib,
+  llamaVersion,
+  numpy,
+  tqdm,
+  sentencepiece,
+  pyyaml,
+  poetry-core,
+  buildPythonPackage,
+  pytestCheckHook,
+}:
+
+buildPythonPackage {
+  pname = "gguf";
+  version = llamaVersion;
+  pyproject = true;
+  nativeBuildInputs = [ poetry-core ];
+  propagatedBuildInputs = [
+    numpy
+    tqdm
+    sentencepiece
+    pyyaml
+  ];
+  src = lib.cleanSource ../../gguf-py;
+  pythonImportsCheck = [
+    "numpy"
+    "gguf"
+  ];
+  nativeCheckInputs = [ pytestCheckHook ];
+  doCheck = true;
+  meta = with lib; {
+    description = "Python package for writing binary files in the GGUF format";
+    license = licenses.mit;
+    maintainers = [ maintainers.ditsuke ];
+  };
+}
.devops/nix/package.nix
ADDED
@@ -0,0 +1,246 @@
+{
+  lib,
+  glibc,
+  config,
+  stdenv,
+  runCommand,
+  cmake,
+  ninja,
+  pkg-config,
+  git,
+  mpi,
+  blas,
+  cudaPackages,
+  autoAddDriverRunpath,
+  darwin,
+  rocmPackages,
+  vulkan-headers,
+  vulkan-loader,
+  curl,
+  shaderc,
+  useBlas ?
+    builtins.all (x: !x) [
+      useCuda
+      useMetalKit
+      useRocm
+      useVulkan
+    ]
+    && blas.meta.available,
+  useCuda ? config.cudaSupport,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
+  # Increases the runtime closure size by ~700M
+  useMpi ? false,
+  useRocm ? config.rocmSupport,
+  enableCurl ? true,
+  useVulkan ? false,
+  llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+
+  # It's necessary to consistently use backendStdenv when building with CUDA support,
+  # otherwise we get libstdc++ errors downstream.
+  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
+  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
+  precompileMetalShaders ? false,
+}:
+
+let
+  inherit (lib)
+    cmakeBool
+    cmakeFeature
+    optionals
+    strings
+    ;
+
+  stdenv = throw "Use effectiveStdenv instead";
+
+  suffices =
+    lib.optionals useBlas [ "BLAS" ]
+    ++ lib.optionals useCuda [ "CUDA" ]
+    ++ lib.optionals useMetalKit [ "MetalKit" ]
+    ++ lib.optionals useMpi [ "MPI" ]
+    ++ lib.optionals useRocm [ "ROCm" ]
+    ++ lib.optionals useVulkan [ "Vulkan" ];
+
+  pnameSuffix =
+    strings.optionalString (suffices != [ ])
+      "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
+  descriptionSuffix = strings.optionalString (
+    suffices != [ ]
+  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";
+
+  xcrunHost = runCommand "xcrunHost" { } ''
+    mkdir -p $out/bin
+    ln -s /usr/bin/xcrun $out/bin
+  '';
+
+  # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
+  # separately
+  darwinBuildInputs =
+    with darwin.apple_sdk.frameworks;
+    [
+      Accelerate
+      CoreVideo
+      CoreGraphics
+    ]
+    ++ optionals useMetalKit [ MetalKit ];
+
+  cudaBuildInputs = with cudaPackages; [
+    cuda_cudart
+    cuda_cccl # <nv/target>
+    libcublas
+  ];
+
+  rocmBuildInputs = with rocmPackages; [
+    clr
+    hipblas
+    rocblas
+  ];
+
+  vulkanBuildInputs = [
+    vulkan-headers
+    vulkan-loader
+    shaderc
+  ];
+in
+
+effectiveStdenv.mkDerivation (finalAttrs: {
+  pname = "llama-cpp${pnameSuffix}";
+  version = llamaVersion;
+
+  # Note: none of the files discarded here are visible in the sandbox or
+  # affect the output hash. This also means they can be modified without
+  # triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        noneOf = builtins.all (x: !x);
+        baseName = baseNameOf name;
+      in
+      noneOf [
+        (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
+        (lib.hasSuffix ".md" name) # Ignore *.md changes when computing outPaths
+        (lib.hasPrefix "." baseName) # Skip hidden files and directories
+        (baseName == "flake.lock")
+      ];
+    src = lib.cleanSource ../../.;
+  };
+
+  postPatch = ''
+    substituteInPlace ./ggml/src/ggml-metal.m \
+      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+    substituteInPlace ./ggml/src/ggml-metal.m \
+      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
+  '';
+
+  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+  # `default.metallib` may be compiled with Metal compiler from XCode
+  # and we need to escape sandbox on MacOS to access Metal compiler.
+  # `xcrun` is used to find the path of the Metal compiler, which is variable
+  # and not on $PATH
+  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
+  nativeBuildInputs =
+    [
+      cmake
+      ninja
+      pkg-config
+      git
+    ]
+    ++ optionals useCuda [
+      cudaPackages.cuda_nvcc
+
+      autoAddDriverRunpath
+    ]
+    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
+
+  buildInputs =
+    optionals effectiveStdenv.isDarwin darwinBuildInputs
+    ++ optionals useCuda cudaBuildInputs
+    ++ optionals useMpi [ mpi ]
+    ++ optionals useRocm rocmBuildInputs
+    ++ optionals useBlas [ blas ]
+    ++ optionals useVulkan vulkanBuildInputs
+    ++ optionals enableCurl [ curl ];
+
+  cmakeFlags =
+    [
+      (cmakeBool "LLAMA_BUILD_SERVER" true)
+      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+      (cmakeBool "LLAMA_CURL" enableCurl)
+      (cmakeBool "GGML_NATIVE" false)
+      (cmakeBool "GGML_BLAS" useBlas)
+      (cmakeBool "GGML_CUDA" useCuda)
+      (cmakeBool "GGML_HIPBLAS" useRocm)
+      (cmakeBool "GGML_METAL" useMetalKit)
+      (cmakeBool "GGML_VULKAN" useVulkan)
+      (cmakeBool "GGML_STATIC" enableStatic)
+    ]
+    ++ optionals useCuda [
+      (
+        with cudaPackages.flags;
+        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+        )
+      )
+    ]
+    ++ optionals useRocm [
+      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+    ]
+    ++ optionals useMetalKit [
+      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+    ];
+
+  # Environment variables needed for ROCm
+  env = optionals useRocm {
+    ROCM_PATH = "${rocmPackages.clr}";
+    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+  };
+
+  # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
+  # if they haven't been added yet.
+  postInstall = ''
+    mkdir -p $out/include
+    cp $src/include/llama.h $out/include/
+  '';
+
+  meta = {
+    # Configurations we don't want even the CI to evaluate. Results in the
+    # "unsupported platform" messages. This is mostly a no-op, because
+    # cudaPackages would've refused to evaluate anyway.
+    badPlatforms = optionals useCuda lib.platforms.darwin;
+
+    # Configurations that are known to result in build failures. Can be
+    # overridden by importing Nixpkgs with `allowBroken = true`.
+    broken = (useMetalKit && !effectiveStdenv.isDarwin);
+
+    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+    homepage = "https://github.com/ggerganov/llama.cpp/";
+    license = lib.licenses.mit;
+
+    # Accommodates `nix run` and `lib.getExe`
+    mainProgram = "llama-cli";
+
+    # These people might respond, on the best effort basis, if you ping them
+    # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
+    # Consider adding yourself to this list if you want to ensure this flake
+    # stays maintained and you're willing to invest your time. Do not add
+    # other people without their consent. Consider removing people after
+    # they've been unreachable for long periods of time.
+
+    # Note that lib.maintainers is defined in Nixpkgs, but you may just add
+    # an attrset following the same format as in
+    # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
+    maintainers = with lib.maintainers; [
+      philiptaron
+      SomeoneSerge
+    ];
+
+    # Extend `badPlatforms` instead
+    platforms = lib.platforms.all;
+  };
+})
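
The scopes built on top of this package expose per-backend variants; judging from the attribute names used in docker.nix's comments, builds would look roughly like this (the ROCm attribute name is an assumption):

    nix build .#llamaPackages.llama-cpp        # default backend for the host
    nix build .#llamaPackagesCuda.llama-cpp    # CUDA variant
    nix build .#llamaPackagesRocm.llama-cpp    # ROCm variant (assumed name)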
.devops/nix/python-scripts.nix
ADDED
@@ -0,0 +1,66 @@
+{
+  lib,
+  stdenv,
+  buildPythonPackage,
+  poetry-core,
+  mkShell,
+  python3Packages,
+  gguf-py,
+}@inputs:
+
+let
+  llama-python-deps = with python3Packages; [
+    numpy
+    sentencepiece
+    transformers
+    protobuf
+    torchWithoutCuda
+    gguf-py
+    tqdm
+
+    # for scripts/compare-llama-bench.py
+    gitpython
+    tabulate
+
+    # for examples/pydantic-models-to-grammar-examples.py
+    docstring-parser
+    pydantic
+
+  ];
+
+  llama-python-test-deps = with python3Packages; [
+    # Server bench
+    matplotlib
+
+    # server tests
+    openai
+    behave
+    prometheus-client
+  ];
+in
+
+buildPythonPackage ({
+  pname = "llama-scripts";
+  version = "0.0.0";
+  pyproject = true;
+
+  # NOTE: The files filtered out here are not visible in the build sandbox, neither
+  # do they affect the output hash. They can be modified without triggering a rebuild.
+  src = lib.cleanSourceWith {
+    filter =
+      name: type:
+      let
+        any = builtins.any (x: x);
+        baseName = builtins.baseNameOf name;
+      in
+      any [
+        (lib.hasSuffix ".py" name)
+        (baseName == "README.md")
+        (baseName == "pyproject.toml")
+      ];
+    src = lib.cleanSource ../../.;
+  };
+  nativeBuildInputs = [ poetry-core ];
+  nativeCheckInputs = llama-python-test-deps;
+  dependencies = llama-python-deps;
+})
.devops/nix/scope.nix
ADDED
@@ -0,0 +1,41 @@
+{
+  lib,
+  newScope,
+  python3,
+  llamaVersion ? "0.0.0",
+}:
+
+let
+  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
+in
+
+# We're using `makeScope` instead of just writing out an attrset
+# because it allows users to apply overlays later using `overrideScope'`.
+# Cf. https://noogle.dev/f/lib/makeScope
+
+lib.makeScope newScope (self: {
+  inherit llamaVersion;
+  gguf-py = self.callPackage ./package-gguf-py.nix {
+    inherit
+      buildPythonPackage
+      numpy
+      tqdm
+      sentencepiece
+      poetry-core
+      pyyaml
+      pytestCheckHook
+      ;
+  };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+  llama-cpp = self.callPackage ./package.nix { };
+  docker = self.callPackage ./docker.nix { };
+  docker-min = self.callPackage ./docker.nix { interactive = false; };
+  sif = self.callPackage ./sif.nix { };
+})
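Because this is a `lib.makeScope` scope, each member can be built on its own and the whole set can be overridden later via `overrideScope'`, as the comment above notes. A hedged sketch, assuming the flake re-exports the scope members as package outputs:

    nix build .#llama-cpp     # the main C/C++ package
    nix build .#gguf-py       # the gguf Python library
    nix build .#docker-min    # the non-interactive Docker image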
.devops/nix/sif.nix
ADDED
@@ -0,0 +1,27 @@
+{
+  lib,
+  singularity-tools,
+  llama-cpp,
+  bashInteractive,
+  interactive ? false,
+}:
+
+let
+  optionalInt = cond: x: if cond then x else 0;
+in
+singularity-tools.buildImage rec {
+  inherit (llama-cpp) name;
+  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+
+  # These are excessive (but safe) for most variants. Building singularity
+  # images requires superuser privileges, so we build them inside a VM in a
+  # writable image of pre-determined size.
+  #
+  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
+  #
+  # Expected image sizes:
+  # - cpu/blas: 150M,
+  # - cuda, all gencodes: 560M,
+  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+  memSize = diskSize;
+}
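A usage sketch for the resulting Singularity/Apptainer image; `./result` is the standard `nix build` output link, the `sif` attribute comes from scope.nix above, and whether the image's runscript forwards arguments to llama-cli depends on how singularity-tools wires it up:

    nix build .#sif
    singularity run ./result --help   # assumes the runscript launches llama-cli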
.devops/tools.sh
ADDED
@@ -0,0 +1,41 @@
+#!/bin/bash
+set -e
+
+# Read the first argument into a variable
+arg1="$1"
+
+# Shift the arguments to remove the first one
+shift
+
+if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
+    python3 ./convert_hf_to_gguf.py "$@"
+elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
+    ./llama-quantize "$@"
+elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
+    ./llama-cli "$@"
+elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
+    echo "Converting PTH to GGML..."
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
+        if [ -f "${i/f16/q4_0}" ]; then
+            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
+        else
+            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
+            ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
+        fi
+    done
+elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
+    ./llama-server "$@"
+else
+    echo "Unknown command: $arg1"
+    echo "Available commands: "
+    echo "  --run (-r): Run a model previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --convert (-c): Convert a llama model into ggml"
+    echo "              ex: --outtype f16 \"/models/7B/\" "
+    echo "  --quantize (-q): Optimize with quantization process ggml"
+    echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --all-in-one (-a): Execute --convert & --quantize"
+    echo "              ex: \"/models/\" 7B"
+    echo "  --server (-s): Run a model on the server"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -c 2048 -ngl 43 -mg 1 --port 8080"
+fi
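A usage sketch, assuming this script is the entrypoint of the combined "full" Docker image (the image tag below is a placeholder):

    # convert an HF model, then run the quantized result, via the one entrypoint
    docker run -v /path/to/models:/models <full-image> --convert --outtype f16 /models/7B/
    docker run -v /path/to/models:/models <full-image> --run -m /models/7B/ggml-model-q4_0.bin -p "Hello" -n 64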
.dockerignore
ADDED
@@ -0,0 +1,20 @@
+*.o
+*.a
+.cache/
+# Do not ignore .git directory, otherwise the reported build number will always be 0
+.github/
+.gitignore
+.vs/
+.vscode/
+.DS_Store
+
+build*/
+
+models/*
+
+/llama-cli
+/llama-quantize
+
+arm_neon.h
+compile_commands.json
+Dockerfile
.ecrc
ADDED
@@ -0,0 +1,6 @@
+{
+    "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
+    "Disable": {
+        "IndentSize": true
+    }
+}
.editorconfig
ADDED
@@ -0,0 +1,32 @@
+# https://EditorConfig.org
+
+# Top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file, utf-8 charset
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[Makefile]
+indent_style = tab
+
+[scripts/*.mk]
+indent_style = tab
+
+[prompts/*.txt]
+insert_final_newline = unset
+
+[examples/server/public/*]
+indent_size = 2
+
+[examples/llama.swiftui/llama.swiftui.xcodeproj/*]
+indent_style = tab
+
+[examples/cvector-generator/*.txt]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
.flake8
ADDED
@@ -0,0 +1,17 @@
+[flake8]
+max-line-length = 125
+ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
+exclude =
+    # Do not traverse examples
+    examples,
+    # Do not include package initializers
+    __init__.py,
+    # No need to traverse our git directory
+    .git,
+    # There's no value in checking cache directories
+    __pycache__,
+    # No need to include the build path
+    build,
+    # This contains builds that we don't want to check
+    dist  # This is generated with `python build .` for package releases
+# max-complexity = 10
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,135 @@
+# Extensions
+
+*.a
+*.bat
+*.bin
+*.dll
+*.dot
+*.etag
+*.exe
+*.gcda
+*.gcno
+*.gcov
+*.gguf
+*.gguf.json
+*.lastModified
+*.log
+*.metallib
+*.o
+*.so
+*.tmp
+
+# IDE / OS
+
+.cache/
+.ccls-cache/
+.direnv/
+.DS_Store
+.envrc
+.idea/
+.swiftpm
+.vs/
+.vscode/
+nppBackup
+
+
+# Coverage
+
+gcovr-report/
+lcov-report/
+
+# Build Artifacts
+
+tags
+.build/
+build*
+!build-info.cmake
+!build-info.cpp.in
+!build-info.sh
+!build.zig
+!docs/build.md
+/libllama.so
+/llama-*
+/vulkan-shaders-gen
+android-ndk-*
+arm_neon.h
+cmake-build-*
+CMakeSettings.json
+compile_commands.json
+ggml-metal-embed.metal
+llama-batched-swift
+/rpc-server
+out/
+tmp/
+autogen-*.md
+
+# Deprecated
+
+/main
+/server
+
+# CI
+
+!.github/workflows/*.yml
+
+# Models
+
+models/*
+models-mnt
+!models/.editorconfig
+!models/ggml-vocab-*.gguf*
+
+# Zig
+zig-out/
+zig-cache/
+
+# Logs
+
+ppl-*.txt
+qnt-*.txt
+perf-*.txt
+
+# Examples
+
+examples/jeopardy/results.txt
+examples/server/*.css.hpp
+examples/server/*.html.hpp
+examples/server/*.js.hpp
+examples/server/*.mjs.hpp
+!build_64.sh
+!examples/*.bat
+!examples/*/*.kts
+!examples/*/*/*.kts
+!examples/sycl/*.bat
+!examples/sycl/*.sh
+
+# Python
+
+/.venv
+__pycache__/
+*/poetry.lock
+poetry.toml
+
+# Nix
+/result
+
+# Test binaries
+/tests/test-backend-ops
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-rope
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-bpe
+/tests/test-tokenizer-1-spm
+
+# Scripts
+!/scripts/install-oneapi.bat
+
+# Test models for lora adapters
+/lora-tests
.gitmodules
ADDED
@@ -0,0 +1,3 @@
+[submodule "kompute"]
+    path = ggml/src/kompute
+    url = https://github.com/nomic-ai/kompute.git
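After cloning, the kompute sources still have to be fetched before the GGML_KOMPUTE backend can build; the standard git invocation is:

    git submodule update --init ggml/src/kompute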
.pre-commit-config.yaml
ADDED
@@ -0,0 +1,16 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+exclude: prompts/.*.txt
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+  - repo: https://github.com/PyCQA/flake8
+    rev: 7.0.0
+    hooks:
+      - id: flake8
+        additional_dependencies: [flake8-no-print]
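To activate these hooks locally, the standard pre-commit workflow applies:

    pip install pre-commit
    pre-commit install             # run the hooks on every git commit
    pre-commit run --all-files     # one-off check of the whole tree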
AUTHORS
ADDED
@@ -0,0 +1,782 @@
+# date: Wed Jun 26 19:36:34 EEST 2024
+# this file is auto-generated by scripts/gen-authors.sh
+
+0cc4m <[email protected]>
+0xspringtime <[email protected]>
+20kdc <[email protected]>
+2f38b454 <[email protected]>
+3ooabkhxtn <[email protected]>
+44670 <[email protected]>
+AN Long <[email protected]>
+AT <[email protected]>
+Aarni Koskela <[email protected]>
+Aaron Miller <[email protected]>
+Aaryaman Vasishta <[email protected]>
+Abheek Gulati <[email protected]>
+Abhilash Majumder <[email protected]>
+Abhishek Gopinath K <[email protected]>
+Adithya Balaji <[email protected]>
+AdithyanI <[email protected]>
+Adrian <[email protected]>
+Adrian Hesketh <[email protected]>
+Ahmet Zeer <[email protected]>
+AidanBeltonS <[email protected]>
+Aisuko <[email protected]>
+Akarshan Biswas <[email protected]>
+Albert Jin <[email protected]>
+Alberto <[email protected]>
+Alex <[email protected]>
+Alex Azarov <[email protected]>
+Alex Azarov <[email protected]>
+Alex Klinkhamer <[email protected]>
+Alex Klinkhamer <[email protected]>
+Alex Nguyen <[email protected]>
+Alex Petenchea <[email protected]>
+Alex Renda <[email protected]>
+Alex von Gluck IV <[email protected]>
+Alexey Parfenov <[email protected]>
+Ali Chraghi <[email protected]>
+Ali Nehzat <[email protected]>
+Ali Tariq <[email protected]>
+Alon <[email protected]>
+AlpinDale <[email protected]>
+Amir <[email protected]>
+AmirAli Mirian <[email protected]>
+Ananta Bastola <[email protected]>
+Anas Ahouzi <[email protected]>
+András Salamon <[email protected]>
+Andrei <[email protected]>
+Andrew Canis <[email protected]>
+Andrew Downing <[email protected]>
+Andrew Duffy <[email protected]>
+Andrew Godfrey <[email protected]>
+Andy Tai <[email protected]>
+Arik Poznanski <[email protected]>
+Artem <[email protected]>
+Artem Zinnatullin <[email protected]>
+Artyom Lebedev <[email protected]>
+Asbjørn Olling <[email protected]>
+Ásgeir Bjarni Ingvarsson <[email protected]>
+Ashish <[email protected]>
+Ashok Gelal <[email protected]>
+Ashraful Islam <[email protected]>
+Atsushi Tatsuma <[email protected]>
+Austin <[email protected]>
+AustinMroz <[email protected]>
+BADR <[email protected]>
+Bach Le <[email protected]>
+Bailey Chittle <[email protected]>
+BarfingLemurs <[email protected]>
+Bartowski <[email protected]>
+Behnam M <[email protected]>
+Ben Ashbaugh <[email protected]>
+Ben Garney <[email protected]>
+Ben Siraphob <[email protected]>
+Ben Williams <[email protected]>
+Benjamin Findley <[email protected]>
+Benjamin Lecaillon <[email protected]>
+Bernat Vadell <[email protected]>
+Bingan <[email protected]>
+Bodo Graumann <[email protected]>
+Bono Lv <[email protected]>
+Borislav Stanimirov <[email protected]>
+Branden Butler <[email protected]>
+Brian <[email protected]>
+Bruce MacDonald <[email protected]>
+Bryan Honof <[email protected]>
+CJ Pais <[email protected]>
+CRD716 <[email protected]>
+Calvin Laurenson <[email protected]>
+Cameron <[email protected]>
+Cameron Kaiser <[email protected]>
+Carolinabanana <[email protected]>
+Casey Primozic <[email protected]>
+Casey Primozic <[email protected]>
+CausalLM <[email protected]>
+Cebtenzzre <[email protected]>
+Chad Brewbaker <[email protected]>
+Chao Jiang <[email protected]>
+Cheng Shao <[email protected]>
+Chris Elrod <[email protected]>
+Chris Kuehl <[email protected]>
+Christian Demsar <[email protected]>
+Christian Demsar <[email protected]>
+Christian Falch <[email protected]>
+Christian Kögler <[email protected]>
+Christian Zhou-Zheng <[email protected]>
+Clark Saben <[email protected]>
+Clint Herron <[email protected]>
+CrispStrobe <[email protected]>
+Cuong Trinh Manh <[email protected]>
+DAN™ <[email protected]>
+Damian Stewart <[email protected]>
+Dane Madsen <[email protected]>
+DaniAndTheWeb <[email protected]>
+Daniel Bevenius <[email protected]>
+Daniel Drake <[email protected]>
+Daniel Hiltgen <[email protected]>
+Daniel Illescas Romero <[email protected]>
+Daniele <[email protected]>
+DannyDaemonic <[email protected]>
+Dat Quoc Nguyen <[email protected]>
+Dave <[email protected]>
+Dave Airlie <[email protected]>
+Dave Airlie <[email protected]>
+Dave Della Costa <[email protected]>
+David Friehs <[email protected]>
+David Kennedy <[email protected]>
+David Pflug <[email protected]>
+David Renshaw <[email protected]>
+David Sommers <[email protected]>
+David Yang <[email protected]>
+Dawid Potocki <[email protected]>
+Dawid Wysocki <[email protected]>
+Dean <[email protected]>
+Deins <[email protected]>
+Deven Mistry <[email protected]>
+Didzis Gosko <[email protected]>
+Djip007 <[email protected]>
+Don Mahurin <[email protected]>
+DooWoong Lee (David) <[email protected]>
+Doomsdayrs <[email protected]>
+Douglas Hanley <[email protected]>
+Dr. Tom Murphy VII Ph.D <[email protected]>
+Ebey Abraham <[email protected]>
+Ed Lee <[email protected]>
+Ed Lepedus <[email protected]>
+Eddie-Wang <[email protected]>
+Edward Taylor <[email protected]>
+Elaine <[email protected]>
+Elbios <[email protected]>
+Elton Kola <[email protected]>
+Engininja2 <[email protected]>
+Equim <[email protected]>
+Eric Sommerlade <[email protected]>
+Eric Zhang <[email protected]>
+Erik Garrison <[email protected]>
+Erik Scholz <[email protected]>
+Ettore Di Giacinto <[email protected]>
+Evan Jones <[email protected]>
+Evan Miller <[email protected]>
+Eve <[email protected]>
+Evgeny Kurnevsky <[email protected]>
+Ewout ter Hoeven <[email protected]>
+ExtReMLapin <[email protected]>
+FK <[email protected]>
+Fabian <[email protected]>
+Fabio R. Sluzala <[email protected]>
+Faez Shakil <[email protected]>
+FantasyGmm <[email protected]>
+Fattire <[email protected]>
+Felix <[email protected]>
+Finn Voorhees <[email protected]>
+Firat <[email protected]>
+Folko-Ven <[email protected]>
+Foul-Tarnished <[email protected]>
+Francisco Melo <[email protected]>
+Frank Mai <[email protected]>
+FrankHB <[email protected]>
+Fred Douglas <[email protected]>
+Frederik Vogel <[email protected]>
+Gabe Goodhart <[email protected]>
+GainLee <[email protected]>
+Galunid <[email protected]>
+Gary Linscott <[email protected]>
+Gary Mulder <[email protected]>
+Gavin Zhao <[email protected]>
+Genkagaku.GPT <[email protected]>
+Georgi Gerganov <[email protected]>
+Gilad S <[email protected]>
+Giuseppe Scrivano <[email protected]>
+GiviMAD <[email protected]>
+Govlzkoy <[email protected]>
+Guillaume "Vermeille" Sanchez <[email protected]>
+Guillaume Wenzek <[email protected]>
+Guoteng <[email protected]>
+Gustavo Rocha Dias <[email protected]>
+Haggai Nuchi <[email protected]>
+Halalaluyafail3 <[email protected]>
+Hamdoud Hakem <[email protected]>
+HanishKVC <[email protected]>
+Haohui Mai <[email protected]>
+Haoxiang Fei <[email protected]>
+Harald Fernengel <[email protected]>
+Hatsune Miku <[email protected]>
+HatsuneMikuUwU33 <[email protected]>
+Henk Poley <[email protected]>
+Henri Vasserman <[email protected]>
+Henrik Forstén <[email protected]>
+Herman Semenov <[email protected]>
+Hesen Peng <[email protected]>
+Hoang Nguyen <[email protected]>
+Hong Bo PENG <[email protected]>
+Hongyu Ouyang <[email protected]>
+Howard Su <[email protected]>
+Hua Jiang <[email protected]>
+Huawei Lin <[email protected]>
+Hugo Roussel <[email protected]>
+Ian Bull <[email protected]>
+Ian Bull <[email protected]>
+Ian Scrivener <[email protected]>
+Ido S <[email protected]>
+IgnacioFDM <[email protected]>
+Igor Okulist <[email protected]>
+Ikko Eltociear Ashimine <[email protected]>
+Ilya Kurdyukov <[email protected]>
+Ionoclast Laboratories <[email protected]>
+Isaac McFadyen <[email protected]>
+IsaacDynamo <[email protected]>
+Ivan Komarov <[email protected]>
+Ivan Stepanov <[email protected]>
+JH23X <[email protected]>
+Jack Mousseau <[email protected]>
+JackJollimore <[email protected]>
+Jaemin Son <[email protected]>
+Jag Chadha <[email protected]>
+Jakub N <[email protected]>
+James A Capozzoli <[email protected]>
+James Reynolds <[email protected]>
+Jan Boon <[email protected]>
+Jan Boon <[email protected]>
+Jan Ploski <[email protected]>
+Jannis Schönleber <[email protected]>
+Jared Van Bortel <[email protected]>
+Jared Van Bortel <[email protected]>
+Jason McCartney <[email protected]>
+Jean-Christophe Hoelt <[email protected]>
+Jean-Michaël Celerier <[email protected]>
+Jed Fox <[email protected]>
+Jeffrey Quesnelle <[email protected]>
+Jesse Jojo Johnson <[email protected]>
+Jeximo <[email protected]>
+Jhen-Jie Hong <[email protected]>
+Jiahao Li <[email protected]>
+Jian Liao <[email protected]>
+JidongZhang-THU <[email protected]>
+Jinwoo Jeong <[email protected]>
+Jiří Podivín <[email protected]>
+Jiří Sejkora <[email protected]>
+Joan Fontanals <[email protected]>
+Joan Fontanals <[email protected]>
+Johan <[email protected]>
+Johannes Gäßler <[email protected]>
+Johannes Rudolph <[email protected]>
+John <[email protected]>
+John Balis <[email protected]>
+John Smith <[email protected]>
+JohnnyB <[email protected]>
+Jonas Wunderlich <[email protected]>
+Jorge A <[email protected]>
+Jose Maldonado <[email protected]>
+Joseph Stahl <[email protected]>
+Josh Ramer <[email protected]>
+Joyce <[email protected]>
+Juan Calderon-Perez <[email protected]>
+Judd <[email protected]>
+Julius Arkenberg <[email protected]>
+Jun Jie <[email protected]>
+Junyang Lin <[email protected]>
+Juraj Bednar <[email protected]>
+Justin Parker <[email protected]>
+Justin Suess <[email protected]>
+Justina Cho <[email protected]>
+Justine Tunney <[email protected]>
+Justine Tunney <[email protected]>
+Juuso Alasuutari <[email protected]>
+KASR <[email protected]>
+Kamil Tomšík <[email protected]>
+Karsten Weiss <[email protected]>
+Karthick <[email protected]>
+Karthik Kumar Viswanathan <[email protected]>
+Karthik Sethuraman <[email protected]>
+Kasumi <[email protected]>
+Kawrakow <[email protected]>
+Keiichi Tabata <[email protected]>
+Kenvix ⭐ <[email protected]>
+Kerfuffle <[email protected]>
+Kevin Gibbons <[email protected]>
+Kevin Ji <[email protected]>
+Kevin Kwok <[email protected]>
+Kevin Lo <[email protected]>
+Kolen Cheung <[email protected]>
+Konstantin Herud <[email protected]>
+Konstantin Zhuravlyov <[email protected]>
+Kunshang Ji <[email protected]>
+Kyle Liang <[email protected]>
+Kyle Mistele <[email protected]>
+Kylin <[email protected]>
+Lars Grammel <[email protected]>
+Laura <[email protected]>
+Lee <[email protected]>
+Lee Drake <[email protected]>
+Leng Yue <[email protected]>
+Leon Knauer <[email protected]>
+LeonEricsson <[email protected]>
+Leonardo Neumann <[email protected]>
+Li Tan <[email protected]>
+Linwei Wang <[email protected]>
+LoganDark <[email protected]>
+LostRuins <[email protected]>
+Luciano <[email protected]>
+Luo Tian <[email protected]>
+Lyle Dean <[email protected]>
+M. Yusuf Sarıgöz <[email protected]>
+Maarten ter Huurne <[email protected]>
+Mack Straight <[email protected]>
+Maël Kerbiriou <[email protected]>
+MaggotHATE <[email protected]>
+Manuel <[email protected]>
+Marc Köhlbrugge <[email protected]>
+Marco Matthies <[email protected]>
+Marcus Dunn <[email protected]>
+Marian Cepok <[email protected]>
+Mark Fairbairn <[email protected]>
+Marko Tasic <[email protected]>
+Markus Tavenrath <[email protected]>
+Martin Delille <[email protected]>
+Martin Krasser <[email protected]>
+Martin Schwaighofer <[email protected]>
+Marvin Gießing <[email protected]>
+Masaya, Kato <[email protected]>
+MasterYi1024 <[email protected]>
+Mateusz Charytoniuk <[email protected]>
+Matheus C. França <[email protected]>
+Matheus Gabriel Alves Silva <[email protected]>
+Mathieu Nayrolles <[email protected]>
+Mathijs de Bruin <[email protected]>
+Matt Clayton <[email protected]>
+Matt Pulver <[email protected]>
+Matteo Boschini <[email protected]>
+Mattheus Chediak <[email protected]>
+Matthew Tejo <[email protected]>
+Matvey Soloviev <[email protected]>
+Max Krasnyansky <[email protected]>
+Max Krasnyansky <[email protected]>
+Maxime <[email protected]>
+Maximilian Winter <[email protected]>
+Meng Zhang <[email protected]>
+Meng, Hengyu <[email protected]>
+Merrick Christensen <[email protected]>
+Michael Coppola <[email protected]>
+Michael Hueschen <[email protected]>
+Michael Kesper <[email protected]>
+Michael Klimenko <[email protected]>
+Michael Podvitskiy <[email protected]>
+Michael Potter <[email protected]>
+Michael de Gans <[email protected]>
+Michaël de Vries <[email protected]>
+Mihai <[email protected]>
+Mike <[email protected]>
+Mikko Juola <[email protected]>
+Minsoo Cheong <[email protected]>
+Mirko185 <[email protected]>
+Mirror Azure <[email protected]>
+Miwa / Ensan <[email protected]>
+Mohammadreza Hendiani <[email protected]>
+Mohammadreza Hendiani <[email protected]>
+Murilo Santana <[email protected]>
+Musab Gultekin <[email protected]>
+Nam D. Tran <[email protected]>
+Nathan Epstein <[email protected]>
+NawafAlansari <[email protected]>
+Nebula <[email protected]>
+Neo Zhang <[email protected]>
+Neo Zhang <[email protected]>
+Neo Zhang Jianyu <[email protected]>
+Neuman Vong <[email protected]>
+Nexesenex <[email protected]>
+Niall Coates <[email protected]>
+Nicolai Weitkemper <[email protected]>
+Nicolás Pérez <[email protected]>
+Nigel Bosch <[email protected]>
+Niklas Korz <[email protected]>
+Nikolas <[email protected]>
+Nindaleth <[email protected]>
+Oleksandr Nikitin <[email protected]>
+Oleksii Maryshchenko <[email protected]>
+Olivier Chafik <[email protected]>
+Ondřej Čertík <[email protected]>
+Ouadie EL FAROUKI <[email protected]>
+Patrice Ferlet <[email protected]>
+Paul Tsochantaris <[email protected]>
+Pavol Rusnak <[email protected]>
+Pedro Cuenca <[email protected]>
+Peter Sugihara <[email protected]>
+Phil H <[email protected]>
+Philip Taron <[email protected]>
+Phillip Kravtsov <[email protected]>
+Pierre Alexandre SCHEMBRI <[email protected]>
+Pierrick Hymbert <[email protected]>
+Przemysław Pawełczyk <[email protected]>
+Qin Yue Chen <[email protected]>
+Qingyou Meng <[email protected]>
+Qu Zongfu <[email protected]>
+RJ Adriaansen <[email protected]>
+Radoslav Gerganov <[email protected]>
+Radosław Gryta <[email protected]>
+Rahul Vivek Nair <[email protected]>
+Raj Hammeer Singh Hada <[email protected]>
+Ralph Soika <[email protected]>
+Rand Xie <[email protected]>
+Randall Fitzgerald <[email protected]>
+Reinforce-II <[email protected]>
+Ren Xuancheng <[email protected]>
+Rene Leonhardt <[email protected]>
+RhinoDevel <[email protected]>
+Riceball LEE <[email protected]>
+Richard Kiss <[email protected]>
+Richard Roberson <[email protected]>
+Rick G <[email protected]>
+Rickard Edén <[email protected]>
+Rickard Hallerbäck <[email protected]>
+Rickey Bowers Jr <[email protected]>
+Riley Stewart <[email protected]>
+Rinne <[email protected]>
+Rinne <[email protected]>
+Robert Brisita <[email protected]>
+Robert Sung-wook Shin <[email protected]>
+Robey Holderith <[email protected]>
+Robyn <[email protected]>
+Roger Meier <[email protected]>
+Roland <[email protected]>
+Romain D <[email protected]>
+Romain Neutron <[email protected]>
+Roman Parykin <[email protected]>
+Ron Evans <[email protected]>
+Ron Jailall <[email protected]>
+Ronny Brendel <[email protected]>
+Ronsor <[email protected]>
+Rowan Hart <[email protected]>
+Rune <[email protected]>
+Ryan Landay <[email protected]>
+Ryder Wishart <[email protected]>
+Ryuei <[email protected]>
+Rőczey Barnabás <[email protected]>
+SakuraUmi <[email protected]>
+Salvador E. Tropea <[email protected]>
+Sam Spilsbury <[email protected]>
+Sami Farin <[email protected]>
+Samuel Maynard <[email protected]>
+Sang-Kil Park <[email protected]>
+Seb C <[email protected]>
+Sebastián A <[email protected]>
+SebastianApel <[email protected]>
+Senemu <[email protected]>
+Sergey Alirzaev <[email protected]>
+Sergio López <[email protected]>
+Sertaç Özercan <[email protected]>
+SeungWon Jeong <[email protected]>
+ShadovvBeast <[email protected]>
+Shakhar Dasgupta <[email protected]>
+Shangning Xu <[email protected]>
+Shijie <[email protected]>
+Shintarou Okada <[email protected]>
+Shouzheng Liu <[email protected]>
+Shouzheng Liu <[email protected]>
+Shuichi Tsutsumi <[email protected]>
+Sigbjørn Skjæret <[email protected]>
+Simon Willison <[email protected]>
+Siwen Yu <[email protected]>
+Sky Yan <[email protected]>
+Slaren <[email protected]>
+Slava Primenko <[email protected]>
+SoftwareRenderer <[email protected]>
+Someone <[email protected]>
+Someone Serge <[email protected]>
+Sourab Mangrulkar <[email protected]>
+Spencer Sutton <[email protected]>
+Srihari-mcw <[email protected]>
+Srinivas Billa <[email protected]>
+Stefan Sydow <[email protected]>
+Steffen Röcker <[email protected]>
+Stephan Walter <[email protected]>
+Stephen Nichols <[email protected]>
+Steve Grubb <[email protected]>
+Steven Prichard <[email protected]>
+Steven Roussey <[email protected]>
+Steward Garcia <[email protected]>
+Suaj Carrot <[email protected]>
+SuperUserNameMan <[email protected]>
+Tai Duc Nguyen <[email protected]>
+Taikono-Himazin <[email protected]>
+Tameem <[email protected]>
+Tamotsu Takahashi <[email protected]>
+Thái Hoàng Tâm <[email protected]>
+Thatcher Chamberlin <[email protected]>
+Theia Vogel <[email protected]>
+Thérence <[email protected]>
+Thibault Terrasson <[email protected]>
+Thomas Klausner <[email protected]>
+Tim Miller <[email protected]>
+Timmy Knight <[email protected]>
+Timothy Cronin <[email protected]>
+Ting Lou <[email protected]>
+Ting Sun <[email protected]>
+Tobias Lütke <[email protected]>
+Tom C <[email protected]>
+Tom Jobbins <[email protected]>
+Tomas <[email protected]>
+Tomáš Pazdiora <[email protected]>
+Tristan Druyen <[email protected]>
+Tristan Ross <[email protected]>
+Tungsten842 <[email protected]>
+Tungsten842 <[email protected]>
+Tushar <[email protected]>
+UEXTM.com <[email protected]>
+Ulrich Drepper <[email protected]>
+Uzo Nweke <[email protected]>
+Vaibhav Srivastav <[email protected]>
+Val Kharitonov <[email protected]>
+Valentin Konovalov <[email protected]>
+Valentyn Bezshapkin <[email protected]>
+Victor Nogueira <[email protected]>
+Victor Z. Peng <[email protected]>
+Vlad <[email protected]>
+Vladimir <[email protected]>
+Vladimir Malyutin <[email protected]>
+Vladimir Zorin <[email protected]>
+Volodymyr Vitvitskyi <[email protected]>
+WangHaoranRobin <[email protected]>
+Weird Constructor <[email protected]>
+Welby Seely <[email protected]>
+Wentai Zhang <[email protected]>
+WillCorticesAI <[email protected]>
+William Tambellini <[email protected]>
+Willy Tarreau <[email protected]>
+Wouter <[email protected]>
+Wu Jian Ping <[email protected]>
+Wu Jian Ping <[email protected]>
+Xiake Sun <[email protected]>
+Xiang (Kevin) Li <[email protected]>
+Xiao-Yong Jin <[email protected]>
+XiaotaoChen <[email protected]>
+Xiaoyi Chen <[email protected]>
+Xingchen Song(宋星辰) <[email protected]>
+Xuan Son Nguyen <[email protected]>
+Yann Follet <[email protected]>
+Yaroslav <[email protected]>
+Yazan Agha-Schrader <[email protected]>
+Yiming Cui <[email protected]>
+Yishuo Wang <[email protected]>
+Yueh-Po Peng <[email protected]>
+Yui <[email protected]>
+Yusuf Kağan Hanoğlu <[email protected]>
+Yuval Peled <[email protected]>
+ZHAOKAI WANG <[email protected]>
+Zane Shannon <[email protected]>
+Zay <[email protected]>
+Zenix <[email protected]>
+Zhang Peiyuan <[email protected]>
+Zheng.Deng <[email protected]>
+ZhouYuChen <[email protected]>
+Ziad Ben Hadj-Alouane <[email protected]>
+Ziang Wu <[email protected]>
+Zsapi <[email protected]>
+a-n-n-a-l-e-e <[email protected]>
+adel boussaken <[email protected]>
+afrideva <[email protected]>
+agray3 <[email protected]>
+akawrykow <[email protected]>
+alexpinel <[email protected]>
+alonfaraj <[email protected]>
+alwqx <[email protected]>
+amd-lalithnc <[email protected]>
+andrijdavid <[email protected]>
+anon998 <[email protected]>
+anzz1 <[email protected]>
+apaz <[email protected]>
+apcameron <[email protected]>
+arch-btw <[email protected]>
+arcrank <[email protected]>
+arlo-phoenix <[email protected]>
+at8u <[email protected]>
+automaticcat <[email protected]>
+bandoti <[email protected]>
+beiller <[email protected]>
+bhubbb <[email protected]>
+bmwl <[email protected]>
+bobqianic <[email protected]>
+bryanSwk <[email protected]>
+bsilvereagle <[email protected]>
+bssrdf <[email protected]>
+byte-6174 <[email protected]>
+cebtenzzre <[email protected]>
+chaihahaha <[email protected]>
+chiranko <[email protected]>
+clibdev <[email protected]>
+clyang <[email protected]>
+cocktailpeanut <[email protected]>
+coezbek <[email protected]>
+comex <[email protected]>
+compilade <[email protected]>
+compilade <[email protected]>
+cpumaxx <[email protected]>
+crasm <[email protected]>
+crasm <[email protected]>
+daboe01 <[email protected]>
+david raistrick <[email protected]>
+ddh0 <[email protected]>
+ddpasa <[email protected]>
+deepdiffuser <[email protected]>
+divinity76 <[email protected]>
+dm4 <[email protected]>
+dotpy314 <[email protected]>
+drbh <[email protected]>
+ds5t5 <[email protected]>
+dylan <[email protected]>
+eastriver <[email protected]>
+ebraminio <[email protected]>
+eiery <[email protected]>
+eric8607242 <[email protected]>
+fairydreaming <[email protected]>
+fraxy-v <[email protected]>
+github-actions[bot] <github-actions[bot]@users.noreply.github.com>
+gliptic <[email protected]>
+goerch <[email protected]>
+grahameth <[email protected]>
+gwjr <[email protected]>
+h-h-h-h <[email protected]>
+hankcs <[email protected]>
+hoangmit <[email protected]>
+hongbo.mo <[email protected]>
+hopkins385 <[email protected]>
+howlger <[email protected]>
+howlger <[email protected]>
+hutli <[email protected]>
+hutli <[email protected]>
+hutli <[email protected]>
+hxer7963 <[email protected]>
+hydai <[email protected]>
+iSma <[email protected]>
+iacore <[email protected]>
+igarnier <[email protected]>
+intelmatt <[email protected]>
+iohub <[email protected]>
+jacobi petrucciani <[email protected]>
+jaime-m-p <[email protected]>
+jameswu2014 <[email protected]>
+jiez <[email protected]>
+jneem <[email protected]>
+joecryptotoo <[email protected]>
+johnson442 <[email protected]>
+jojorne <[email protected]>
+jon-chuang <[email protected]>
+jp-x-g <[email protected]>
+jukofyork <[email protected]>
+junchao-loongson <[email protected]>
+jwj7140 <[email protected]>
+k.h.lai <[email protected]>
+kaizau <[email protected]>
+kalomaze <[email protected]>
+kang <[email protected]>
+katsu560 <[email protected]>
+kchro3 <[email protected]>
+khimaros <[email protected]>
+kiltyj <[email protected]>
+klosax <[email protected]>
+kunal-vaishnavi <[email protected]>
+kunnis <[email protected]>
+kuronekosaiko <[email protected]>
+kuvaus <[email protected]>
+kwin1412 <[email protected]>
+l3utterfly <[email protected]>
+ldwang <[email protected]>
+le.chang <[email protected]>
+leejet <[email protected]>
+limitedAtonement <[email protected]>
+liuwei-git <[email protected]>
+lon <[email protected]>
+loonerin <[email protected]>
+luoyu-intel <[email protected]>
+m3ndax <[email protected]>
+maddes8cht <[email protected]>
+makomk <[email protected]>
+manikbhandari <[email protected]>
+maor-ps <[email protected]>
+mdrokz <[email protected]>
+mgroeber9110 <[email protected]>
+minarchist <[email protected]>
+mj-shifu <[email protected]>
+mmyjona <[email protected]>
+momonga <[email protected]>
+moritzbrantner <[email protected]>
+mzcu <[email protected]>
+nanahi <[email protected]>
+ngc92 <[email protected]>
+nhamanasu <[email protected]>
+niansa/tuxifan <[email protected]>
+niansa/tuxifan <[email protected]>
+nickp27 <[email protected]>
+ningshanwutuobang <[email protected]>
+nold <[email protected]>
+nopperl <[email protected]>
+nusu-github <[email protected]>
+olexiyb <[email protected]>
+omahs <[email protected]>
+oobabooga <[email protected]>
+opparco <[email protected]>
+ostix360 <[email protected]>
+pengxin99 <[email protected]>
+perserk <[email protected]>
+pmysl <[email protected]>
+postmasters <[email protected]>
+pudepiedj <[email protected]>
+qingfengfenga <[email protected]>
+qouoq <[email protected]>
+qunash <[email protected]>
+rabidcopy <[email protected]>
+rankaiyx <[email protected]>
+rhjdvsgsgks <[email protected]>
+rhuddleston <[email protected]>
+rimoliga <[email protected]>
+runfuture <[email protected]>
+sandyiscool <[email protected]>
+sasha0552 <[email protected]>
+semidark <[email protected]>
+sharpHL <[email protected]>
+shibe2 <[email protected]>
+singularity <[email protected]>
+sjinzh <[email protected]>
+sjxx <[email protected]>
+slaren <[email protected]>
+slaren <[email protected]>
+snadampal <[email protected]>
+staviq <[email protected]>
+stduhpf <[email protected]>
+strawberrymelonpanda <[email protected]>
+swittk <[email protected]>
+takov751 <[email protected]>
+tarcey <[email protected]>
+texmex76 <[email protected]>
+thement <[email protected]>
+tjohnman <[email protected]>
+tslmy <[email protected]>
+ubik2 <[email protected]>
+uint256_t <[email protected]>
+uint256_t <[email protected]>
+unbounded <[email protected]>
+valiray <[email protected]>
+vik <[email protected]>
+viric <[email protected]>
+vodkaslime <[email protected]>
+vvhg1 <[email protected]>
+vxiiduu <[email protected]>
+wbpxre150 <[email protected]>
+whoreson <[email protected]>
+woachk <[email protected]>
+wonjun Jang <[email protected]>
+woodx <[email protected]>
+wzy <[email protected]>
+xaedes <[email protected]>
+xaedes <[email protected]>
+xloem <[email protected]>
+yangli2 <[email protected]>
+yuiseki <[email protected]>
+zakkor <[email protected]>
+zhangkaihuo <[email protected]>
+zhouwg <[email protected]>
+zhouwg <[email protected]>
+zrm <[email protected]>
+Ștefan-Gabriel Muscalu <[email protected]>
+源文雨 <[email protected]>
+Нияз Гарифзянов <[email protected]>
CMakeLists.txt
ADDED
@@ -0,0 +1,216 @@
1 |
+
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
2 |
+
project("llama.cpp" C CXX)
|
3 |
+
include(CheckIncludeFileCXX)
|
4 |
+
|
5 |
+
#set(CMAKE_WARN_DEPRECATED YES)
|
6 |
+
set(CMAKE_WARN_UNUSED_CLI YES)
|
7 |
+
|
8 |
+
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
9 |
+
|
10 |
+
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
11 |
+
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
12 |
+
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
13 |
+
endif()
|
14 |
+
|
15 |
+
# Add path to modules
|
16 |
+
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
17 |
+
|
18 |
+
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
19 |
+
|
20 |
+
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
21 |
+
set(LLAMA_STANDALONE ON)
|
22 |
+
|
23 |
+
include(git-vars)
|
24 |
+
|
25 |
+
# configure project version
|
26 |
+
# TODO
|
27 |
+
else()
|
28 |
+
set(LLAMA_STANDALONE OFF)
|
29 |
+
endif()
|
30 |
+
|
31 |
+
if (EMSCRIPTEN)
|
32 |
+
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
33 |
+
|
34 |
+
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
|
35 |
+
else()
|
36 |
+
if (MINGW)
|
37 |
+
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
38 |
+
else()
|
39 |
+
set(BUILD_SHARED_LIBS_DEFAULT ON)
|
40 |
+
endif()
|
41 |
+
endif()
|
42 |
+
|
43 |
+
option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
44 |
+
|
45 |
+
if (WIN32)
|
46 |
+
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
|
47 |
+
endif()
|
48 |
+
|
49 |
+
#
|
50 |
+
# option list
|
51 |
+
#
|
52 |
+
|
53 |
+
# debug
|
54 |
+
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
55 |
+
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
|
56 |
+
|
57 |
+
# build
|
58 |
+
option(LLAMA_FATAL_WARNINGS "llama: enable -Werror flag" OFF)
|
59 |
+
|
60 |
+
# sanitizers
|
61 |
+
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
|
62 |
+
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
|
63 |
+
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
|
64 |
+
|
65 |
+
# utils
|
66 |
+
option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
|
67 |
+
|
68 |
+
# extra artifacts
|
69 |
+
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
70 |
+
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
71 |
+
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
|
72 |
+
|
73 |
+
# 3rd party libs
|
74 |
+
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
|
75 |
+
|
76 |
+
# Required for relocatable CMake package
|
77 |
+
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
78 |
+
|
79 |
+
# override ggml options
|
80 |
+
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
|
81 |
+
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
|
82 |
+
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
|
83 |
+
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
|
84 |
+
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
|
85 |
+
|
86 |
+
# change the default for these ggml options
|
87 |
+
if (NOT DEFINED GGML_LLAMAFILE)
|
88 |
+
set(GGML_LLAMAFILE_DEFAULT ON)
|
89 |
+
endif()
|
90 |
+
|
91 |
+
if (NOT DEFINED GGML_AMX)
|
92 |
+
set(GGML_AMX ON)
|
93 |
+
endif()
|
94 |
+
|
95 |
+
if (NOT DEFINED GGML_CUDA_GRAPHS)
|
96 |
+
set(GGML_CUDA_GRAPHS_DEFAULT ON)
|
97 |
+
endif()
|
98 |
+
|
99 |
+
# transition helpers
|
100 |
+
function (llama_option_depr TYPE OLD NEW)
|
101 |
+
if (${OLD})
|
102 |
+
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
|
103 |
+
set(${NEW} ON PARENT_SCOPE)
|
104 |
+
endif()
|
105 |
+
endfunction()
|
106 |
+
|
107 |
+
llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA)
|
108 |
+
llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA)
|
109 |
+
llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE)
|
110 |
+
llama_option_depr(WARNING LLAMA_METAL GGML_METAL)
|
111 |
+
llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
|
112 |
+
llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE)
|
113 |
+
llama_option_depr(WARNING LLAMA_RPC GGML_RPC)
|
114 |
+
llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL)
|
115 |
+
llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
|
116 |
+
llama_option_depr(WARNING LLAMA_CANN GGML_CANN)
|
117 |
+
|
118 |
+
#
|
119 |
+
# build the library
|
120 |
+
#
|
121 |
+
|
122 |
+
if (NOT TARGET ggml)
|
123 |
+
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
+add_subdirectory(src)
+
+#
+# install
+#
+
+include(GNUInstallDirs)
+include(CMakePackageConfigHelpers)
+
+set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
+set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
+set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
+
+set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
+set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
+set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
+
+
+# At the moment some compile definitions are placed within the ggml/src
+# directory but not exported on the `ggml` target. This could be improved by
+# determining _precisely_ which defines are necessary for the llama-config
+# package.
+#
+set(GGML_TRANSIENT_DEFINES)
+get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
+get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
+if (GGML_DIR_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_DIR_DEFINES})
+endif()
+get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
+if (GGML_TARGET_DEFINES)
+    list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
+endif()
+get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
+
+set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+install(TARGETS llama LIBRARY PUBLIC_HEADER)
+
+configure_package_config_file(
+        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/llama-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama
+    PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
+              LLAMA_LIB_INSTALL_DIR
+              LLAMA_BIN_INSTALL_DIR )
+
+write_basic_package_version_file(
+    ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
+    VERSION ${LLAMA_INSTALL_VERSION}
+    COMPATIBILITY SameMajorVersion)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/llama-config.cmake
+              ${CMAKE_CURRENT_BINARY_DIR}/llama-version.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/llama)
+
+install(
+    FILES convert_hf_to_gguf.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+configure_file(cmake/llama.pc.in
+        "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+        @ONLY)
+
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+        DESTINATION lib/pkgconfig)
+
+#
+# utils, programs, examples and tests
+#
+
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+    include(CTest)
+    add_subdirectory(tests)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+    add_subdirectory(pocs)
+endif()
CMakePresets.json
ADDED
@@ -0,0 +1,81 @@
+{
+    "version": 4,
+    "configurePresets": [
+        {
+            "name": "base",
+            "hidden": true,
+            "generator": "Ninja",
+            "binaryDir": "${sourceDir}/build-${presetName}",
+            "cacheVariables": {
+                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+                "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+            }
+        },
+        {
+            "name": "sycl-base",
+            "hidden": true,
+            "generator": "Ninja",
+            "binaryDir": "${sourceDir}/build-${presetName}",
+            "cacheVariables": {
+                "CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
+                "CMAKE_CXX_COMPILER": "icx",
+                "CMAKE_C_COMPILER": "cl",
+                "GGML_SYCL": "ON",
+                "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
+            }
+        },
+        { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
+        { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
+        { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
+        { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
+        { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+
+        {
+            "name": "arm64-windows-msvc", "hidden": true,
+            "architecture": { "value": "arm64", "strategy": "external" },
+            "toolset": { "value": "host=x64", "strategy": "external" },
+            "cacheVariables": {
+                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
+            }
+        },
+
+        {
+            "name": "arm64-windows-llvm", "hidden": true,
+            "architecture": { "value": "arm64", "strategy": "external" },
+            "toolset": { "value": "host=x64", "strategy": "external" },
+            "cacheVariables": {
+                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
+            }
+        },
+
+        {
+            "name": "arm64-apple-clang", "hidden": true,
+            "architecture": { "value": "arm64", "strategy": "external" },
+            "toolset": { "value": "host=x64", "strategy": "external" },
+            "cacheVariables": {
+                "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
+            }
+        },
+
+        { "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+        { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
+        { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
+
+        { "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] },
+        { "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
+        { "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
+
+        { "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
+        { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
+        { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
+
+        { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
+        { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
+        { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
+
+        { "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+        { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
+        { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
+        { "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
+    ]
+}
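
Note: configure presets using schema version 4 require CMake 3.23 or newer, and the hidden entries above exist only to be combined through "inherits". A quick invocation sketch, using the build-directory pattern ${sourceDir}/build-${presetName} defined in the "base" preset:

    cmake --list-presets                           # show the non-hidden presets
    cmake --preset x64-windows-sycl-release        # configure into build-x64-windows-sycl-release
    cmake --build build-x64-windows-sycl-release   # build that tree
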
CONTRIBUTING.md
ADDED
@@ -0,0 +1,33 @@
+# Pull requests (for contributors)
+
+- Test your changes:
+    - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
+    - Execute [the full CI locally on your machine](ci/README.md) before publishing
+- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
+- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
+- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
+
+# Pull requests (for collaborators)
+
+- Squash-merge PRs
+- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
+- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+
+# Coding guidelines
+
+- Avoid adding third-party dependencies, extra files, extra headers, etc.
+- Always consider cross-compatibility with other operating systems and architectures
+- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
+- Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+
+![matmul](media/matmul.png)
+
+# Resources
+
+The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
+
+https://github.com/ggerganov/llama.cpp/projects
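
Note: worked out from the formula in the guidelines above, the matmul convention becomes concrete with shapes. Take $A \in \mathbb{R}^{n \times k}$ and $B \in \mathbb{R}^{m \times k}$, both stored with $k$ columns (dimension 0 of size $k$). Then $C = B A^T$ is an $m \times n$ matrix with $C_{ij} = \sum_{l=1}^{k} B_{il} A_{jl}$: every output element is the dot product of a row of $B$ with a row of $A$, which is why both operands of `ggml_mul_mat` must agree in dimension 0.
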
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023-2024 The ggml authors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
Makefile
ADDED
@@ -0,0 +1,1702 @@
+# Define the default target now so that it is always the first target
+BUILD_TARGETS = \
+	libllava.a \
+	llama-batched \
+	llama-batched-bench \
+	llama-bench \
+	llama-cli \
+	llama-convert-llama2c-to-ggml \
+	llama-embedding \
+	llama-eval-callback \
+	llama-export-lora \
+	llama-gbnf-validator \
+	llama-gguf \
+	llama-gguf-hash \
+	llama-gguf-split \
+	llama-gritlm \
+	llama-imatrix \
+	llama-infill \
+	llama-llava-cli \
+	llama-minicpmv-cli\
+	llama-lookahead \
+	llama-lookup \
+	llama-lookup-create \
+	llama-lookup-merge \
+	llama-lookup-stats \
+	llama-parallel \
+	llama-passkey \
+	llama-perplexity \
+	llama-q8dot \
+	llama-quantize \
+	llama-quantize-stats \
+	llama-retrieval \
+	llama-save-load-state \
+	llama-server \
+	llama-simple \
+	llama-simple-chat \
+	llama-speculative \
+	llama-tokenize \
+	llama-vdot \
+	llama-cvector-generator \
+	llama-gen-docs \
+	tests/test-c.o
+
+# Binaries only useful for tests
+TEST_TARGETS = \
+	tests/test-arg-parser \
+	tests/test-autorelease \
+	tests/test-backend-ops \
+	tests/test-chat-template \
+	tests/test-double-float \
+	tests/test-grad0 \
+	tests/test-grammar-integration \
+	tests/test-grammar-parser \
+	tests/test-json-schema-to-grammar \
+	tests/test-llama-grammar \
+	tests/test-log \
+	tests/test-model-load-cancel \
+	tests/test-quantize-fns \
+	tests/test-quantize-perf \
+	tests/test-rope \
+	tests/test-sampling \
+	tests/test-tokenizer-0 \
+	tests/test-tokenizer-1-bpe \
+	tests/test-tokenizer-1-spm
+#	tests/test-opt \
+
+# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
+	retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
+
+# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
+# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server
+
+# Deprecation aliases
+ifdef LLAMA_CUBLAS
+$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
+endif
+
+ifdef LLAMA_CUDA
+GGML_CUDA := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_KOMPUTE
+GGML_KOMPUTE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_METAL
+GGML_METAL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_RPC
+GGML_RPC := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_SYCL
+GGML_SYCL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_SYCL_F16
+GGML_SYCL_F16 := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENBLAS
+GGML_OPENBLAS := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_OPENBLAS64
+GGML_OPENBLAS64 := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_BLIS
+GGML_BLIS := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_LLAMAFILE
+GGML_NO_LLAMAFILE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_ACCELERATE
+GGML_NO_ACCELERATE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_OPENMP
+GGML_NO_OPENMP := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_NO_METAL
+GGML_NO_METAL := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifdef LLAMA_DISABLE_LOGS
+REMOVE_WARNING := 1
+endif
+
+ifdef LLAMA_SERVER_VERBOSE
+REMOVE_WARNING := 1
+endif
+
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC  := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC  := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+    ifndef GGML_NO_METAL
+        GGML_METAL := 1
+    endif
+
+    GGML_NO_OPENMP := 1
+
+    ifneq ($(UNAME_P),arm)
+        SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+        ifeq ($(SYSCTL_M),1)
+            # UNAME_P := arm
+            # UNAME_M := arm64
+            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+        endif
+    endif
+endif
+
+ifdef GGML_METAL
+    GGML_METAL_EMBED_LIBRARY := 1
+endif
+
+ifdef GGML_RPC
+    BUILD_TARGETS += rpc-server
+endif
+
+ifdef GGML_VULKAN
+    BUILD_TARGETS += vulkan-shaders-gen
+endif
+
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
+
+test: $(TEST_TARGETS)
+	@failures=0; \
+	for test_target in $(TEST_TARGETS); do \
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
+			continue; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
+			continue; \
+		else \
+			echo "Running test $$test_target..."; \
+			./$$test_target; \
+		fi; \
+		if [ $$? -ne 0 ]; then \
+			printf 'Test %s FAILED!\n\n' $$test_target; \
+			failures=$$(( failures + 1 )); \
+		else \
+			printf 'Test %s passed.\n\n' $$test_target; \
+		fi; \
+	done; \
+	if [ $$failures -gt 0 ]; then \
+		printf '\n%s tests failed.\n' $$failures; \
+		exit 1; \
+	fi
+	@echo 'All tests passed.'
+
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
+
+ifdef RISCV_CROSS_COMPILE
+CC  := riscv64-unknown-linux-gnu-gcc
+CXX := riscv64-unknown-linux-gnu-g++
+endif
+
+#
+# Compile flags
+#
+
+# keep standard at C11 and C++11
+MK_CPPFLAGS  = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
+MK_CFLAGS    = -std=c11 -fPIC
+MK_CXXFLAGS  = -std=c++11 -fPIC
+MK_NVCCFLAGS = -std=c++11
+
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
+CCACHE := $(shell which ccache)
+ifdef CCACHE
+export CCACHE_SLOPPINESS = time_macros
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
+CC  := $(CCACHE) $(CC)
+CXX := $(CCACHE) $(CXX)
+else
+$(info I ccache not found. Consider installing it for faster compilation.)
+endif # CCACHE
+endif # GGML_NO_CCACHE
+
+# clock_gettime came in POSIX.1b (1993)
+# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
+# posix_memalign came in POSIX.1-2001 / SUSv3
+# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
+MK_CPPFLAGS += -D_XOPEN_SOURCE=600
+
+# Somehow in OpenBSD whenever POSIX conformance is specified
+# some string functions rely on locale_t availability,
+# which was introduced in POSIX.1-2008, forcing us to go higher
+ifeq ($(UNAME_S),OpenBSD)
+    MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
+endif
+
+# Data types, macros and functions related to controlling CPU affinity and
+# some memory allocation are available on Linux through GNU extensions in libc
+ifeq ($(UNAME_S),Linux)
+    MK_CPPFLAGS += -D_GNU_SOURCE
+endif
+
+# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
+# and on macOS its availability depends on enabling Darwin extensions
+# similarly on DragonFly, enabling BSD extensions is necessary
+ifeq ($(UNAME_S),Darwin)
+    MK_CPPFLAGS += -D_DARWIN_C_SOURCE
+endif
+ifeq ($(UNAME_S),DragonFly)
+    MK_CPPFLAGS += -D__BSD_VISIBLE
+endif
+
+# alloca is a non-standard interface that is not visible on BSDs when
+# POSIX conformance is specified, but not all of them provide a clean way
+# to enable it in such cases
+ifeq ($(UNAME_S),FreeBSD)
+    MK_CPPFLAGS += -D__BSD_VISIBLE
+endif
+ifeq ($(UNAME_S),NetBSD)
+    MK_CPPFLAGS += -D_NETBSD_SOURCE
+endif
+ifeq ($(UNAME_S),OpenBSD)
+    MK_CPPFLAGS += -D_BSD_SOURCE
+endif
+
+ifdef GGML_SCHED_MAX_COPIES
+    MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(GGML_SCHED_MAX_COPIES)
+endif
+
+ifdef LLAMA_DEBUG
+    MK_CFLAGS    += -O0 -g
+    MK_CXXFLAGS  += -O0 -g
+    MK_LDFLAGS   += -g
+    MK_NVCCFLAGS += -O0 -g
+
+    ifeq ($(UNAME_S),Linux)
+        MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
+    endif
+else
+    MK_CPPFLAGS  += -DNDEBUG
+    MK_CFLAGS    += -O3 -g
+    MK_CXXFLAGS  += -O3 -g
+    MK_NVCCFLAGS += -O3 -g
+endif
+
+ifdef LLAMA_SANITIZE_THREAD
+    MK_CFLAGS   += -fsanitize=thread -g
+    MK_CXXFLAGS += -fsanitize=thread -g
+    MK_LDFLAGS  += -fsanitize=thread -g
+endif
+
+ifdef LLAMA_SANITIZE_ADDRESS
+    MK_CFLAGS   += -fsanitize=address -fno-omit-frame-pointer -g
+    MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g
+    MK_LDFLAGS  += -fsanitize=address -fno-omit-frame-pointer -g
+endif
+
+ifdef LLAMA_SANITIZE_UNDEFINED
+    MK_CFLAGS   += -fsanitize=undefined -g
+    MK_CXXFLAGS += -fsanitize=undefined -g
+    MK_LDFLAGS  += -fsanitize=undefined -g
+endif
+
+ifdef LLAMA_SERVER_SSL
+    MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+    MK_LDFLAGS  += -lssl -lcrypto
+endif
+
+# warnings
+WARN_FLAGS = \
+	-Wall \
+	-Wextra \
+	-Wpedantic \
+	-Wcast-qual \
+	-Wno-unused-function
+
+MK_CFLAGS += \
+	$(WARN_FLAGS) \
+	-Wshadow \
+	-Wstrict-prototypes \
+	-Wpointer-arith \
+	-Wmissing-prototypes \
+	-Werror=implicit-int \
+	-Werror=implicit-function-declaration
+
+MK_CXXFLAGS += \
+	$(WARN_FLAGS) \
+	-Wmissing-declarations \
+	-Wmissing-noreturn
+
+ifeq ($(LLAMA_FATAL_WARNINGS),1)
+    MK_CFLAGS   += -Werror
+    MK_CXXFLAGS += -Werror
+endif
+
+# this version of Apple ld64 is buggy
+ifneq '' '$(findstring dyld-1015.7,$(shell $(CC) $(LDFLAGS) -Wl,-v 2>&1))'
+    MK_CPPFLAGS += -DHAVE_BUGGY_APPLE_LINKER
+endif
+
+# OS specific
+# TODO: support Windows
+ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
+    MK_CFLAGS   += -pthread
+    MK_CXXFLAGS += -pthread
+endif
+
+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+    _WIN32 := 1
+endif
+
+# library name prefix
+ifneq ($(_WIN32),1)
+    LIB_PRE := lib
+endif
+
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+    DSO_EXT := .so
+else
+    DSO_EXT := .dll
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+    LWINSOCK2 := -lws2_32
+endif
+
+ifdef LLAMA_GPROF
+    MK_CFLAGS   += -pg
+    MK_CXXFLAGS += -pg
+endif
+
+# Architecture specific
+# TODO: probably these flags need to be tweaked on some architectures
+#       feel free to update the Makefile for your architecture and send a pull request or issue
+
+ifndef RISCV_CROSS_COMPILE
+
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
+    # Use all CPU extensions that are available:
+    MK_CFLAGS     += -march=native -mtune=native
+    HOST_CXXFLAGS += -march=native -mtune=native
+
+    # Usage AVX-only
+    #MK_CFLAGS   += -mfma -mf16c -mavx
+    #MK_CXXFLAGS += -mfma -mf16c -mavx
+
+    # Usage SSSE3-only (Not is SSE3!)
+    #MK_CFLAGS   += -mssse3
+    #MK_CXXFLAGS += -mssse3
+endif
+
+ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
+    # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
+    # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
+    # https://github.com/ggerganov/llama.cpp/issues/2922
+    MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
+    MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
+
+    # Target Windows 8 for PrefetchVirtualMemory
+    MK_CPPFLAGS += -D_WIN32_WINNT=0x602
+endif
+
+ifneq ($(filter aarch64%,$(UNAME_M)),)
+    # Apple M1, M2, etc.
+    # Raspberry Pi 3, 4, Zero 2 (64-bit)
+    # Nvidia Jetson
+    MK_CFLAGS   += -mcpu=native
+    MK_CXXFLAGS += -mcpu=native
+    JETSON_RELEASE_INFO = $(shell jetson_release)
+    ifdef JETSON_RELEASE_INFO
+        ifneq ($(filter TX2%,$(JETSON_RELEASE_INFO)),)
+            JETSON_EOL_MODULE_DETECT = 1
+            CC = aarch64-unknown-linux-gnu-gcc
+            cxx = aarch64-unknown-linux-gnu-g++
+        endif
+    endif
+endif
+
+ifneq ($(filter armv6%,$(UNAME_M)),)
+    # Raspberry Pi 1, Zero
+    MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+    MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+
+ifneq ($(filter armv7%,$(UNAME_M)),)
+    # Raspberry Pi 2
+    MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+    MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+
+ifneq ($(filter armv8%,$(UNAME_M)),)
+    # Raspberry Pi 3, 4, Zero 2 (32-bit)
+    MK_CFLAGS   += -mfp16-format=ieee -mno-unaligned-access
+    MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
+ifneq ($(filter ppc64%,$(UNAME_M)),)
+    POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
+    ifneq (,$(findstring POWER9,$(POWER9_M)))
+        MK_CFLAGS   += -mcpu=power9
+        MK_CXXFLAGS += -mcpu=power9
+    endif
+endif
+
+ifneq ($(filter ppc64le%,$(UNAME_M)),)
+    MK_CFLAGS   += -mcpu=powerpc64le
+    MK_CXXFLAGS += -mcpu=powerpc64le
+    CUDA_POWER_ARCH = 1
+endif
+
+ifneq ($(filter loongarch64%,$(UNAME_M)),)
+    MK_CFLAGS   += -mlasx
+    MK_CXXFLAGS += -mlasx
+endif
+
+ifneq ($(filter riscv64%,$(UNAME_M)),)
+    MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
+    MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+
+else # RISC-V CROSS COMPILATION
+    MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
+    MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
+endif
+
+ifndef GGML_NO_ACCELERATE
+    # Mac OS - include Accelerate framework.
+    # `-framework Accelerate` works both with Apple Silicon and Mac Intel
+    ifeq ($(UNAME_S),Darwin)
+        MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
+        MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
+        MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
+        MK_LDFLAGS  += -framework Accelerate
+        OBJ_GGML    += ggml/src/ggml-blas.o
+    endif
+endif # GGML_NO_ACCELERATE
+
+ifdef GGML_MUSA
+    CC  := clang
+    CXX := clang++
+    GGML_CUDA := 1
+    MK_CPPFLAGS += -DGGML_USE_MUSA
+endif
+
+ifndef GGML_NO_OPENMP
+    MK_CPPFLAGS += -DGGML_USE_OPENMP
+    MK_CFLAGS   += -fopenmp
+    MK_CXXFLAGS += -fopenmp
+    ifdef GGML_MUSA
+        MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+        MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
+    endif # GGML_MUSA
+endif # GGML_NO_OPENMP
+
+ifdef GGML_OPENBLAS
+    MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
+    MK_LDFLAGS  += $(shell pkg-config --libs openblas)
+    OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_OPENBLAS
+
+ifdef GGML_OPENBLAS64
+    MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
+    MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
+    MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
+    OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_OPENBLAS64
+
+ifdef GGML_BLIS
+    MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
+    MK_LDFLAGS  += -lblis -L/usr/local/lib
+    OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_BLIS
+
+ifdef GGML_NVPL
+    MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
+    MK_LDFLAGS  += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
+    OBJ_GGML    += ggml/src/ggml-blas.o
+endif # GGML_NVPL
+
+ifndef GGML_NO_LLAMAFILE
+    MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
+    OBJ_GGML    += ggml/src/llamafile/sgemm.o
+endif
+
+ifndef GGML_NO_AMX
+    MK_CPPFLAGS += -DGGML_USE_AMX
+    OBJ_GGML    += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+endif
+
+ifdef GGML_RPC
+    MK_CPPFLAGS += -DGGML_USE_RPC
+    OBJ_GGML    += ggml/src/ggml-rpc.o
+endif # GGML_RPC
+
+OBJ_CUDA_TMPL  = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
+OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/mmq*.cu))
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+    OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*.cu))
+else
+    OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+    OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+    OBJ_CUDA_TMPL += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+ifdef GGML_CUDA
+    ifdef GGML_MUSA
+        ifneq ('', '$(wildcard /opt/musa)')
+            CUDA_PATH ?= /opt/musa
+        else
+            CUDA_PATH ?= /usr/local/musa
+        endif
+
+        MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
+        MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
+        MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
+    else
+        ifneq ('', '$(wildcard /opt/cuda)')
+            CUDA_PATH ?= /opt/cuda
+        else
+            CUDA_PATH ?= /usr/local/cuda
+        endif
+
+        MK_CPPFLAGS  += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+        MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+        MK_NVCCFLAGS += -use_fast_math
+    endif # GGML_MUSA
+
+    OBJ_GGML += ggml/src/ggml-cuda.o
+    OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+    OBJ_GGML += $(OBJ_CUDA_TMPL)
+
+ifdef LLAMA_FATAL_WARNINGS
+    MK_NVCCFLAGS += -Werror all-warnings
+endif # LLAMA_FATAL_WARNINGS
+
+ifndef GGML_MUSA
+ifndef JETSON_EOL_MODULE_DETECT
+    MK_NVCCFLAGS += --forward-unknown-to-host-compiler
+endif # JETSON_EOL_MODULE_DETECT
+endif # GGML_MUSA
+
+ifdef LLAMA_DEBUG
+    MK_NVCCFLAGS += -lineinfo
+endif # LLAMA_DEBUG
+
+ifdef GGML_CUDA_DEBUG
+    MK_NVCCFLAGS += --device-debug
+endif # GGML_CUDA_DEBUG
+
+ifdef GGML_CUDA_NVCC
+    NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
+else
+    ifdef GGML_MUSA
+        NVCC = $(CCACHE) mcc
+    else
+        NVCC = $(CCACHE) nvcc
+    endif # GGML_MUSA
+endif # GGML_CUDA_NVCC
+
+ifdef CUDA_DOCKER_ARCH
+    MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else ifndef CUDA_POWER_ARCH
+    MK_NVCCFLAGS += -arch=native
+endif # CUDA_DOCKER_ARCH
+
+ifdef GGML_CUDA_FORCE_DMMV
+    MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # GGML_CUDA_FORCE_DMMV
+
+ifdef GGML_CUDA_FORCE_MMQ
+    MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+    MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_DMMV_X
+    MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
+else
+    MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
+endif # GGML_CUDA_DMMV_X
+
+ifdef GGML_CUDA_MMV_Y
+    MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
+else ifdef GGML_CUDA_DMMV_Y
+    MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
+else
+    MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
+endif # GGML_CUDA_MMV_Y
+
+ifdef GGML_CUDA_F16
+    MK_NVCCFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_F16
+
+ifdef GGML_CUDA_DMMV_F16
+    MK_NVCCFLAGS += -DGGML_CUDA_F16
+endif # GGML_CUDA_DMMV_F16
+
+ifdef GGML_CUDA_KQUANTS_ITER
+    MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
+else
+    MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+endif
+
+ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
+    MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
+else
+    MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
+endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+ifdef GGML_CUDA_NO_PEER_COPY
+    MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+ifdef GGML_CUDA_CCBIN
+    MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
+endif # GGML_CUDA_CCBIN
+
+ifdef GGML_CUDA_FA_ALL_QUANTS
+    MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # GGML_CUDA_FA_ALL_QUANTS
+
+ifdef JETSON_EOL_MODULE_DETECT
+define NVCC_COMPILE
+	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
+else
+    ifdef GGML_MUSA
+define NVCC_COMPILE
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
+endef # NVCC_COMPILE
+    else
+define NVCC_COMPILE
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+endef # NVCC_COMPILE
+    endif # GGML_MUSA
+endif # JETSON_EOL_MODULE_DETECT
+
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
+	$(NVCC_COMPILE)
+
+ggml/src/ggml-cuda.o: \
+	ggml/src/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
+	$(NVCC_COMPILE)
+endif # GGML_CUDA
+
+ifdef GGML_VULKAN
+    MK_CPPFLAGS += -DGGML_USE_VULKAN
+    MK_LDFLAGS  += $(shell pkg-config --libs vulkan)
+    OBJ_GGML    += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
+
+ifdef GGML_VULKAN_CHECK_RESULTS
+    MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
+endif
+
+ifdef GGML_VULKAN_DEBUG
+    MK_CPPFLAGS += -DGGML_VULKAN_DEBUG
+endif
+
+ifdef GGML_VULKAN_MEMORY_DEBUG
+    MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
+endif
+
+ifdef GGML_VULKAN_PERF
+    MK_CPPFLAGS += -DGGML_VULKAN_PERF
+endif
+
+ifdef GGML_VULKAN_VALIDATE
+    MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
+endif
+
+ifdef GGML_VULKAN_RUN_TESTS
+    MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
+endif
+
+GLSLC_CMD = glslc
+_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
+_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
+_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
+_ggml_vk_input_dir = ggml/src/vulkan-shaders
+_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
+
+ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
+	$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
+
+$(_ggml_vk_header): $(_ggml_vk_source)
+
+$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
+	$(_ggml_vk_genshaders_cmd) \
+		--glslc      $(GLSLC_CMD) \
+		--input-dir  $(_ggml_vk_input_dir) \
+		--target-hpp $(_ggml_vk_header) \
+		--target-cpp $(_ggml_vk_source)
+
+vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+
+endif # GGML_VULKAN
+
+ifdef GGML_HIPBLAS
+    ifeq ($(wildcard /opt/rocm),)
+        ROCM_PATH      ?= /usr
+        AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
+    else
+        ROCM_PATH      ?= /opt/rocm
+        AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
+    endif
+
+    GGML_CUDA_DMMV_X       ?= 32
+    GGML_CUDA_MMV_Y        ?= 1
+    GGML_CUDA_KQUANTS_ITER ?= 2
+
+    MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
+
+ifdef GGML_HIP_UMA
+    MK_CPPFLAGS += -DGGML_HIP_UMA
+endif # GGML_HIP_UMA
+
+    MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+    MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
+    MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
+
+    HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
+
+    HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
+    HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
+    HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
+    HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
+
+ifdef GGML_CUDA_FORCE_DMMV
+    HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # GGML_CUDA_FORCE_DMMV
+
+ifdef GGML_CUDA_FORCE_MMQ
+    HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
+endif # GGML_CUDA_FORCE_MMQ
+
+ifdef GGML_CUDA_FORCE_CUBLAS
+    HIPFLAGS += -DGGML_CUDA_FORCE_CUBLAS
+endif # GGML_CUDA_FORCE_CUBLAS
+
+ifdef GGML_CUDA_NO_PEER_COPY
+    HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
+endif # GGML_CUDA_NO_PEER_COPY
+
+    OBJ_GGML += ggml/src/ggml-cuda.o
+    OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+    OBJ_GGML += $(OBJ_CUDA_TMPL)
+
+ggml/src/ggml-cuda.o: \
+	ggml/src/ggml-cuda.cu \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h \
+	ggml/src/ggml-backend-impl.h \
+	ggml/src/ggml-common.h \
+	$(wildcard ggml/src/ggml-cuda/*.cuh)
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+
+ggml/src/ggml-cuda/%.o: \
+	ggml/src/ggml-cuda/%.cu \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h \
+	ggml/src/ggml-cuda/common.cuh
+	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
+endif # GGML_HIPBLAS
+
+ifdef GGML_METAL
+    MK_CPPFLAGS += -DGGML_USE_METAL
+    MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
+    OBJ_GGML    += ggml/src/ggml-metal.o
+ifdef GGML_METAL_NDEBUG
+    MK_CPPFLAGS += -DGGML_METAL_NDEBUG
+endif
+ifdef GGML_METAL_EMBED_LIBRARY
+    MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
+    OBJ_GGML    += ggml/src/ggml-metal-embed.o
+endif
+endif # GGML_METAL
+
+ifdef GGML_METAL
+ggml/src/ggml-metal.o: \
+	ggml/src/ggml-metal.m \
+	ggml/include/ggml-metal.h \
+	ggml/include/ggml.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+ifdef GGML_METAL_EMBED_LIBRARY
+ggml/src/ggml-metal-embed.o: \
+	ggml/src/ggml-metal.metal \
+	ggml/src/ggml-common.h
+	@echo "Embedding Metal library"
+	@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
+	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
+	@echo ".section __DATA, __ggml_metallib"             > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_start"                 >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_start:"                       >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_end"                   >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_end:"                         >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
+	@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
+	@rmdir ${TEMP_ASSEMBLY}
+endif
+endif # GGML_METAL
+
+OBJ_GGML += \
+	ggml/src/ggml.o \
+	ggml/src/ggml-cpu.o \
+	ggml/src/ggml-alloc.o \
+	ggml/src/ggml-backend.o \
+	ggml/src/ggml-quants.o \
+	ggml/src/ggml-aarch64.o
+
+OBJ_LLAMA = \
+	src/llama.o \
+	src/llama-vocab.o \
+	src/llama-grammar.o \
+	src/llama-sampling.o \
+	src/unicode.o \
+	src/unicode-data.o
+
+OBJ_COMMON = \
+	common/common.o \
+	common/arg.o \
+	common/log.o \
+	common/console.o \
+	common/ngram-cache.o \
+	common/sampling.o \
+	common/build-info.o \
+	common/json-schema-to-grammar.o
+
+OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
+
+LIB_GGML   = $(LIB_PRE)ggml$(DSO_EXT)
+LIB_GGML_S = $(LIB_PRE)ggml.a
+
+LIB_LLAMA   = $(LIB_PRE)llama$(DSO_EXT)
+LIB_LLAMA_S = $(LIB_PRE)llama.a
+
+LIB_COMMON   = $(LIB_PRE)common$(DSO_EXT)
+LIB_COMMON_S = $(LIB_PRE)common.a
+
+LIB_ALL   = $(LIB_GGML) $(LIB_LLAMA) $(LIB_COMMON)
+LIB_ALL_S = $(LIB_GGML_S) $(LIB_LLAMA_S) $(LIB_COMMON_S)
+
+GF_CC := $(CC)
+include scripts/get-flags.mk
+
+# combine build flags with cmdline overrides
+override CPPFLAGS  := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS    := $(CPPFLAGS) $(MK_CFLAGS) $(GF_CFLAGS) $(CFLAGS)
+BASE_CXXFLAGS      := $(MK_CXXFLAGS) $(CXXFLAGS)
+override CXXFLAGS  := $(BASE_CXXFLAGS) $(HOST_CXXFLAGS) $(GF_CXXFLAGS) $(CPPFLAGS)
+override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
+override LDFLAGS   := $(MK_LDFLAGS) $(LDFLAGS)
+
+# identify CUDA host compiler
+ifdef GGML_CUDA
+GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
+include scripts/get-flags.mk
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
+endif
+
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS  := $(LDFLAGS) -lcurl
+endif
+
+#
+# Print build information
+#
+
+$(info I llama.cpp build info: )
+$(info I UNAME_S:   $(UNAME_S))
+$(info I UNAME_P:   $(UNAME_P))
+$(info I UNAME_M:   $(UNAME_M))
+$(info I CFLAGS:    $(CFLAGS))
+$(info I CXXFLAGS:  $(CXXFLAGS))
+$(info I NVCCFLAGS: $(NVCCFLAGS))
+$(info I LDFLAGS:   $(LDFLAGS))
+$(info I CC:        $(shell $(CC) --version | head -n 1))
+$(info I CXX:       $(shell $(CXX) --version | head -n 1))
+ifdef GGML_CUDA
+$(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
+CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifndef GGML_MUSA
+ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+
+ifndef CUDA_DOCKER_ARCH
+ifndef CUDA_POWER_ARCH
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus )
+endif # CUDA_POWER_ARCH
+endif # CUDA_DOCKER_ARCH
+
+endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
+endif # GGML_MUSA
+endif # GGML_CUDA
+$(info )
+
+ifdef DEPRECATE_WARNING
+$(info !!! DEPRECATION WARNING !!!)
+$(info The following LLAMA_ options are deprecated and will be removed in the future. Use the GGML_ prefix instead)
+$(info - LLAMA_CUDA)
+$(info - LLAMA_METAL)
+$(info - LLAMA_METAL_EMBED_LIBRARY)
+$(info - LLAMA_OPENMP)
+$(info - LLAMA_RPC)
+$(info - LLAMA_SYCL)
+$(info - LLAMA_SYCL_F16)
+$(info - LLAMA_OPENBLAS)
+$(info - LLAMA_OPENBLAS64)
+$(info - LLAMA_BLIS)
+$(info - LLAMA_NO_LLAMAFILE)
+$(info - LLAMA_NO_ACCELERATE)
+$(info - LLAMA_NO_OPENMP)
+$(info - LLAMA_NO_METAL)
+$(info - LLAMA_NO_CCACHE)
+$(info )
+endif
+
+ifdef REMOVE_WARNING
+$(info !!! REMOVAL WARNING !!!)
+$(info The following LLAMA_ options have been removed and are no longer supported)
+$(info - LLAMA_DISABLE_LOGS (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info )
+endif
+
+#
+# Build libraries
+#
+
+# ggml
+
+ggml/src/ggml.o: \
+	ggml/src/ggml.c \
+	ggml/include/ggml.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-cpu.o: \
+	ggml/src/ggml-cpu.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-common.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-alloc.o: \
+	ggml/src/ggml-alloc.c \
+	ggml/include/ggml.h \
+	ggml/include/ggml-alloc.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-backend.o: \
+	ggml/src/ggml-backend.cpp \
+	ggml/src/ggml-backend-impl.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-backend.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ggml/src/ggml-quants.o: \
+	ggml/src/ggml-quants.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-quants.h \
+	ggml/src/ggml-common.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-aarch64.o: \
+	ggml/src/ggml-aarch64.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-aarch64.h \
+	ggml/src/ggml-common.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-blas.o: \
+	ggml/src/ggml-blas.cpp \
+	ggml/include/ggml-blas.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ifndef GGML_NO_LLAMAFILE
+ggml/src/llamafile/sgemm.o: \
+	ggml/src/llamafile/sgemm.cpp \
+	ggml/src/llamafile/sgemm.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # GGML_NO_LLAMAFILE
+
+ifndef GGML_NO_AMX
+ggml/src/ggml-amx.o: \
+	ggml/src/ggml-amx.cpp \
+	ggml/include/ggml-amx.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ggml/src/ggml-amx/mmq.o: \
+	ggml/src/ggml-amx/mmq.cpp \
+	ggml/src/ggml-amx/mmq.h \
+	ggml/include/ggml.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif
+
+ifdef GGML_RPC
+ggml/src/ggml-rpc.o: \
+	ggml/src/ggml-rpc.cpp \
+	ggml/include/ggml-rpc.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # GGML_RPC
+
+$(LIB_GGML): \
+	$(OBJ_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_GGML_S): \
+	$(OBJ_GGML)
+	ar rcs $(LIB_GGML_S) $^
+
+# llama
+
+src/unicode.o: \
+	src/unicode.cpp \
+	src/unicode.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/unicode-data.o: \
+	src/unicode-data.cpp \
+	src/unicode-data.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama.o: \
+	src/llama.cpp \
+	src/llama-impl.h \
+	src/llama-vocab.h \
+	src/llama-grammar.h \
+	src/llama-sampling.h \
+	src/unicode.h \
+	include/llama.h \
+	ggml/include/ggml-cuda.h \
+	ggml/include/ggml-metal.h \
+	ggml/include/ggml.h \
+	ggml/include/ggml-alloc.h \
+	ggml/include/ggml-backend.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-vocab.o: \
+	src/llama-vocab.cpp \
+	src/llama-vocab.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-grammar.o: \
+	src/llama-grammar.cpp \
+	src/llama-grammar.h \
+	src/llama-impl.h \
+	src/llama-vocab.h \
+	src/llama-sampling.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+src/llama-sampling.o: \
+	src/llama-sampling.cpp \
+	src/llama-sampling.h \
+	src/llama-impl.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(LIB_LLAMA): \
+	$(OBJ_LLAMA) \
+	$(LIB_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_LLAMA_S): \
+	$(OBJ_LLAMA)
+	ar rcs $(LIB_LLAMA_S) $^
+
+# common
+
+common/common.o: \
+	common/common.cpp \
+	common/common.h \
+	common/console.h \
+	common/sampling.h \
+	common/json.hpp \
+	common/json-schema-to-grammar.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/arg.o: \
+	common/arg.cpp \
+	common/arg.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/log.o: \
+	common/log.cpp \
+	common/log.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/sampling.o: \
+	common/sampling.cpp \
+	common/sampling.h \
+	include/llama.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/console.o: \
+	common/console.cpp \
+	common/console.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/json-schema-to-grammar.o: \
+	common/json-schema-to-grammar.cpp \
+	common/json-schema-to-grammar.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+common/ngram-cache.o: \
+	common/ngram-cache.cpp \
+	common/ngram-cache.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(LIB_COMMON): \
+	$(OBJ_COMMON) \
+	$(LIB_LLAMA) \
+	$(LIB_GGML)
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_COMMON_S): \
+	$(OBJ_COMMON)
+	ar rcs $(LIB_COMMON_S) $^
+
+clean:
+	rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
+	rm -rvf src/*.o
+	rm -rvf tests/*.o
+	rm -rvf examples/*.o
+	rm -rvf common/*.o
+	rm -rvf *.a
+	rm -rvf *.dll
+	rm -rvf *.so
+	rm -rvf *.dot
+	rm -rvf ggml/*.a
+	rm -rvf ggml/*.dll
+	rm -rvf ggml/*.so
+	rm -vrf ggml/src/*.o
+	rm -rvf ggml/src/llamafile/*.o
+	rm -rvf common/build-info.cpp
+	rm -vrf ggml/src/ggml-metal-embed.metal
+	rm -vrf ggml/src/ggml-cuda/*.o
+	rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+	rm -vrf ggml/src/ggml-amx/*.o
+	rm -rvf $(BUILD_TARGETS)
+	rm -rvf $(TEST_TARGETS)
+	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
|
1259 |
+
rm -rvf $(LEGACY_TARGETS_CLEAN)
|
1260 |
+
find examples pocs -type f -name "*.o" -delete
|
1261 |
+
|
1262 |
+
#
|
1263 |
+
# Examples
|
1264 |
+
#
|
1265 |
+
|
1266 |
+
# $< is the first prerequisite, i.e. the source file.
|
1267 |
+
# Explicitly compile this to an object file so that it can be cached with ccache.
|
1268 |
+
# The source file is then filtered out from $^ (the list of all prerequisites) and the object file is added instead.
|
1269 |
+
|
1270 |
+
# Helper function that replaces .c, .cpp, and .cu file endings with .o:
|
1271 |
+
GET_OBJ_FILE = $(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(patsubst %.cu,%.o,$(1))))
|
1272 |
+
|
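# Illustrative expansions (not part of the build logic): $(call GET_OBJ_FILE, examples/main/main.cpp)
# yields examples/main/main.o, and $(call GET_OBJ_FILE, ggml/src/ggml.c) yields
# ggml/src/ggml.o; paths that match none of the three suffixes pass through unchanged.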
llama-cli: examples/main/main.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
	@echo
	@echo '====  Run ./llama-cli -h for help.  ===='
	@echo

llama-infill: examples/infill/infill.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-simple: examples/simple/simple.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-simple-chat: examples/simple-chat/simple-chat.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-tokenize: examples/tokenize/tokenize.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-batched: examples/batched/batched.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-batched-bench: examples/batched-bench/batched-bench.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-quantize: examples/quantize/quantize.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-quantize-stats: examples/quantize-stats/quantize-stats.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-perplexity: examples/perplexity/perplexity.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-imatrix: examples/imatrix/imatrix.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-embedding: examples/embedding/embedding.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gritlm: examples/gritlm/gritlm.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-save-load-state: examples/save-load-state/save-load-state.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gguf: examples/gguf/gguf.cpp \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

examples/gguf-hash/deps/sha1/sha1.o: \
	examples/gguf-hash/deps/sha1/sha1.c
	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@

examples/gguf-hash/deps/xxhash/xxhash.o: \
	examples/gguf-hash/deps/xxhash/xxhash.c
	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@

examples/gguf-hash/deps/sha256/sha256.o: \
	examples/gguf-hash/deps/sha256/sha256.c
	$(CC) $(CFLAGS) -Iexamples/gguf-hash/deps -c $< -o $@

llama-gguf-hash: examples/gguf-hash/gguf-hash.cpp examples/gguf-hash/deps/sha1/sha1.o examples/gguf-hash/deps/xxhash/xxhash.o examples/gguf-hash/deps/sha256/sha256.o \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -Iexamples/gguf-hash/deps -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gguf-split: examples/gguf-split/gguf-split.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-eval-callback: examples/eval-callback/eval-callback.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-bench: examples/llama-bench/llama-bench.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-export-lora: examples/export-lora/export-lora.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-retrieval: examples/retrieval/retrieval.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-speculative: examples/speculative/speculative.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-parallel: examples/parallel/parallel.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-lookahead: examples/lookahead/lookahead.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-lookup: examples/lookup/lookup.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-lookup-create: examples/lookup/lookup-create.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-lookup-merge: examples/lookup/lookup-merge.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-lookup-stats: examples/lookup/lookup-stats.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-passkey: examples/passkey/passkey.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

ifdef GGML_RPC
rpc-server: examples/rpc/rpc-server.cpp \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif # GGML_RPC

llama-server: \
	examples/server/server.cpp \
	examples/server/utils.hpp \
	examples/server/httplib.h \
	examples/server/colorthemes.css.hpp \
	examples/server/style.css.hpp \
	examples/server/theme-beeninorder.css.hpp \
	examples/server/theme-ketivah.css.hpp \
	examples/server/theme-mangotango.css.hpp \
	examples/server/theme-playground.css.hpp \
	examples/server/theme-polarnight.css.hpp \
	examples/server/theme-snowstorm.css.hpp \
	examples/server/index.html.hpp \
	examples/server/index-new.html.hpp \
	examples/server/index.js.hpp \
	examples/server/completion.js.hpp \
	examples/server/system-prompts.js.hpp \
	examples/server/prompt-formats.js.hpp \
	examples/server/json-schema-to-grammar.mjs.hpp \
	examples/server/loading.html.hpp \
	common/json.hpp \
	common/stb_image.h \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`:
examples/server/%.hpp: examples/server/public/% Makefile
	@( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \
		echo "unsigned char $${NAME}[] = {" && \
		cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \
		echo "};" && \
		echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \
	) > $@
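# Illustrative output of the rule above for an input file loading.html (the byte
# values shown are placeholders, not real contents): the generated
# loading.html.hpp would contain
#   unsigned char loading_html[] = { 0x3c, 0x21, ... };
#   unsigned int loading_html_len = <size of loading.html in bytes>;
# which matches the layout that `xxd -i` produces.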
llama-gen-docs: examples/gen-docs/gen-docs.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

libllava.a: examples/llava/llava.cpp \
	examples/llava/llava.h \
	examples/llava/clip.cpp \
	examples/llava/clip.h \
	common/stb_image.h \
	common/base64.hpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -static -fPIC -c $< -o $@ -Wno-cast-qual

llama-llava-cli: examples/llava/llava-cli.cpp \
	examples/llava/llava.cpp \
	examples/llava/llava.h \
	examples/llava/clip.cpp \
	examples/llava/clip.h \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
	examples/llava/llava.cpp \
	examples/llava/llava.h \
	examples/llava/clip.cpp \
	examples/llava/clip.h \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
	(cd examples/batched.swift; make build)
endif
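# Regenerate common/build-info.cpp from the current git state, but only replace
# the file when its contents actually changed (cmp -s), so that incremental
# builds do not needlessly recompile common/build-info.o.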
common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
	@sh scripts/build-info.sh "$(CC)" > $@.tmp
	@if ! cmp -s $@.tmp $@; then \
		mv $@.tmp $@; \
	else \
		rm $@.tmp; \
	fi

common/build-info.o: common/build-info.cpp
	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@

#
# Tests
#

tests: $(TEST_TARGETS)

tests/test-arg-parser: tests/test-arg-parser.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-llama-grammar: tests/test-llama-grammar.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-log: tests/test-log.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-grammar-parser: tests/test-grammar-parser.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-grammar-integration: tests/test-grammar-integration.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-double-float: tests/test-double-float.cpp
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-grad0: tests/test-grad0.cpp \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-opt: tests/test-opt.cpp \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-quantize-fns: tests/test-quantize-fns.cpp \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-quantize-perf: tests/test-quantize-perf.cpp \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-sampling: tests/test-sampling.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-tokenizer-0: tests/test-tokenizer-0.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-rope: tests/test-rope.cpp ggml/src/ggml.o \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-c.o: tests/test-c.c include/llama.h
	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@

tests/test-backend-ops: tests/test-backend-ops.cpp \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-model-load-cancel: tests/test-model-load-cancel.cpp tests/get-model.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-autorelease: tests/test-autorelease.cpp tests/get-model.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

tests/test-chat-template: tests/test-chat-template.cpp \
	$(OBJ_ALL)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

#
# PoCs
#

llama-vdot: pocs/vdot/vdot.cpp ggml/src/ggml.o \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
	$(OBJ_GGML)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

#
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
#
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server

# Define the object file target
examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
	$(CXX) $(CXXFLAGS) -c $< -o $@

# NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
# Eventually we will want to remove these targets from building all the time.
main: examples/deprecation-warning/deprecation-warning.o
	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."

server: examples/deprecation-warning/deprecation-warning.o
	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
	@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."

quantize: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard quantize))
	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
	@echo "#########"
	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
	@echo "  Remove the 'quantize' binary to remove this warning."
	@echo "#########"
endif

perplexity: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard perplexity))
	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
	@echo "#########"
	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
	@echo "  Remove the 'perplexity' binary to remove this warning."
	@echo "#########"
endif

embedding: examples/deprecation-warning/deprecation-warning.o
ifneq (,$(wildcard embedding))
	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
	@echo "#########"
	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
	@echo "  Remove the 'embedding' binary to remove this warning."
	@echo "#########"
endif
Package.swift
ADDED
@@ -0,0 +1,80 @@
// swift-tools-version:5.5

import PackageDescription

var sources = [
    "src/llama.cpp",
    "src/llama-vocab.cpp",
    "src/llama-grammar.cpp",
    "src/llama-sampling.cpp",
    "src/unicode.cpp",
    "src/unicode-data.cpp",
    "ggml/src/ggml.c",
    "ggml/src/ggml-cpu.c",
    "ggml/src/ggml-alloc.c",
    "ggml/src/ggml-backend.cpp",
    "ggml/src/ggml-quants.c",
    "ggml/src/ggml-aarch64.c",
]
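// Note: the arrays above and below are extended conditionally further down:
// Darwin builds append the Metal source, the bundled Metal shader resource,
// the Accelerate framework and the matching GGML defines, while Linux builds
// define _GNU_SOURCE.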
var resources: [Resource] = []
var linkerSettings: [LinkerSetting] = []
var cSettings: [CSetting] = [
    .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
    .unsafeFlags(["-fno-objc-arc"]),
    // NOTE: NEW_LAPACK will require iOS version 16.4+
    // We should consider adding this in the future when we drop support for iOS 14
    // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
    // .define("ACCELERATE_NEW_LAPACK"),
    // .define("ACCELERATE_LAPACK_ILP64")
]

#if canImport(Darwin)
sources.append("ggml/src/ggml-metal.m")
resources.append(.process("ggml/src/ggml-metal.metal"))
linkerSettings.append(.linkedFramework("Accelerate"))
cSettings.append(
    contentsOf: [
        .define("GGML_USE_ACCELERATE"),
        .define("GGML_USE_METAL")
    ]
)
#endif

#if os(Linux)
cSettings.append(.define("_GNU_SOURCE"))
#endif

let package = Package(
    name: "llama",
    platforms: [
        .macOS(.v12),
        .iOS(.v14),
        .watchOS(.v4),
        .tvOS(.v14)
    ],
    products: [
        .library(name: "llama", targets: ["llama"]),
    ],
    targets: [
        .target(
            name: "llama",
            path: ".",
            exclude: [
                "cmake",
                "examples",
                "scripts",
                "models",
                "tests",
                "CMakeLists.txt",
                "Makefile"
            ],
            sources: sources,
            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: cSettings,
            linkerSettings: linkerSettings
        )
    ],
    cxxLanguageStandard: .cxx11
)
SECURITY.md
ADDED
@@ -0,0 +1,67 @@
# Security Policy

 - [**Using llama.cpp securely**](#using-llamacpp-securely)
   - [Untrusted models](#untrusted-models)
   - [Untrusted inputs](#untrusted-inputs)
   - [Data privacy](#data-privacy)
   - [Untrusted environments or networks](#untrusted-environments-or-networks)
   - [Multi-Tenant environments](#multi-tenant-environments)
 - [**Reporting a vulnerability**](#reporting-a-vulnerability)

## Using llama.cpp securely

### Untrusted models
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.

*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.

> [!NOTE]
> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.

### Untrusted inputs

Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.

For maximum security when handling untrusted inputs, you may need to employ the following:

* Sandboxing: Isolate the environment where the inference happens.
* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
* Input Sanitization: Before feeding data to the model, sanitize inputs rigorously, using techniques such as the following (a minimal validation sketch appears after this list):
  * Validation: Enforce strict rules on allowed characters and data types.
  * Filtering: Remove potentially malicious scripts or code fragments.
  * Encoding: Convert special characters into safe representations.
  * Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).
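As a minimal illustration of the validation technique above, the sketch below (an assumed helper, not part of llama.cpp) rejects inputs containing raw control bytes, a common smuggling vector, before they reach any parsing layer:

```cpp
#include <string>

// Hypothetical pre-filter: accept printable bytes (including UTF-8 lead and
// continuation bytes, which are >= 0x80) plus common whitespace, and reject
// everything else (NUL, escape sequences, other control bytes).
static bool prompt_passes_validation(const std::string & prompt) {
    for (unsigned char c : prompt) {
        const bool printable  = c >= 0x20 && c != 0x7f;
        const bool whitespace = c == '\n' || c == '\r' || c == '\t';
        if (!printable && !whitespace) {
            return false;
        }
    }
    return true;
}
```

A real deployment would layer format-specific checks (length limits, UTF-8 validity, schema validation for structured inputs) on top of a byte-level pass like this.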
### Data privacy

To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.

### Untrusted environments or networks

If you can't run your models in a secure and isolated environment, or if they must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm that the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
* Encrypt your data if sending it over the network.

### Multi-Tenant environments

If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.

1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and prevents malicious users from sending graphs to execute under another tenant's identity.

2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.

3. Model Sharing: In a multi-tenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side-channel attacks on GPUs are possible, which can leak data from other models or processes running on the same system at the same time.

## Reporting a vulnerability

Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.

<!-- normal version -->
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.

Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).

This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
ci/README.md
ADDED
@@ -0,0 +1,29 @@
# CI

In addition to [GitHub Actions](https://github.com/ggerganov/llama.cpp/actions), `llama.cpp` uses a custom CI framework:

https://github.com/ggml-org/ci

It monitors the `master` branch for new commits and runs the
[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
to execute heavier workloads than GitHub Actions alone can handle. Over time, the cloud instances will also be scaled
to cover various hardware architectures, including GPU and Apple Silicon instances.

Collaborators can optionally trigger the CI run by adding the `ggml-ci` keyword to their commit message.
Only the branches of this repo are monitored for this keyword.
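For example, a commit message along the lines of `ggml : sync latest changes (ggml-ci)` (an illustrative message, not a reference to a real commit) would schedule a run on the dedicated instances.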
It is good practice, before publishing changes, to execute the full CI locally on your machine:

```bash
mkdir tmp

# CPU-only build
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# with CUDA support
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

# with SYCL support
source /opt/intel/oneapi/setvars.sh
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
ci/run.sh
ADDED
@@ -0,0 +1,851 @@
#!/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with VULKAN support
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
fi

mkdir -p "$1"
mkdir -p "$2"

OUT=$(realpath "$1")
MNT=$(realpath "$2")

rm -f "$OUT"/*.log
rm -f "$OUT"/*.exit
rm -f "$OUT"/*.md

sd=`dirname $0`
cd $sd/../
SRC=`pwd`

CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

if [ ! -z ${GG_BUILD_METAL} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi

if [ ! -z ${GG_BUILD_CUDA} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
fi

if [ ! -z ${GG_BUILD_SYCL} ]; then
    if [ -z ${ONEAPI_ROOT} ]; then
        echo "ONEAPI_ROOT not detected; please install the oneAPI base toolkit and enable it with:"
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi

    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi

if [ ! -z ${GG_BUILD_VULKAN} ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
fi

## helpers

# download a file if it does not exist or if it is outdated
function gg_wget {
    local out=$1
    local url=$2

    local cwd=`pwd`

    mkdir -p $out
    cd $out

    # should not re-download if file is the same
    wget -nv -N $url

    cd $cwd
}

function gg_printf {
    printf -- "$@" >> $OUT/README.md
}
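# gg_run <name>: run gg_run_<name> with shell tracing enabled, tee its output
# to $OUT/<name>.log, record the exit status in $OUT/<name>.exit, append the
# gg_sum_<name> summary to $OUT/README.md, and OR the status into the global
# return code.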
function gg_run {
    ci=$1

    set -o pipefail
    set -x

    gg_run_$ci | tee $OUT/$ci.log
    cur=$?
    echo "$cur" > $OUT/$ci.exit

    set +x
    set +o pipefail

    gg_sum_$ci

    ret=$((ret | cur))
}

## ci

# ctest_debug

function gg_run_ctest_debug {
    cd ${SRC}

    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug

    set -e

    # Check cmake, make and ctest are installed
    gg_check_build_requirements

    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                  ) 2>&1 | tee -a $OUT/${ci}-make.log

    (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
}

function gg_sum_ctest_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
    gg_printf '\n'
}

# ctest_release

function gg_run_ctest_release {
    cd ${SRC}

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    # Check cmake, make and ctest are installed
    gg_check_build_requirements

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    if [ -z ${GG_BUILD_LOW_PERF} ]; then
        (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
        (time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
}

function gg_sum_ctest_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}

# test_scripts_debug

function gg_run_test_scripts_debug {
    cd ${SRC}

    set -e

    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
}

function gg_sum_test_scripts_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs test scripts in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
    gg_printf '```\n'
    gg_printf '\n'
}

# test_scripts_release

function gg_run_test_scripts_release {
    cd ${SRC}

    set -e

    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
    (cd ./examples/quantize   && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

    set +e
}

function gg_sum_test_scripts_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs test scripts in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
    gg_printf '```\n'
    gg_printf '\n'
}

function gg_get_model {
    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
    if [[ -s $gguf_0 ]]; then
        echo -n "$gguf_0"
    elif [[ -s $gguf_1 ]]; then
        echo -n "$gguf_1"
    elif [[ -s $gguf_2 ]]; then
        echo -n "$gguf_2"
    else
        echo >&2 "No model found. Can't run gg_run_ctest_with_model."
        exit 1
    fi
}

function gg_run_ctest_with_model_debug {
    cd ${SRC}

    local model; model=$(gg_get_model)
    cd build-ci-debug
    set -e
    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
    set +e
    cd ..
}

function gg_run_ctest_with_model_release {
    cd ${SRC}

    local model; model=$(gg_get_model)
    cd build-ci-release
    set -e
    (LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
    set +e
    cd ..
}

function gg_sum_ctest_with_model_debug {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest with model files in debug mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}

function gg_sum_ctest_with_model_release {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Runs ctest with model files in release mode\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '```\n'
    gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
    gg_printf '```\n'
}

# open_llama_7b_v2

function gg_run_open_llama_7b_v2 {
    cd ${SRC}

    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/config.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/tokenizer.model
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/tokenizer_config.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/special_tokens_map.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/pytorch_model.bin.index.json
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00001-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
    gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    path_models="../models-mnt/open-llama/7B-v2"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                    ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/llama-cli --model ${model_f16}  -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16}  -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 )     2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 )     2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
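    # check_ppl <label> <ppl line>: extract the last floating-point number from
    # the perplexity output (the final estimate) and fail when it exceeds 20.0,
    # a loose sanity bound rather than a precise quality target.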
360 |
+
function check_ppl {
|
361 |
+
qnt="$1"
|
362 |
+
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
363 |
+
|
364 |
+
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
365 |
+
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
366 |
+
return 20
|
367 |
+
fi
|
368 |
+
|
369 |
+
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
370 |
+
return 0
|
371 |
+
}
|
372 |
+
|
373 |
+
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
374 |
+
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
375 |
+
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
376 |
+
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
377 |
+
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
378 |
+
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
379 |
+
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
380 |
+
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
381 |
+
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
382 |
+
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
383 |
+
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
384 |
+
|
385 |
+
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
386 |
+
|
387 |
+
set +e
|
388 |
+
}
|
389 |
+
|
390 |
+
function gg_sum_open_llama_7b_v2 {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'OpenLLaMA 7B-v2:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# pythia_1.4b

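# Note: each model section below pairs a gg_run_<name> function (does the work)
# with a gg_sum_<name> function (renders the summary); both are assumed to be
# dispatched by the gg_run helper defined earlier in this script.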
function gg_run_pythia_1_4b {
    cd ${SRC}

    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
    gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
    head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw

    path_models="../models-mnt/pythia/1.4B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test_60="${path_wiki}/wiki.test-60.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
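    # Illustrative sketch only (the expanded calls above are what the script
    # actually runs): the ten quantize invocations could equivalently be
    # written as a loop:
    #
    #   for qnt in q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
    #       ./bin/llama-quantize ${model_f16} ${path_models}/ggml-model-${qnt}.gguf ${qnt}
    #   done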

    (time ./bin/llama-cli --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
506 |
+
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
507 |
+
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
508 |
+
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
509 |
+
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
510 |
+
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
511 |
+
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
512 |
+
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
513 |
+
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
514 |
+
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
515 |
+
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
516 |
+
|
517 |
+
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
518 |
+
|
519 |
+
set +e
|
520 |
+
}
|
521 |
+
|
522 |
+
function gg_sum_pythia_1_4b {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Pythia 1.4B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# pythia_2_8b

function gg_run_pythia_2_8b {
    cd ${SRC}

    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
    gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin

    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
    unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

    path_models="../models-mnt/pythia/2.8B"
    path_wiki="../models-mnt/wikitext/wikitext-2-raw"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

    wiki_test="${path_wiki}/wiki.test.raw"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
    ./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
    ./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
    ./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
    ./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
    ./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
    ./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
    ./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
    ./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
    ./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k

    (time ./bin/llama-cli --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-cli --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-cli --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-cli --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-cli --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-cli --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-cli --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-cli --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-cli --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-cli --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-cli --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
    (time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
    (time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
    (time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
    (time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
    (time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
    (time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
    (time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
    (time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
    (time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log

    (time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log

    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log

    function check_ppl {
        qnt="$1"
        ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$ppl"
        return 0
    }

check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
639 |
+
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
640 |
+
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
641 |
+
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
642 |
+
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
643 |
+
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
644 |
+
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
645 |
+
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
646 |
+
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
647 |
+
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
648 |
+
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
649 |
+
|
650 |
+
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
651 |
+
|
652 |
+
set +e
|
653 |
+
}
|
654 |
+
|
655 |
+
function gg_sum_pythia_2_8b {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Pythia 2.8B:\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
    gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
    gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
    gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
    gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
    gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
    gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
    gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
    gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
    gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
    gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}

# bge-small

function gg_run_embd_bge_small {
    cd ${SRC}

    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json

    gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json

    path_models="../models-mnt/bge-small"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"
    model_q8_0="${path_models}/ggml-model-q8_0.gguf"

    ./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0

    (time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
    (time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log

    set +e
}

function gg_sum_embd_bge_small {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'BGE Small (BERT):\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
}

# rerank_tiny

function gg_run_rerank_tiny {
    cd ${SRC}

    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json

    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json

    path_models="../models-mnt/rerank-tiny"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"

    # for this model, the SEP token is "</s>"
    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

    # sample output
    # rerank score 0: 0.029
    # rerank score 1: 0.029
    # rerank score 2: 0.135

    # check that the score is in the range [$3, $4]
    function check_score {
        qnt="$1"
        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$score"
        return 0
    }

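    # Illustrative note (not part of the CI flow): given the sample output
    # above, a hypothetical call such as
    #
    #   check_score "rerank score 2" "rerank score 2: 0.135" "0.10" "0.30"
    #
    # extracts 0.135, verifies 0.10 <= 0.135 <= 0.30 with bc, and prints
    # ' - rerank score 2 @ 0.135 OK'.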
check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
|
777 |
+
check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
|
778 |
+
check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
|
779 |
+
|
780 |
+
set +e
|
781 |
+
}
|
782 |
+
|
783 |
+
function gg_sum_rerank_tiny {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Rerank Tiny (Jina):\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
}

function gg_check_build_requirements {
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
    fi

    if ! command -v make &> /dev/null; then
        gg_printf 'make not found, please install'
    fi

    if ! command -v ctest &> /dev/null; then
        gg_printf 'ctest not found, please install'
    fi
}

## main

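# enable prefixes and timestamps on llama.cpp log lines for easier debugging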
export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a fresh python3 venv and enter it
    python3 -m venv "$MNT/venv"
    source "$MNT/venv/bin/activate"

    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
    pip install --editable gguf-py --disable-pip-version-check
fi

ret=0

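# Each step below runs only while $ret is still 0, so the first failing step
# short-circuits the rest (gg_run, defined earlier in this script, is assumed
# to fold each step's exit status into $ret).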
test $ret -eq 0 && gg_run ctest_debug
|
828 |
+
test $ret -eq 0 && gg_run ctest_release
|
829 |
+
|
830 |
+
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
831 |
+
test $ret -eq 0 && gg_run embd_bge_small
|
832 |
+
test $ret -eq 0 && gg_run rerank_tiny
|
833 |
+
|
834 |
+
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
|
835 |
+
test $ret -eq 0 && gg_run test_scripts_debug
|
836 |
+
test $ret -eq 0 && gg_run test_scripts_release
|
837 |
+
fi
|
838 |
+
|
839 |
+
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
|
840 |
+
if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
|
841 |
+
test $ret -eq 0 && gg_run pythia_1_4b
|
842 |
+
else
|
843 |
+
test $ret -eq 0 && gg_run pythia_2_8b
|
844 |
+
#test $ret -eq 0 && gg_run open_llama_7b_v2
|
845 |
+
fi
|
846 |
+
test $ret -eq 0 && gg_run ctest_with_model_debug
|
847 |
+
test $ret -eq 0 && gg_run ctest_with_model_release
|
848 |
+
fi
|
849 |
+
fi
|
850 |
+
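
# For reference (see ci/README.md for the full usage), a CPU-only run looks like
#
#   bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# while environment gates such as GG_BUILD_CUDA=1, GG_BUILD_VULKAN=1, or
# GG_BUILD_LOW_PERF=1 select the corresponding build/test configuration above.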

exit $ret