tomasruiz committed on
Commit 41d24d2 · 1 Parent(s): 7475e8c

Include code of llmapp from Github
.gitignore ADDED
@@ -0,0 +1,163 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ models/
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
Dockerfile-llm-app ADDED
@@ -0,0 +1,20 @@
+ FROM pytorch/pytorch:2.4.0-cuda12.4-cudnn9-devel
+
+ WORKDIR /app
+
+ RUN apt-get update
+ RUN apt-get install -y build-essential
+ RUN apt-get install -y git
+
+ COPY requirements.txt requirements.txt
+ RUN pip install -r requirements.txt
+ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install "llama-cpp-python<=0.2.79.0"
+ COPY *.py ./
+ ADD llmlib ./llmlib
+ RUN pip install -e llmlib
+ ADD .streamlit .streamlit
+
+ #CMD [ "python", "--version"]
+ # CMD ["nvidia-smi"]
+ # CMD ["nvcc", "--version"]
+ CMD [ "python", "-m", "streamlit", "run", "st_app.py", "--server.port", "8020"]
Makefile ADDED
@@ -0,0 +1,2 @@
+ run_rest_api:
+ 	fastapi dev rest_api.py --port 8030
README.md CHANGED
@@ -5,8 +5,28 @@ colorFrom: red
  colorTo: purple
  sdk: streamlit
  sdk_version: 1.41.1
- app_file: app.py
+ app_file: st_app.py
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # LLM Multimodal Vibe-Check
+ We use this Streamlit app to chat with different multimodal open-source and proprietary LLMs. The idea is to quickly assess qualitatively (vibe-check) whether a model understands the nuance of harmful language.
+
+ https://github.com/user-attachments/assets/2fb49053-651c-4cc9-b102-92a392a3c473
+
+ ## Run Streamlit App
+ In the `docker-compose.yml` file, you will need to change the volume to point to your own Hugging Face model cache. To run the app, use the following command:
+ ```bash
+ docker compose up llmapp
+ ```
+
+ ### Run Only Inference Server
+ ```bash
+ docker compose up rest_api
+ ```
+
+ ## Structure
+ * Each multimodal LLM has a different way of consuming image(s). This codebase unifies the different interfaces, e.g. of Phi-3, MiniCPM, OpenAI GPT-4o, etc., with a single base class `LLM` (an interface) that each concrete model implements. You can find these implementations in the directory `llmlib/llmlib/`.
+ * The open-source implementations are based on the `transformers` library. I have experimented with `vLLM`, but it made the GPU run OOM. More fiddling is needed.
+ * I have extracted a REST API using `FastAPI` to decouple the frontend Streamlit code from the inference server.
+ * The app currently supports only small open-source models, because the inference server runs on a single 24GB-VRAM GPU. We will hopefully scale this backend up soon.
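
To illustrate the `LLM` base class mentioned in the README above, here is a minimal sketch of how a new backend could plug into it, based on the interface defined in `llmlib/llmlib/base_llm.py` in this commit. `EchoLLM` is an invented example, not one of the real model wrappers in this repo.

```python
from llmlib.base_llm import LLM, Message


class EchoLLM(LLM):
    # model_id is the identifier the ModelRegistry and Bundler use to find this model
    model_id = "echo-llm"  # hypothetical id, for illustration only
    requires_gpu_exclusively = False

    def complete_msgs2(self, msgs: list[Message]) -> str:
        # A real implementation would run a multimodal model here,
        # consuming msg.msg and optionally msg.img of each Message.
        return "echo: " + msgs[-1].msg


llm = EchoLLM()
print(llm.complete_msgs2([Message.from_prompt("What is in the image?")]))
```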
app.py DELETED
@@ -1,4 +0,0 @@
- import streamlit as st
-
- x = st.slider('Select a value')
- st.write(x, 'squared is', x * x)
docker-compose.yml ADDED
@@ -0,0 +1,31 @@
+ x-common-gpu: &common-gpu
+   build:
+     dockerfile: Dockerfile-llm-app
+   environment:
+     - OPENAI_API_KEY=${OPENAI_API_KEY}
+     - HF_HOME=/app/.cache/huggingface
+     - HF_TOKEN=${HF_TOKEN}
+     - LLMS_REST_API_KEY=${LLMS_REST_API_KEY}
+     - BUGSNAG_API_KEY=${BUGSNAG_API_KEY}
+   deploy:
+     resources:
+       reservations:
+         devices:
+           - driver: nvidia
+             count: all
+             capabilities: [gpu]
+   volumes:
+     - /home/tomasruiz/.cache/huggingface:/app/.cache/huggingface
+
+ services:
+
+   llmapp:
+     <<: *common-gpu
+     ports:
+       - "8020:8020"
+   rest_api:
+     <<: *common-gpu
+     ports:
+       - "8030:8030"
+     command: fastapi run rest_api.py --port 8030
+     hostname: rest_api
llmlib/README.md ADDED
File without changes
llmlib/llmlib/__init__.py ADDED
File without changes
llmlib/llmlib/base_llm.py ADDED
@@ -0,0 +1,36 @@
+ from pathlib import Path
+ from typing import Literal, Self
+ from PIL import Image
+
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class Message:
+     role: Literal["user", "assistant"]
+     msg: str
+     img_name: str | None = None
+     img: Image.Image | None = None
+
+     @classmethod
+     def from_prompt(cls, prompt: str) -> Self:
+         return cls(role="user", msg=prompt)
+
+
+ class LLM:
+     model_id: str
+     requires_gpu_exclusively: bool = False
+
+     def complete_msgs2(self, msgs: list[Message]) -> str:
+         raise NotImplementedError
+
+     def complete_batch(self, batch: list[list[Message]]) -> list[str]:
+         raise NotImplementedError
+
+     def video_prompt(self, video_path: Path, prompt: str) -> str:
+         raise NotImplementedError
+
+     @classmethod
+     def get_warnings(cls) -> list[str]:
+         return []
llmlib/llmlib/bundler.py ADDED
@@ -0,0 +1,61 @@
+ from dataclasses import dataclass, field
+ import logging
+
+ from .bundler_request import BundlerRequest
+ from .model_registry import ModelEntry, ModelRegistry
+ from .base_llm import LLM
+ import torch
+ import gc
+
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class Bundler:
+     """Makes sure that only 1 model occupies the GPU at a time."""
+
+     registry: ModelRegistry = field(default_factory=ModelRegistry)
+     model_on_gpu: LLM | None = None
+     id2_nongpu_model: dict[str, LLM] = field(default_factory=dict)
+
+     def id_of_model_on_gpu(self) -> str | None:
+         return None if self.model_on_gpu is None else self.model_on_gpu.model_id
+
+     def get_response(self, req: BundlerRequest) -> str:
+         e: ModelEntry = self.registry.get_entry(model_id=req.model_id)
+         model: LLM = self._get_model_instance(e=e)
+         return model.complete_msgs2(req.msgs)
+
+     def _get_model_instance(self, e: ModelEntry) -> LLM:
+         if e.clazz.requires_gpu_exclusively:
+             self.set_model_on_gpu(model_id=e.model_id)
+             model: LLM = self.model_on_gpu
+         else:
+             if e.model_id not in self.id2_nongpu_model:
+                 self.id2_nongpu_model[e.model_id] = e.ctor()
+             model: LLM = self.id2_nongpu_model[e.model_id]
+         return model
+
+     def set_model_on_gpu(self, model_id: str) -> None:
+         if (
+             self.id_of_model_on_gpu() is not None
+             and self.id_of_model_on_gpu() == model_id
+         ):
+             return
+         assert model_id in self.registry.all_model_ids()
+
+         e: ModelEntry = self.registry.get_entry(model_id)
+         if not e.clazz.requires_gpu_exclusively:
+             logger.info(
+                 "Model does not require GPU exclusively. Ignoring set_model_on_gpu() call."
+             )
+             return
+
+         self.clear_model_on_gpu()
+         self.model_on_gpu = e.ctor()
+
+     def clear_model_on_gpu(self):
+         self.model_on_gpu = None
+         gc.collect()
+         torch.cuda.empty_cache()
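
A hedged usage sketch (not part of this commit) of how the `Bundler` above is meant to be driven, using only classes that appear elsewhere in this commit (`filled_model_registry`, `BundlerRequest`, `Message`); the Phi-3 model id is taken from the registry in `runtime.py`.

```python
from llmlib.bundler import Bundler
from llmlib.bundler_request import BundlerRequest
from llmlib.base_llm import Message
from llmlib.runtime import filled_model_registry

# Only one GPU-exclusive model is kept loaded; switching model ids swaps it out.
bundler = Bundler(registry=filled_model_registry())
req = BundlerRequest(
    model_id="microsoft/Phi-3-vision-128k-instruct",
    msgs=[Message.from_prompt("What is the capital of France?")],
)
print(bundler.get_response(req))  # loads Phi-3 onto the GPU on first use
```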
llmlib/llmlib/bundler_request.py ADDED
@@ -0,0 +1,10 @@
+ from .base_llm import Message
+
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class BundlerRequest:
+     model_id: str
+     msgs: list[Message]
llmlib/llmlib/gemini/__init__.py ADDED
File without changes
llmlib/llmlib/gemini/media_description.py ADDED
@@ -0,0 +1,162 @@
+ """
+ Based on https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/video-understanding
+ """
+
+ from dataclasses import dataclass
+ from logging import getLogger
+ from pathlib import Path
+ from typing import Literal
+ from google.cloud import storage
+ from google.cloud.storage import transfer_manager
+ import proto
+ from vertexai.generative_models import (
+     GenerativeModel,
+     Part,
+     HarmCategory,
+     HarmBlockThreshold,
+     GenerationResponse,
+ )
+
+ import vertexai
+
+ logger = getLogger(__name__)
+
+ project_id = "css-lehrbereich"  # from google cloud console
+ frankfurt = "europe-west3"  # https://cloud.google.com/about/locations#europe
+
+
+ class Buckets:
+     temp = "css-temp-bucket-for-vertex"
+     output = "css-vertex-output"
+
+
+ def storage_uri(bucket: str, blob_name: str) -> str:
+     """blob_name starts without a slash"""
+     return "gs://%s/%s" % (bucket, blob_name)
+
+
+ class Models:
+     gemini_pro = "models/gemini-1.5-pro"
+     gemini_flash = "models/gemini-1.5-flash"
+
+
+ available_models = [Models.gemini_pro, Models.gemini_flash]
+
+
+ @dataclass
+ class Request:
+     media_files: list[Path]
+     model_name: Literal[Models.gemini_pro, Models.gemini_flash] = Models.gemini_pro
+     prompt: str = "Describe this video in detail."
+
+     def fetch_media_description(self) -> str:
+         return fetch_media_description(self)
+
+
+ def fetch_media_description(req: Request) -> str:
+     # TODO: Always delete the video in the end. Perhaps use finally block.
+     blobs = upload_files(files=req.media_files)
+
+     init_vertex()
+     model = GenerativeModel(req.model_name)
+
+     prompt = req.prompt
+     logger.info("Calling the Google API. model_name='%s'", req.model_name)
+     contents = [
+         Part.from_uri(storage_uri(Buckets.temp, b.name), mime_type=mime_type(b.name))
+         for b in blobs
+     ]
+     contents.append(prompt)
+     response: GenerationResponse = model.generate_content(
+         contents=contents,
+         generation_config={"temperature": 0.0},
+         safety_settings=block_nothing(),
+     )
+     logger.info("Token usage: %s", proto.Message.to_dict(response.usage_metadata))
+
+     if len(response.candidates) == 0:
+         raise ResponseRefusedException(
+             "No candidates in response. prompt_feedback='%s'" % response.prompt_feedback
+         )
+
+     enum = type(response.candidates[0].finish_reason)
+     if response.candidates[0].finish_reason in {enum.SAFETY, enum.PROHIBITED_CONTENT}:
+         raise UnsafeResponseError(safety_ratings=response.candidates[0].safety_ratings)
+
+     for blob in blobs:
+         blob.delete()
+     logger.info("Deleted %d blob(s)", len(blobs))
+
+     return response.text
+
+
+ def init_vertex() -> None:
+     vertexai.init(project=project_id, location=frankfurt)
+
+
+ def mime_type(file_name: str) -> str:
+     mapping = {
+         ".txt": "text/plain",
+         ".jpg": "image/jpeg",
+         ".png": "image/png",
+         ".flac": "audio/flac",
+         ".mp3": "audio/mpeg",
+         ".mp4": "video/mp4",
+     }
+     for ext, mime in mapping.items():
+         if file_name.endswith(ext):
+             return mime
+     raise ValueError(f"Unknown mime type for file: {file_name}")
+
+
+ def upload_files(files: list[Path]) -> list[storage.Blob]:
+     logger.info("Uploading %d file(s)", len(files))
+     bucket = _bucket(name=Buckets.temp)
+     files_str = [str(f) for f in files]
+     blobs = [bucket.blob(file.name) for file in files]
+     transfer_manager.upload_many(
+         file_blob_pairs=zip(files_str, blobs),
+         skip_if_exists=True,
+         raise_exception=True,
+     )
+     logger.info("Completed file(s) upload")
+     return blobs
+
+
+ def _bucket(name: str) -> storage.Bucket:
+     client = storage.Client(project=project_id)
+     return client.bucket(name)
+
+
+ def upload_single_file(file: Path, bucket: str, blob_name: str) -> storage.Blob:
+     logger.info("Uploading file '%s' to bucket '%s' as '%s'", file, bucket, blob_name)
+     bucket: storage.Bucket = _bucket(name=bucket)
+     blob = bucket.blob(blob_name)
+     if blob.exists():
+         logger.info("Blob '%s' already exists. Overwriting it...", blob_name)
+     blob.upload_from_filename(str(file))
+     return blob
+
+
+ def block_nothing() -> dict[HarmCategory, HarmBlockThreshold]:
+     return {
+         HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
+         HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
+         HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
+         HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
+         HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+         HarmCategory.HARM_CATEGORY_CIVIC_INTEGRITY: HarmBlockThreshold.BLOCK_NONE,
+     }
+
+
+ class UnsafeResponseError(Exception):
+     def __init__(self, safety_ratings: list) -> None:
+         super().__init__(
+             "The response was blocked by Google due to safety reasons. Categories: %s"
+             % safety_ratings
+         )
+         self.safety_categories = safety_ratings
+
+
+ class ResponseRefusedException(Exception):
+     pass
llmlib/llmlib/llama3/.gitignore ADDED
@@ -0,0 +1 @@
+ models/
llmlib/llmlib/llama3/README.md ADDED
@@ -0,0 +1,5 @@
+
+ Installation for the quantized model in `llama_cpp`:
+ ```shell
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCUDA_PATH=/usr/local/cuda-12.5 -DCUDAToolkit_ROOT=/usr/local/cuda-12.5 -DCUDAToolkit_INCLUDE_DIR=/usr/local/cuda-12/include -DCUDAToolkit_LIBRARY_DIR=/usr/local/cuda-12.5/lib64" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
+ ```
llmlib/llmlib/llama3/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .llama3_vision_8b import LLama3Vision8B
+
+ __all__ = ["LLama3Vision8B"]
llmlib/llmlib/llama3/llama3_vision_8b.py ADDED
@@ -0,0 +1,67 @@
+ from llmlib.base_llm import Message
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import BitsAndBytesConfig
+ from llmlib.base_llm import LLM
+ from PIL import Image
+
+ _model_id = "qresearch/llama-3-vision-alpha-hf"
+
+
+ class LLama3Vision8B(LLM):
+     model_id = _model_id
+     requires_gpu_exclusively = True
+
+     def __init__(self):
+         self.model = create_model()
+         self.tokenizer = create_tokenizer()
+
+     def complete_msgs2(self, msgs: list[Message]) -> str:
+         if len(msgs) != 1:
+             raise ValueError(
+                 f"model='{_model_id}' supports only one message by the user."
+             )
+         msg = msgs[0]
+         if msg.role != "user":
+             raise ValueError(
+                 f"model='{_model_id}' supports only a role=user message, not role={msg.role}."
+             )
+
+         # 2024-06-20: Model does not accept image=None, therefore we create a small white image
+         if msg.img is None:
+             empty_img = Image.new("RGB", (3, 3), color="white")
+             image = empty_img
+         else:
+             image = msg.img
+
+         response: str = self.tokenizer.decode(
+             self.model.answer_question(image, msg.msg, self.tokenizer),
+             skip_special_tokens=True,
+         )
+         return response
+
+     @classmethod
+     def get_warnings(cls) -> list[str]:
+         return ["This model only accepts one message by the user at a time."]
+
+
+ def create_model():
+     bnb_cfg = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_compute_dtype=torch.float16,
+         llm_int8_skip_modules=["mm_projector", "vision_model"],
+     )
+
+     return AutoModelForCausalLM.from_pretrained(
+         _model_id,
+         trust_remote_code=True,
+         torch_dtype=torch.float16,
+         quantization_config=bnb_cfg,
+     )
+
+
+ def create_tokenizer():
+     return AutoTokenizer.from_pretrained(
+         _model_id,
+         use_fast=True,
+     )
llmlib/llmlib/minicpm.py ADDED
@@ -0,0 +1,105 @@
+ import logging
+ from pathlib import Path
+ from typing import Any
+ from llmlib.base_llm import LLM, Message
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+ from PIL import Image
+ from decord import VideoReader, cpu  # pip install decord
+
+ logger = logging.getLogger(__name__)
+
+
+ _model_name = "openbmb/MiniCPM-V-2_6"
+
+
+ class MiniCPM(LLM):
+     temperature: float
+
+     model_id = _model_name
+     requires_gpu_exclusively = True
+
+     def __init__(self, temperature: float = 0.0, model=None) -> None:
+         if model is None:
+             model = _create_model()
+         self.model = model
+         self.tokenizer = _create_tokenizer()
+         self.temperature = temperature
+
+     def chat(self, prompt: str) -> str:
+         return self.complete_msgs2([Message(role="user", msg=prompt)])
+
+     def complete_msgs2(self, msgs: list[Message]) -> str:
+         dict_msgs = [_convert_msg_to_dict(m) for m in msgs]
+         use_sampling = self.temperature > 0.0
+         res = self.model.chat(
+             image=None,
+             msgs=dict_msgs,
+             tokenizer=self.tokenizer,
+             sampling=use_sampling,
+             temperature=self.temperature,
+         )
+         return res
+
+     def video_prompt(self, video_path: Path, prompt: str) -> str:
+         return video_prompt(self, video_path, prompt)
+
+
+ def _create_tokenizer():
+     return AutoTokenizer.from_pretrained(_model_name, trust_remote_code=True)
+
+
+ def _create_model():
+     model = AutoModel.from_pretrained(
+         _model_name,
+         trust_remote_code=True,
+         attn_implementation="flash_attention_2",
+         torch_dtype=torch.bfloat16,
+     )
+     model.eval().cuda()
+     return model
+
+
+ def _convert_msg_to_dict(msg: Message) -> dict:
+     if msg.img is None:
+         content: list[Any] = [msg.msg]
+     else:
+         content = [msg.img.convert("RGB"), msg.msg]
+     return {"role": msg.role, "content": content}
+
+
+ def to_listof_imgs(video_path: Path) -> list[Image.Image]:
+     """
+     Return one frame per second from the video.
+     If the video is longer than MAX_NUM_FRAMES, sample MAX_NUM_FRAMES frames.
+     """
+     MAX_NUM_FRAMES = 64  # if cuda OOM set a smaller number
+     assert video_path.exists(), video_path
+     vr = VideoReader(str(video_path), ctx=cpu(0))
+     sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+     frame_idx = [i for i in range(0, len(vr), sample_fps)]
+     if len(frame_idx) > MAX_NUM_FRAMES:
+         frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
+     imgs = vr.get_batch(frame_idx).asnumpy()
+     imgs = [Image.fromarray(v.astype("uint8")) for v in imgs]
+     return imgs
+
+
+ def uniform_sample(xs, n):
+     gap = len(xs) / n
+     idxs = [int(i * gap + gap / 2) for i in range(n)]
+     return [xs[i] for i in idxs]
+
+
+ def video_prompt(self: MiniCPM, video_path: Path, prompt: str) -> str:
+     imgs = to_listof_imgs(video_path)
+     logger.info("Video turned into %d images", len(imgs))
+     msgs = [
+         {"role": "user", "content": [prompt] + imgs},
+     ]
+     # Set decode params for video
+     params = {}
+     params["use_image_id"] = False
+     params["max_slice_nums"] = 2  # use 1 if cuda OOM and video resolution > 448*448
+     answer = self.model.chat(image=None, msgs=msgs, tokenizer=self.tokenizer, **params)
+     return answer
llmlib/llmlib/model_registry.py ADDED
@@ -0,0 +1,28 @@
+ from typing_extensions import Self
+ from dataclasses import dataclass, field
+ from typing import Callable
+ from .base_llm import LLM
+
+
+ @dataclass
+ class ModelEntry:
+     model_id: str
+     clazz: type[LLM]
+     ctor: Callable[[], LLM]
+     warnings: list[str] = field(default_factory=list)
+
+     @classmethod
+     def from_cls_with_id(cls, T: type[LLM]) -> Self:
+         return cls(model_id=T.model_id, clazz=T, ctor=T, warnings=T.get_warnings())
+
+
+ @dataclass
+ class ModelRegistry:
+     models: list[ModelEntry] = field(default_factory=list)
+
+     def get_entry(self, model_id: str) -> ModelEntry:
+         id2entry = {entry.model_id: entry for entry in self.models}
+         return id2entry[model_id]
+
+     def all_model_ids(self) -> list[str]:
+         return [entry.model_id for entry in self.models]
llmlib/llmlib/openai/openai_completion.py ADDED
@@ -0,0 +1,78 @@
+ import os
+ from PIL.Image import Image
+ from ..base_llm import LLM, Message
+ from ..rest_api.restapi_client import encode_as_png_in_base64
+ from openai import OpenAI, ChatCompletion
+ from multiprocessing import Pool
+
+ _default_model = "gpt-4o-mini"
+
+ client = OpenAI()  # must be outside of the class to avoid pickling issues
+
+
+ class OpenAIModel(LLM):
+     model_ids = [_default_model, "gpt-4o"]
+
+     def __init__(self, model: str = _default_model):
+         self.model = model
+
+     def complete(self, prompt: str) -> str:
+         return complete(model=self.model, prompt=prompt)
+
+     def complete_msgs(self, messages: list[dict], images: list[Image] = []) -> str:
+         return complete_msgs(model=self.model, messages=messages)
+
+     def complete_many(
+         self, prompts: list[str], n_workers: int = os.cpu_count()
+     ) -> list[str]:
+         return complete_many(model=self.model, prompts=prompts, n_workers=n_workers)
+
+     def complete_msgs2(self, msgs: list[Message]) -> str:
+         messages: list[dict] = extract_msgs(msgs)
+         return self.complete_msgs(messages)
+
+
+ def complete_many(
+     model: str, prompts: list[str], n_workers: int = os.cpu_count()
+ ) -> list[str]:
+     print("Calling OpenAI API")
+     with Pool(processes=n_workers) as pool:
+         args = [(model, p) for p in prompts]
+         return pool.starmap(complete, args)
+
+
+ def complete(model: str, prompt: str) -> str:
+     messages = [{"role": "user", "content": prompt}]
+     return complete_msgs(model=model, messages=messages)
+
+
+ def complete_msgs(model: str, messages: list[dict]) -> str:
+     completion: ChatCompletion = client.chat.completions.create(
+         model=model, temperature=0.0, messages=messages
+     )
+     assert len(completion.choices) == 1
+     return completion.choices[0].message.content
+
+
+ def postprocess(response: str) -> str:
+     return response.lower().strip(".").strip()
+
+
+ def extract_msgs(msgs: list[Message]) -> list[dict]:
+     return [extract_msg(m) for m in msgs]
+
+
+ def extract_msg(msg: Message) -> dict:
+     if msg.img is None:
+         return {"role": msg.role, "content": msg.msg}
+     img_in_base64 = encode_as_png_in_base64(msg.img)
+     return {
+         "role": msg.role,
+         "content": [
+             {"type": "text", "text": msg.msg},
+             {
+                 "type": "image_url",
+                 "image_url": {"url": f"data:image/png;base64,{img_in_base64}"},
+             },
+         ],
+     }
llmlib/llmlib/phi3/phi3.py ADDED
@@ -0,0 +1,157 @@
+ from dataclasses import dataclass
+ from typing import Any
+ from llmlib.base_llm import Message
+ from torch import Tensor
+ import torch
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from PIL import Image
+ from llmlib.base_llm import LLM
+ from transformers.image_processing_utils import BatchFeature
+
+ model_id = "microsoft/Phi-3-vision-128k-instruct"
+
+
+ @dataclass
+ class GenConf:
+     max_new_tokens: int = 500
+     temperature: float = 0.0
+
+     def to_dict(self) -> dict[str, Any]:
+         do_sample: bool = self.temperature != 0.0
+         return {
+             "max_new_tokens": self.max_new_tokens,
+             "temperature": self.temperature if do_sample else None,
+             "do_sample": do_sample,
+         }
+
+
+ class Phi3Vision(LLM):
+     model_id = model_id
+     requires_gpu_exclusively = True
+
+     def __init__(self, gen_conf: GenConf | None = None):
+         self.model = create_model()
+         self.processor = create_processor()
+         self.gen_conf = GenConf() if gen_conf is None else gen_conf
+
+     def complete(self, prompt: str) -> str:
+         msg = Message(role="user", msg=prompt)
+         return completion(llm=self, batch=[[msg]])[0]
+
+     def complete_msgs2(self, msgs: list[Message]) -> str:
+         return completion(llm=self, batch=[msgs])[0]
+
+     def complete_batch(self, batch: list[list[Message]]) -> list[str]:
+         return completion(llm=self, batch=batch)
+
+
+ def extract_imgs_and_dicts(msgs: list[Message]) -> tuple[list[Image.Image], list[dict]]:
+     """
+     Phi3 expects placeholders for images in the prompts, in the form <|image_X|>, where X is the image number.
+     It also requires the images as a separate array of PIL images.
+     This function extracts the images from the messages and creates the placeholders.
+     It makes sure to avoid duplication in the images and placeholders.
+     """
+     img_names = list(dict.fromkeys(m.img_name for m in msgs if m.img_name is not None))
+     placeholders = {
+         img_name: f"<|image_{i}|>" for i, img_name in enumerate(img_names, 1)
+     }
+     imgs = {}
+     for msg in msgs:
+         if msg.img is not None and msg.img_name not in imgs:
+             imgs[msg.img_name] = msg.img
+     images = list(imgs.values())
+
+     messages: list[dict] = []  # entries are {"role": str, "content": str}
+     for m in msgs:
+         if m.img is not None and m.img_name is not None:
+             img_placeholder = placeholders[m.img_name]
+             content = f"{img_placeholder}\n{m.msg}"
+         else:
+             content = m.msg
+         messages.append({"role": m.role, "content": content})
+     return images, messages
+
+
+ def create_model(model_id: str = model_id):
+     return AutoModelForCausalLM.from_pretrained(
+         model_id, device_map="cuda", trust_remote_code=True, torch_dtype="auto"
+     )
+
+
+ def create_processor(model_id: str = model_id):
+     return AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+
+ def convert_to_messages(prompts: list[str]) -> list[list[dict]]:
+     return [[{"role": "user", "content": prompt}] for prompt in prompts]
+
+
+ def completion(llm: Phi3Vision, batch: list[list[Message]]) -> list[str]:
+     reject_invalid_batches(batch)
+     listof_inputs: list[BatchFeature] = []
+     for messages in batch:
+         images, messages_dicts = extract_imgs_and_dicts(messages)
+         prompt: str = llm.processor.tokenizer.apply_chat_template(
+             messages_dicts, tokenize=False, add_generation_prompt=True
+         )
+         imgs = None if len(images) == 0 else images
+         inputs = llm.processor(prompt, imgs, return_tensors="pt").to("cuda")
+         listof_inputs.append(inputs)
+
+     pad_token_id = llm.processor.tokenizer.pad_token_id
+     inputs = stack_and_pad_inputs(listof_inputs, pad_token_id=pad_token_id)
+
+     generate_ids: Tensor = llm.model.generate(
+         **inputs,
+         eos_token_id=llm.processor.tokenizer.eos_token_id,
+         **llm.gen_conf.to_dict(),
+     )
+     # the prompt is included in the output, so we need to drop it.
+     generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+
+     responses: list[str] = llm.processor.batch_decode(
+         generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+     return responses
+
+
+ def reject_invalid_batches(batch: list[list[Message]]) -> None:
+     """
+     Valid batches are:
+     - a batch of length 1, or
+     - batches with only a single message per entry, AND
+       - all messages have an image, or
+       - all messages are text only.
+     """
+     if len(batch) <= 1:
+         return
+     if any(len(msgs) != 1 for msgs in batch):
+         raise ValueError("Batch must contain only one message per entry.")
+     any_msg_has_img = any(msg.img is not None for msgs in batch for msg in msgs)
+     any_msg_is_no_img = any(msg.img is None for msgs in batch for msg in msgs)
+     if any_msg_has_img and any_msg_is_no_img:
+         raise ValueError("Batch must contain an image in every entry or none at all.")
+
+
+ def pad_left(seqs: list[torch.Tensor], pad_token_id: int) -> torch.Tensor:
+     max_len = max(len(seq) for seq in seqs)
+     padded = torch.full((len(seqs), max_len), pad_token_id)
+     for i, seq in enumerate(seqs):
+         padded[i, -len(seq) :] = seq
+     return padded
+
+
+ def stack_and_pad_inputs(inputs: list[BatchFeature], pad_token_id: int) -> BatchFeature:
+     listof_input_ids = [i.input_ids[0] for i in inputs]
+     new_input_ids = pad_left(listof_input_ids, pad_token_id=pad_token_id)
+     data = dict(
+         input_ids=new_input_ids,
+         attention_mask=(new_input_ids != pad_token_id).long(),
+     )
+     has_imgs: bool = "pixel_values" in inputs[0]
+     if has_imgs:
+         data["pixel_values"] = torch.cat([i.pixel_values for i in inputs], dim=0)
+         data["image_sizes"] = torch.cat([i.image_sizes for i in inputs], dim=0)
+
+     return BatchFeature(data).to("cuda")
llmlib/llmlib/phi3/phi35.py ADDED
@@ -0,0 +1,72 @@
+ from PIL import Image
+ from llmlib.phi3.phi3 import stack_and_pad_inputs
+ import requests
+ from transformers import AutoModelForCausalLM
+ from transformers import AutoProcessor
+ from transformers.image_processing_utils import BatchFeature
+
+ model_id = "microsoft/Phi-3.5-vision-instruct"
+
+
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     device_map="cuda",
+     trust_remote_code=True,
+     torch_dtype="auto",
+     _attn_implementation="flash_attention_2",
+ )
+
+ # for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True, num_crops=4)
+
+ links = [
+     "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg",
+     "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-2-2048.jpg",
+     "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-3-2048.jpg",
+ ]
+ images = [Image.open(requests.get(link, stream=True).raw) for link in links]
+ batch = [
+     [{"role": "user", "content": "<|image_1|>Who is mentioned in this picture?"}],
+     [{"role": "user", "content": "<|image_1|>What is the title of this image?"}],
+     [{"role": "user", "content": "<|image_1|>What icons are shown in this image?"}],
+ ]
+ # batch = [
+ #     [{"role": "user", "content": "What is the capital of France?"}],
+ #     [{"role": "user", "content": "How does one make a cookie that is vegetarian?"}],
+ # ]
+ # images = [None, None]
+
+ # BatchFeature(s) are the output of the processor, which is used as input to the model.
+ listof_inputs: list[BatchFeature] = []
+ for messages, image in zip(batch, images):
+     prompt = processor.tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     images_ = None if image is None else [image]
+     inputs = processor(prompt, images_, return_tensors="pt").to("cuda:0")
+     listof_inputs.append(inputs)
+
+
+ inputs = stack_and_pad_inputs(
+     listof_inputs, pad_token_id=processor.tokenizer.pad_token_id
+ )
+
+ generation_args = {
+     "max_new_tokens": 1000,
+     "temperature": None,
+     "do_sample": False,
+ }
+
+ generate_ids = model.generate(
+     **inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args
+ )
+
+ generate_ids = generate_ids[:, inputs["input_ids"].shape[1] :]
+ responses: list[str] = processor.batch_decode(
+     generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+ )
+
+ for p, r in zip(batch, responses):
+     print(p)
+     print(r)
+     print()
llmlib/llmlib/pixtral_demo.py ADDED
@@ -0,0 +1,26 @@
+ from vllm import LLM
+ from vllm.sampling_params import SamplingParams
+
+ if __name__ == "__main__":
+     model_name = "mistralai/Pixtral-12B-2409"
+
+     sampling_params = SamplingParams(max_tokens=8192)
+
+     llm = LLM(model=model_name, gpu_memory_utilization=0.1, tokenizer_mode="mistral")
+
+     prompt = "Describe this image in one sentence."
+     image_url = "https://picsum.photos/id/237/200/300"
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": prompt},
+                 {"type": "image_url", "image_url": {"url": image_url}},
+             ],
+         },
+     ]
+
+     outputs = llm.chat(messages, sampling_params=sampling_params)
+
+     print(outputs[0].outputs[0].text)
llmlib/llmlib/rest_api/__init__.py ADDED
File without changes
llmlib/llmlib/rest_api/restapi_client.py ADDED
@@ -0,0 +1,97 @@
+ import base64
+ import io
+ import logging
+ import os
+ import requests
+ from PIL import Image
+ from ..base_llm import Message
+ from ..bundler_request import BundlerRequest
+ from pydantic import BaseModel
+ from typing import Literal
+
+ logger = logging.getLogger(__name__)
+
+
+ def encode_as_png_in_base64(img: Image.Image) -> str:
+     stream = io.BytesIO()
+     img.save(stream, format="PNG")
+     return base64.b64encode(stream.getvalue()).decode("utf-8")
+
+
+ class MsgDto(BaseModel):
+     role: Literal["user", "assistant"]
+     msg: str
+     img_name: str | None = None
+     img_str: str | None = None
+
+     @classmethod
+     def from_bundler_msg(cls, msg: Message) -> "MsgDto":
+         return cls(
+             role=msg.role,
+             msg=msg.msg,
+             img_name=msg.img_name,
+             img_str=encode_as_png_in_base64(msg.img) if msg.img is not None else None,
+         )
+
+
+ def to_bundler_msg(msg: MsgDto) -> Message:
+     return Message(
+         role=msg.role,
+         msg=msg.msg,
+         img_name=msg.img_name,
+         img=Image.open(io.BytesIO(base64.b64decode(msg.img_str)))
+         if msg.img_str
+         else None,
+     )
+
+
+ class RequestDto(BaseModel):
+     model: str
+     msgs: list[MsgDto]
+
+     @classmethod
+     def from_bundler_request(cls, breq: BundlerRequest) -> "RequestDto":
+         return cls(
+             model=breq.model_id,
+             msgs=[MsgDto.from_bundler_msg(msg) for msg in breq.msgs],
+         )
+
+     model_config = {
+         "json_schema_extra": {
+             "examples": [
+                 {
+                     "model": "microsoft/Phi-3-vision-128k-instruct",
+                     "msgs": [{"role": "user", "msg": "What is the capital of France?"}],
+                 }
+             ]
+         }
+     }
+
+
+ _api_host = os.environ.get("LLMS_REST_API_HOST", "http://localhost") + ":8030"
+
+
+ def _headers():
+     return {"X-API-Key": os.environ["LLMS_REST_API_KEY"]}
+
+
+ def get_completion_from_rest_api(
+     breq: BundlerRequest, source=requests, **kwargs
+ ) -> requests.Response:
+     req = RequestDto.from_bundler_request(breq)
+     url = _api_host + "/completion/"
+     logger.info(f"Sending completion request to '{url}'.")
+     return source.post(
+         url=url,
+         json=req.model_dump(),
+         headers=_headers(),
+         **kwargs,
+     )
+
+
+ def get_models(source=requests) -> requests.Response:
+     return source.get(url=_api_host + "/models/", headers=_headers())
+
+
+ def clear_gpu(source=requests) -> requests.Response:
+     return source.post(url=_api_host + "/clear-gpu/", headers=_headers())
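
A hedged sketch (not part of this commit) of how the client helpers above could be called against a running inference server. It assumes `LLMS_REST_API_KEY` (and optionally `LLMS_REST_API_HOST`) are set in the environment, matching the server defined in `restapi_server.py`.

```python
from llmlib.base_llm import Message
from llmlib.bundler_request import BundlerRequest
from llmlib.rest_api.restapi_client import get_completion_from_rest_api, get_models

# List the model ids the server exposes via GET /models/
print(get_models().json())

# Ask one model for a completion via POST /completion/
breq = BundlerRequest(
    model_id="microsoft/Phi-3-vision-128k-instruct",
    msgs=[Message.from_prompt("What is the capital of France?")],
)
resp = get_completion_from_rest_api(breq)
print(resp.json()["response"])
```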
llmlib/llmlib/rest_api/restapi_server.py ADDED
@@ -0,0 +1,56 @@
+ import torch
+ from fastapi import Depends, FastAPI, HTTPException, Security
+ from fastapi.responses import JSONResponse
+ from fastapi.security import APIKeyHeader
+ from llmlib.bundler import Bundler
+ from llmlib.bundler_request import BundlerRequest
+ from llmlib.rest_api.restapi_client import RequestDto, to_bundler_msg
+ from llmlib.runtime import filled_model_registry
+
+
+ import os
+
+ import bugsnag
+ from bugsnag.asgi import BugsnagMiddleware
+
+
+ def create_fastapi_app() -> FastAPI:
+     bugsnag.configure(api_key=os.environ["BUGSNAG_API_KEY"])
+
+     bundler = Bundler(registry=filled_model_registry())
+     app = FastAPI()
+     app.add_middleware(BugsnagMiddleware)
+
+     header = APIKeyHeader(name="X-API-Key")
+
+     def is_authorized(api_key: str = Security(header)) -> bool:
+         if api_key != os.environ["LLMS_REST_API_KEY"]:
+             raise HTTPException(status_code=401, detail="Invalid API Key")
+         return True
+
+     @app.get("/models/")
+     def _(_=Depends(is_authorized)):
+         return bundler.registry.all_model_ids()
+
+     @app.post("/completion/")
+     def _(req: RequestDto, _=Depends(is_authorized)):
+         breq = BundlerRequest(
+             model_id=req.model, msgs=[to_bundler_msg(msg) for msg in req.msgs]
+         )
+         return {"response": bundler.get_response(breq)}
+
+     @app.post("/clear-gpu/")
+     def _(_=Depends(is_authorized)):
+         bundler.clear_model_on_gpu()
+         return {"status": "success"}
+
+     @app.exception_handler(torch.cuda.OutOfMemoryError)
+     def _(req, exc):
+         return JSONResponse(
+             status_code=500,
+             content={
+                 "detail": "Error. GPU out of memory. There might be another workload running on the GPU."
+             },
+         )
+
+     return app
llmlib/llmlib/runtime.py ADDED
@@ -0,0 +1,21 @@
+ from .minicpm import MiniCPM
+ from .llama3 import LLama3Vision8B
+ from .model_registry import ModelEntry, ModelRegistry
+ from .openai.openai_completion import OpenAIModel
+ from .phi3.phi3 import Phi3Vision
+
+
+ def filled_model_registry() -> ModelRegistry:
+     return ModelRegistry(
+         models=[
+             ModelEntry.from_cls_with_id(Phi3Vision),
+             ModelEntry.from_cls_with_id(MiniCPM),
+             ModelEntry.from_cls_with_id(LLama3Vision8B),
+             *[
+                 ModelEntry(
+                     model_id=id_, clazz=OpenAIModel, ctor=lambda: OpenAIModel(model=id_)
+                 )
+                 for id_ in OpenAIModel.model_ids
+             ],
+         ]
+     )
llmlib/llmlib/whisper.py ADDED
@@ -0,0 +1,74 @@
+ from dataclasses import dataclass, field
+ from logging import getLogger
+ from typing import Any
+ import warnings
+ import torch
+ from transformers import (
+     AutoModelForSpeechSeq2Seq,
+     AutoProcessor,
+     pipeline,
+     AutomaticSpeechRecognitionPipeline,
+ )
+
+ logger = getLogger(__name__)
+
+
+ def create_whisper_pipe() -> AutomaticSpeechRecognitionPipeline:
+     device = "cuda"
+     torch_dtype = torch.float16
+
+     model = AutoModelForSpeechSeq2Seq.from_pretrained(
+         model_id,
+         torch_dtype=torch_dtype,
+         low_cpu_mem_usage=True,
+         use_safetensors=True,
+         attn_implementation="flash_attention_2",
+     )
+     model.to(device)
+
+     processor = AutoProcessor.from_pretrained(model_id)
+
+     pipe = pipeline(
+         "automatic-speech-recognition",
+         model=model,
+         tokenizer=processor.tokenizer,
+         feature_extractor=processor.feature_extractor,
+         torch_dtype=torch_dtype,
+         device=device,
+     )
+
+     return pipe
+
+
+ model_id = "openai/whisper-large-v3-turbo"
+
+
+ @dataclass
+ class Whisper:
+     model_id = model_id
+
+     pipe: AutomaticSpeechRecognitionPipeline = field(
+         default_factory=create_whisper_pipe
+     )
+
+     def transcribe_file(self, file: str, translate=False) -> str:
+         assert isinstance(file, str)
+         logger.info("Transcribing file: %s", file)
+         try:
+             return self._transcribe(file, translate, return_timestamps=False)
+         except ValueError as e:
+             if "Please either pass `return_timestamps=True`" in repr(e):
+                 logger.info("File is >30s, transcribing with timestamps: %s", file)
+                 return self._transcribe(file, translate, return_timestamps=True)
+             raise
+
+     def _transcribe(self, file: str, translate: bool, return_timestamps: bool) -> str:
+         kwargs: dict[str, Any] = {"return_timestamps": return_timestamps}
+         if translate:
+             kwargs["generate_kwargs"] = {"language": "english"}
+         # ignore this warning:
+         # .../site-packages/transformers/models/whisper/generation_whisper.py:496: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.
+         with warnings.catch_warnings(action="ignore", category=FutureWarning):
+             # data["chunks"] contains the timestamped transcriptions
+             data = self.pipe(file, **kwargs)
+         return data["text"].strip()
llmlib/pyproject.toml ADDED
@@ -0,0 +1,21 @@
+ [tool.poetry]
+ name = "llmlib"
+ version = "0.1.0"
+ description = ""
+ authors = ["Tomas Ruiz <[email protected]>"]
+ readme = "README.md"
+
+ [tool.poetry.dependencies]
+ python = "^3.11"
+ bugsnag = "^4.7.1"
+ decord = "^0.6.0"
+ google-cloud-aiplatform = "^1.64"
+ # I cannot add the dependencies below. I suspect that torch is a build-time dependency for flash-attn, or something like that.
+ # transformers = "^4.44.2"
+ # accelerate = "^0.34.2"
+ # flash-attn = "^2.6.3"
+ # torch = "^2.4.0"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
login_mask_simple.py ADDED
@@ -0,0 +1,50 @@
+ """
+ WARNING: This file is duplicated in the projects: llm-app, tiktok. Make sure changes are reflected in all projects!
+
+ Copied from https://docs.streamlit.io/knowledge-base/deploy/authentication-without-sso
+ """
+
+ from functools import cache
+ import logging
+ import os
+ import streamlit as st
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def login_form():
+     with st.form("Credentials"):
+         st.text_input("Password", type="password", key="password")
+         st.form_submit_button("Log in", on_click=password_entered)
+
+
+ def password_entered():
+     correct_pw = os.environ["LLMS_REST_API_KEY"]
+     is_correct: bool = st.session_state.pop("password") == correct_pw
+     st.session_state["password_correct"] = is_correct
+
+
+ def check_password() -> bool:
+     """Return `True` if the user is allowed to access the app, `False` otherwise."""
+     skip_pw: bool = os.environ.get("USE_STREAMLIT_PASSWORD", "true").lower() == "false"
+     if skip_pw:
+         log_password_check_skipped()
+         return True
+
+     """Returns `True` if the user had a correct password."""
+
+     # Return True if the username + password is validated.
+     if st.session_state.get("password_correct", False):
+         return True
+
+     # Show inputs for username + password.
+     login_form()
+     if "password_correct" in st.session_state:
+         st.error("😕 Password incorrect")
+     return False
+
+
+ @cache  # Print only once per session
+ def log_password_check_skipped():
+     logger.info("Skipping password check because USE_STREAMLIT_PASSWORD=false.")
readme/llm-app-demo.mp4 ADDED
Binary file (884 kB)
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ pytest
+ deepdiff
+ pillow
+ openai
+ transformers
+ torch
+ streamlit
+ bitsandbytes
+ accelerate
+ flash-attn
+ fastapi[standard]
+ ./llmlib
rest_api.py ADDED
@@ -0,0 +1,4 @@
+ from fastapi import FastAPI
+ from llmlib.rest_api.restapi_server import create_fastapi_app
+
+ app: FastAPI = create_fastapi_app()
st_app.py ADDED
@@ -0,0 +1,97 @@
+ from PIL import Image
+ import streamlit as st
+ from llmlib.runtime import filled_model_registry
+ from llmlib.model_registry import ModelEntry, ModelRegistry
+ from llmlib.base_llm import Message
+ from llmlib.bundler import Bundler
+ from llmlib.bundler_request import BundlerRequest
+ from login_mask_simple import check_password
+
+ if not check_password():
+     st.stop()
+
+ st.set_page_config(page_title="LLM App", layout="wide")
+
+ st.title("LLM App")
+
+
+ model_registry: ModelRegistry = filled_model_registry()
+
+
+ @st.cache_resource()
+ def create_model_bundler() -> Bundler:
+     return Bundler(registry=model_registry)
+
+
+ def display_warnings(r: ModelRegistry, model_id: str) -> None:
+     e1: ModelEntry = r.get_entry(model_id)
+     if len(e1.warnings) > 0:
+         st.warning(" \n".join(e1.warnings))
+
+
+ cs = st.columns(2)
+ with cs[0]:
+     model1_id: str = st.selectbox("Select model", model_registry.all_model_ids())
+     display_warnings(model_registry, model1_id)
+ with cs[1]:
+     if "img-key" not in st.session_state:
+         st.session_state["img-key"] = 0
+     image = st.file_uploader("Include an image", key=st.session_state["img-key"])
+
+ if "messages1" not in st.session_state:
+     st.session_state.messages1 = []  # list[Message]
+     st.session_state.messages2 = []  # list[Message]
+
+ if st.button("Restart chat"):
+     st.session_state.messages1 = []  # list[Message]
+     st.session_state.messages2 = []  # list[Message]
+
+
+ def render_messages(msgs: list[Message]) -> None:
+     for msg in msgs:
+         render_message(msg)
+
+
+ def render_message(msg: Message):
+     with st.chat_message(msg.role):
+         if msg.img_name is not None:
+             render_img(msg)
+         st.markdown(msg.msg)
+
+
+ def render_img(msg: Message):
+     st.image(msg.img, caption=msg.img_name, width=400)
+
+
+ n_cols = 1
+ cs = st.columns(n_cols)
+ render_messages(st.session_state.messages1)
+
+ prompt = st.chat_input("Type here")
+ if prompt is None:
+     st.stop()
+
+ msg = Message(
+     role="user",
+     msg=prompt,
+     img_name=image.name if image is not None else None,
+     img=Image.open(image) if image is not None else None,
+ )
+
+ if image is not None:
+     st.session_state["img-key"] += 1
+
+ st.session_state.messages1.append(msg)
+ render_message(msg)
+
+ model_bundler: Bundler = create_model_bundler()
+
+ with st.spinner("Initializing model..."):
+     model_bundler.set_model_on_gpu(model_id=model1_id)
+
+ with st.spinner("Generating response..."):
+     req = BundlerRequest(model_id=model1_id, msgs=st.session_state.messages1)
+     response = model_bundler.get_response(req)
+     msg = Message(role="assistant", msg=response)
+     st.session_state.messages1.append(msg)
+     render_message(msg)
test-files/.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.jpg
+ *.png
+ *.mp3
+ *.mp4
+ *.flac
test-files/toxicity-prompt.txt ADDED
@@ -0,0 +1 @@
+ Evaluate whether this image could potentially be harmful and if so, explain why
tests/__init__.py ADDED
File without changes
tests/helpers.py ADDED
@@ -0,0 +1,123 @@
+ import os
+ from pathlib import Path
+ import PIL
+ from llmlib.base_llm import LLM, Message
+ import pytest
+
+
+ def assert_model_knows_capital_of_france(model: LLM) -> None:
+     response: str = model.complete_msgs2(
+         msgs=[Message(role="user", msg="What is the capital of France?")]
+     )
+     assert "paris" in response.lower()
+
+
+ def assert_model_can_answer_batch_of_text_prompts(model: LLM) -> None:
+     prompts = [
+         "What is the capital of France?",
+         "What continent is south of Europe?",
+         "What are the two tallest mountains in the world?",
+     ]
+     batch = [[Message.from_prompt(prompt)] for prompt in prompts]
+     responses = model.complete_batch(batch=batch)
+     assert len(responses) == 3
+     assert "paris" in responses[0].lower()
+     assert "africa" in responses[1].lower()
+     assert "everest" in responses[2].lower()
+
+
+ def assert_model_can_answer_batch_of_img_prompts(model: LLM) -> None:
+     batch = [
+         [pyramid_message()],
+         [forest_message()],
+         [fish_message()],
+     ]
+     responses = model.complete_batch(batch=batch)
+     assert len(responses) == 3
+     assert "pyramid" in responses[0].lower()
+     assert "forest" in responses[1].lower()
+     assert "fish" in responses[2].lower()
+
+
+ def assert_model_rejects_unsupported_batches(model: LLM) -> None:
+     mixed_textonly_and_img_batch = [
+         [Message.from_prompt("What is the capital of France?")],
+         [pyramid_message()],
+     ]
+     err_msg = "Batch must contain an image in every entry or none at all."
+     with pytest.raises(ValueError, match=err_msg):
+         model.complete_batch(mixed_textonly_and_img_batch)
+
+
+ def assert_model_recognizes_pyramid_in_image(model: LLM):
+     msg = pyramid_message()
+     answer: str = model.complete_msgs2(msgs=[msg])
+     assert "pyramid" in answer.lower()
+
+
+ def assert_model_recognizes_afd_in_video(model: LLM):
+     video_path = file_for_test("video.mp4")
+     question = "Describe the video in english"
+     answer: str = model.video_prompt(video_path, question)
+     assert "alternative für deutschland" in answer.lower(), answer
+
+
+ def get_mona_lisa_completion(model: LLM) -> str:
+     msg: Message = mona_lisa_message()
+     answer: str = model.complete_msgs2(msgs=[msg])
+     return answer
+
+
+ def mona_lisa_message() -> Message:
+     _, img = mona_lisa_filename_and_img()
+     prompt = "What is in the image?"
+     msg = Message(role="user", msg=prompt, img=img, img_name="")
+     return msg
+
+
+ def pyramid_message() -> Message:
+     img_name = "pyramid.jpg"
+     img = get_test_img(img_name)
+     msg = Message(role="user", msg="What is in the image?", img=img, img_name="")
+     return msg
+
+
+ def forest_message() -> Message:
+     img_name = "forest.jpg"
+     img = get_test_img(img_name)
+     msg = Message(
+         role="user", msg="Describe what you see in the picture.", img=img, img_name=""
+     )
+     return msg
+
+
+ def fish_message() -> Message:
+     img_name = "fish.jpg"
+     img = get_test_img(img_name)
+     msg = Message(
+         role="user",
+         msg="What animal is depicted and where does it live?",
+         img=img,
+         img_name="",
+     )
+     return msg
+
+
+ def mona_lisa_filename_and_img() -> tuple[str, PIL.Image.Image]:
+     img_name = "mona-lisa.png"
+     img = get_test_img(img_name)
+     return img_name, img
+
+
+ def get_test_img(name: str) -> PIL.Image.Image:
+     path = file_for_test(name)
+     return PIL.Image.open(path)
+
+
+ def file_for_test(name: str) -> Path:
+     return Path(__file__).parent.parent / "test-files" / name
+
+
+ def is_ci() -> bool:
+     is_ci_str: str = os.environ.get("CI", "false").lower()
+     return is_ci_str != "false"
tests/test_bundler.py ADDED
@@ -0,0 +1,90 @@
+ from dataclasses import dataclass
+ from llmlib.bundler import Bundler
+ from llmlib.bundler_request import BundlerRequest
+ from llmlib.base_llm import LLM, Message
+ import pytest
+ from llmlib.model_registry import ModelEntry, ModelRegistry
+
+
+ def test_model_id_on_gpu():
+     b = Bundler(filled_model_registry())
+     assert b.id_of_model_on_gpu() is None
+     b.set_model_on_gpu(GpuLLM.model_id)
+     assert b.id_of_model_on_gpu() == GpuLLM.model_id
+
+
+ def test_get_response():
+     b = Bundler(filled_model_registry())
+     msgs = [Message(role="user", msg="hello")]
+     request = BundlerRequest(model_id=GpuLLM.model_id, msgs=msgs)
+     expected_response = GpuLLM().complete_msgs2(msgs)
+     actual_response: str = b.get_response(request)
+     assert actual_response == expected_response
+     assert b.id_of_model_on_gpu() == GpuLLM.model_id
+
+
+ def test_bundler_multiple_responses():
+     b = Bundler(filled_model_registry())
+     models = [GpuLLM(), GpuLLM2(), NonGpuLLM()]
+     msgs = [Message(role="user", msg="hello")]
+
+     expected_responses = [m.complete_msgs2(msgs) for m in models]
+     assert expected_responses[0] != expected_responses[1]
+
+     actual_responses = [
+         b.get_response(BundlerRequest(model_id=m.model_id, msgs=msgs)) for m in models
+     ]
+     assert actual_responses == expected_responses
+
+     last_gpu_model = [m for m in models if m.requires_gpu_exclusively][-1]
+     assert b.id_of_model_on_gpu() == last_gpu_model.model_id
+
+
+ def test_set_model_on_gpu():
+     b = Bundler(filled_model_registry())
+     b.set_model_on_gpu(GpuLLM.model_id)
+     assert b.id_of_model_on_gpu() == GpuLLM.model_id
+
+     with pytest.raises(AssertionError):
+         b.set_model_on_gpu("invalid")
+     assert b.id_of_model_on_gpu() == GpuLLM.model_id
+
+     b.set_model_on_gpu(NonGpuLLM.model_id)
+     gpu_model_is_still_loaded: bool = b.id_of_model_on_gpu() == GpuLLM.model_id
+     assert gpu_model_is_still_loaded
+
+
+ def filled_model_registry() -> ModelRegistry:
+     model_entries = [
+         ModelEntry.from_cls_with_id(GpuLLM),
+         ModelEntry.from_cls_with_id(GpuLLM2),
+         ModelEntry.from_cls_with_id(NonGpuLLM),
+     ]
+     return ModelRegistry(model_entries)
+
+
+ @dataclass
+ class GpuLLM(LLM):
+     model_id = "gpu-llm-model"
+     requires_gpu_exclusively = True
+
+     def complete_msgs2(self, msgs: list[Message]) -> str:
+         return "gpu msg"
+
+
+ @dataclass
+ class GpuLLM2(LLM):
+     model_id = "gpu-llm-model-2"
+     requires_gpu_exclusively = True
+
+     def complete_msgs2(self, msgs: list[Message]) -> str:
+         return "gpu msg 2"
+
+
+ @dataclass
+ class NonGpuLLM(LLM):
+     model_id = "non-gpu-llm-model"
+     requires_gpu_exclusively = False
+
+     def complete_msgs2(self, msgs: list[Message]) -> str:
+         return "non-gpu message"
tests/test_gemini.py ADDED
@@ -0,0 +1,25 @@
+ from pathlib import Path
+ from llmlib.gemini.media_description import Request
+ import pytest
+
+ from tests.helpers import file_for_test, is_ci
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="Avoid costs")
+ def test_gemini_vision():
+     files: list[Path] = [
+         file_for_test("pyramid.jpg"),
+         file_for_test("mona-lisa.png"),
+         file_for_test("some-audio.mp3"),
+     ]
+
+     for path in files:
+         assert path.exists()
+
+     req = Request(
+         media_files=files, prompt="Describe this combined images/audio/text in detail."
+     )
+     description: str = req.fetch_media_description().lower()
+     assert "pyramid" in description
+     assert "mona lisa" in description
+     assert "horses are very fast" in description
tests/test_llama3.py ADDED
@@ -0,0 +1,26 @@
+ from llmlib.base_llm import LLM
+ import pytest
+
+ from llmlib.llama3.llama3_vision_70b_quantized import LLama3Vision70BQuantized
+ from llmlib.llama3.llama3_vision_8b import LLama3Vision8B
+
+ from .helpers import (
+     assert_model_knows_capital_of_france,
+     assert_model_recognizes_pyramid_in_image,
+     is_ci,
+ )
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_llama_8b():
+     model: LLM = LLama3Vision8B()
+     assert_model_knows_capital_of_france(model)
+     assert_model_recognizes_pyramid_in_image(model)
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_llama_70b_quantized():
+     model: LLM = LLama3Vision70BQuantized()
+     assert_model_knows_capital_of_france(model)
+     # model cannot recognize mona lisa yet
+     # assert_model_recognized_mona_lisa_in_image(model)
tests/test_minicpm.py ADDED
@@ -0,0 +1,16 @@
+ from llmlib.minicpm import MiniCPM
+ import pytest
+ from .helpers import (
+     assert_model_knows_capital_of_france,
+     assert_model_recognizes_afd_in_video,
+     assert_model_recognizes_pyramid_in_image,
+     is_ci,
+ )
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_minicpm_vision():
+     model = MiniCPM()
+     assert_model_knows_capital_of_france(model)
+     assert_model_recognizes_pyramid_in_image(model)
+     assert_model_recognizes_afd_in_video(model)
tests/test_openai.py ADDED
@@ -0,0 +1,49 @@
+ from llmlib.base_llm import LLM, Message
+ from PIL import Image
+ from llmlib.rest_api.restapi_client import encode_as_png_in_base64
+ import pytest
+ from llmlib.openai.openai_completion import (
+     OpenAIModel,
+     extract_msgs,
+ )
+ from deepdiff import DeepDiff
+
+ from .helpers import (
+     assert_model_knows_capital_of_france,
+     assert_model_recognizes_pyramid_in_image,
+     is_ci,
+ )
+
+
+ def test_extract_msgs():
+     img = Image.new(mode="RGB", size=(1, 1))
+     msgs = [
+         Message(role="user", msg="Hi"),
+         Message(role="assistant", msg="Hi!"),
+         Message(role="user", msg="Describe:", img=img, img_name="img1"),
+     ]
+     messages = extract_msgs(msgs)
+     expected_msgs = [
+         {"role": "user", "content": "Hi"},
+         {"role": "assistant", "content": "Hi!"},
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": "Describe:"},
+                 {
+                     "type": "image_url",
+                     "image_url": {
+                         "url": f"data:image/png;base64,{encode_as_png_in_base64(img)}",
+                     },
+                 },
+             ],
+         },
+     ]
+     assert DeepDiff(messages, expected_msgs) == {}
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="Avoid costs")
+ def test_openai_vision():
+     model: LLM = OpenAIModel()
+     assert_model_knows_capital_of_france(model)
+     assert_model_recognizes_pyramid_in_image(model)
tests/test_phi_3.py ADDED
@@ -0,0 +1,70 @@
+ from llmlib.base_llm import Message
+ from PIL import Image
+
+ from llmlib.phi3.phi3 import GenConf, Phi3Vision, extract_imgs_and_dicts, pad_left
+ import pytest
+ import torch
+
+ from .helpers import (
+     assert_model_can_answer_batch_of_img_prompts,
+     assert_model_can_answer_batch_of_text_prompts,
+     assert_model_knows_capital_of_france,
+     assert_model_rejects_unsupported_batches,
+     get_mona_lisa_completion,
+     is_ci,
+ )
+
+
+ def test_extract_imgs_and_dicts():
+     img1 = Image.new(mode="RGB", size=(1, 1))
+     img2 = Image.new(mode="RGB", size=(1, 1))
+     msgs = [
+         a_msg(),
+         a_msg(img=img1, img_name="img1"),
+         a_msg(img=img2, img_name="img2"),
+         a_msg(),
+         a_msg(img=img1, img_name="img1"),
+         a_msg(img=img2, img_name="img2"),
+     ]
+     images, messages = extract_imgs_and_dicts(msgs)
+     assert len(images) == 2
+     assert len(messages) == 6
+     assert "<|image_1|>" in messages[1]["content"]
+     assert "<|image_1|>" in messages[4]["content"]
+     assert "<|image_2|>" in messages[5]["content"]
+     assert "<|image_2|>" in messages[2]["content"]
+
+
+ def a_msg(img: Image.Image | None = None, img_name: str | None = None) -> Message:
+     return Message(role="user", msg="", img=img, img_name=img_name)
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_phi3_vision(model: Phi3Vision):
+     assert_model_knows_capital_of_france(model)
+     answer: str = get_mona_lisa_completion(model)
+     assert isinstance(answer, str)
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_phi3_batching(model: Phi3Vision):
+     assert_model_can_answer_batch_of_text_prompts(model)
+     assert_model_can_answer_batch_of_img_prompts(model)
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_phi3_invalid_input(model: Phi3Vision):
+     assert_model_rejects_unsupported_batches(model)
+
+
+ @pytest.fixture(scope="module")
+ def model():
+     yield Phi3Vision(GenConf(max_new_tokens=30))
+
+
+ def test_padleft():
+     pad_token = -1
+     seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5]), torch.tensor([6])]
+     expected = torch.tensor([[1, 2, 3], [pad_token, 4, 5], [pad_token, pad_token, 6]])
+     actual = pad_left(seqs, pad_token)
+     assert torch.equal(actual, expected)
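
test_padleft fixes the expected behaviour of pad_left: variable-length token sequences are left-padded with pad_token up to the longest sequence and stacked into one tensor, so that the ends of the prompts stay aligned for batched decoding. A rough reference sketch of that behaviour; the real pad_left in llmlib.phi3.phi3 may differ in its details:

    import torch

    def pad_left_sketch(seqs: list[torch.Tensor], pad_token: int) -> torch.Tensor:
        # Left-pad every 1-D sequence to the length of the longest one, then stack.
        max_len = max(len(s) for s in seqs)
        padded = [
            torch.cat([torch.full((max_len - len(s),), pad_token, dtype=s.dtype), s])
            for s in seqs
        ]
        return torch.stack(padded)

    # pad_left_sketch([torch.tensor([1, 2, 3]), torch.tensor([4, 5]), torch.tensor([6])], -1)
    # -> tensor([[ 1,  2,  3], [-1,  4,  5], [-1, -1,  6]])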
tests/test_rest_api.py ADDED
@@ -0,0 +1,40 @@
+ from fastapi.testclient import TestClient
+ from llmlib.bundler_request import BundlerRequest
+ import llmlib.rest_api.restapi_client as llmclient
+ from llmlib.rest_api.restapi_server import create_fastapi_app
+ from llmlib.phi3.phi3 import Phi3Vision
+ import pytest
+ from .helpers import is_ci, mona_lisa_message
+
+
+ def app():
+     return TestClient(create_fastapi_app())
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_rest_api_get_completion():
+     breq: BundlerRequest = _mona_lisa_request()
+     response = llmclient.get_completion_from_rest_api(source=app(), breq=breq)
+     assert response.status_code == 200, response.content
+     assert "portrait" in response.json()["response"].lower()
+
+
+ def test_rest_api_get_models():
+     response = llmclient.get_models(source=app())
+     assert response.status_code == 200, response.content
+     assert len(response.json()) > 3
+
+
+ @pytest.mark.skip(reason="This test requires the REST API to be running")
+ def test_rest_api_integration_test():
+     breq: BundlerRequest = _mona_lisa_request()
+     response = llmclient.get_completion_from_rest_api(breq)
+     llmclient.clear_gpu()
+     assert response.status_code == 200, response.content
+     assert "portrait" in response.json()["response"].lower()
+
+
+ def _mona_lisa_request() -> BundlerRequest:
+     msg = mona_lisa_message()
+     some_valid_modelid: str = Phi3Vision.model_id
+     return BundlerRequest(model_id=some_valid_modelid, msgs=[msg])
tests/test_whisper.py ADDED
@@ -0,0 +1,40 @@
+ from llmlib.whisper import Whisper
+ import pytest
+ from tests.helpers import is_ci, file_for_test
+
+
+ @pytest.fixture(scope="module")
+ def model() -> Whisper:
+     return Whisper()
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_transcription(model: Whisper):
+     audio_file = str(file_for_test(name="some-audio.flac"))  # Librispeech sample 2
+     expected_transcription = "before he had time to answer a much encumbered vera burst into the room with the question i say can i leave these here these were a small black pig and a lusty specimen of black-red game-cock"
+     actual_transcription: str = model.transcribe_file(audio_file)
+     assert actual_transcription == expected_transcription
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_video_transcription(model: Whisper):
+     video_file = str(file_for_test("video.mp4"))
+     expected_fragment = (
+         "Die Unionsparteien oder deren Politiker sind heute wichtige Offiziere"
+     )
+     transcription = model.transcribe_file(video_file)
+     assert expected_fragment in transcription
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_translation(model: Whisper):
+     german_video = str(file_for_test("video.mp4"))
+     translation: str = model.transcribe_file(german_video, translate=True)
+     assert "The parties and their politicians" in translation
+
+
+ @pytest.mark.skipif(condition=is_ci(), reason="No GPU in CI")
+ def test_long_video_transcription(model: Whisper):
+     video_file = str(file_for_test("long-video.mp4"))
+     transcription: str = model.transcribe_file(video_file)
+     assert isinstance(transcription, str)
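
These tests double as usage documentation: transcribe_file accepts both audio and video files, and translate=True returns an English translation instead of the original-language transcript. A minimal sketch of that usage, assuming a GPU and the media files checked in under test-files/:

    from llmlib.whisper import Whisper

    model = Whisper()
    transcript: str = model.transcribe_file("test-files/some-audio.flac")
    english: str = model.transcribe_file("test-files/video.mp4", translate=True)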