Spaces:

Bofeee5675
/

FIRE

Running

App Files Files Community

zhangbofei committed on Jul 1, 2024

Commit

6dc0c9c

1 Parent(s): c00fe36

feat: change to fstchat

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

app.py +10 -4
gradio_web_server.log +8 -0
gradio_web_server_multi.log +0 -0
requirement.txt +2 -0
src/__init__.py +0 -0
src/__pycache__/__init__.cpython-310.pyc +0 -0
src/__pycache__/constants.cpython-310.pyc +0 -0
src/__pycache__/conversation.cpython-310.pyc +0 -0
src/__pycache__/utils.cpython-310.pyc +0 -0
src/constants.py +75 -0
src/conversation.py +2104 -0
src/model/__init__.py +5 -0
src/model/__pycache__/__init__.cpython-310.pyc +0 -0
src/model/__pycache__/compression.cpython-310.pyc +0 -0
src/model/__pycache__/llama_condense_monkey_patch.cpython-310.pyc +0 -0
src/model/__pycache__/model_adapter.cpython-310.pyc +0 -0
src/model/__pycache__/model_chatglm.cpython-310.pyc +0 -0
src/model/__pycache__/model_cllm.cpython-310.pyc +0 -0
src/model/__pycache__/model_codet5p.cpython-310.pyc +0 -0
src/model/__pycache__/model_exllama.cpython-310.pyc +0 -0
src/model/__pycache__/model_falcon.cpython-310.pyc +0 -0
src/model/__pycache__/model_registry.cpython-310.pyc +0 -0
src/model/__pycache__/model_xfastertransformer.cpython-310.pyc +0 -0
src/model/__pycache__/model_yuan2.cpython-310.pyc +0 -0
src/model/__pycache__/monkey_patch_non_inplace.cpython-310.pyc +0 -0
src/model/apply_delta.py +165 -0
src/model/apply_lora.py +48 -0
src/model/compression.py +312 -0
src/model/convert_fp16.py +26 -0
src/model/llama_condense_monkey_patch.py +71 -0
src/model/make_delta.py +48 -0
src/model/model_adapter.py +2524 -0
src/model/model_chatglm.py +137 -0
src/model/model_cllm.py +202 -0
src/model/model_codet5p.py +108 -0
src/model/model_exllama.py +77 -0
src/model/model_falcon.py +140 -0
src/model/model_registry.py +764 -0
src/model/model_xfastertransformer.py +81 -0
src/model/model_yuan2.py +139 -0
src/model/monkey_patch_non_inplace.py +119 -0
src/model/rwkv_model.py +76 -0
src/model/upload_hub.py +45 -0
src/modules/__init__.py +0 -0
src/modules/__pycache__/__init__.cpython-310.pyc +0 -0
src/modules/__pycache__/awq.cpython-310.pyc +0 -0
src/modules/__pycache__/exllama.cpython-310.pyc +0 -0
src/modules/__pycache__/gptq.cpython-310.pyc +0 -0
src/modules/__pycache__/xfastertransformer.cpython-310.pyc +0 -0
src/modules/awq.py +85 -0

app.py CHANGED Viewed

@@ -1,7 +1,13 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+from src.serve.gradio_block_arena_vision_named import build_side_by_side_vision_ui_named
+if __name__ == "__main__":
+    # Declare the side-by-side vision UI for the two models inside the Blocks context.
+    with gr.Blocks() as demo:
+        states = build_side_by_side_vision_ui_named(
+            models=["llava-fire", "llava-original"]
+        )
+    # Launch after the Blocks context has been closed, as in the Gradio documentation,
+    # rather than blocking on the server while the context is still open.
+    demo.launch()

gradio_web_server.log ADDED Viewed

	@@ -0,0 +1,8 @@

+2024-07-01 14:35:43 | INFO | stdout | Running on local URL:  http://127.0.0.1:7860
+2024-07-01 14:35:43 | INFO | stdout | Running on local URL:  http://127.0.0.1:7860
+2024-07-01 14:35:43 | INFO | stdout |
+2024-07-01 14:35:43 | INFO | stdout |
+2024-07-01 14:35:43 | INFO | stdout | To create a public link, set `share=True` in `launch()`.
+2024-07-01 14:35:43 | INFO | stdout | To create a public link, set `share=True` in `launch()`.
+2024-07-01 14:35:45 | INFO | stdout | Keyboard interruption in main thread... closing server.
+2024-07-01 14:35:45 | INFO | stdout | Keyboard interruption in main thread... closing server.

gradio_web_server_multi.log ADDED Viewed

File without changes

requirement.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ torch
2	+ transformers

src/__init__.py ADDED Viewed

File without changes

src/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (182 Bytes). View file

src/__pycache__/constants.cpython-310.pyc ADDED Viewed

Binary file (2.61 kB). View file

src/__pycache__/conversation.cpython-310.pyc ADDED Viewed

Binary file (37.8 kB). View file

src/__pycache__/utils.cpython-310.pyc ADDED Viewed

Binary file (14 kB). View file

src/constants.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""
+Global constants.
+"""
+from enum import IntEnum
+import os
+# Parent of the directory that contains this file.
+REPO_PATH = os.path.dirname(os.path.dirname(__file__))
+def _env_int(name, default):
+    """Return environment variable *name* converted to int, or *default* when it is unset."""
+    return int(os.getenv(name, default))
+##### For the gradio web server
+# User-facing messages.
+SERVER_ERROR_MSG = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+TEXT_MODERATION_MSG = "$MODERATION$ YOUR TEXT VIOLATES OUR CONTENT MODERATION GUIDELINES."
+IMAGE_MODERATION_MSG = "$MODERATION$ YOUR IMAGE VIOLATES OUR CONTENT MODERATION GUIDELINES."
+MODERATION_MSG = "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES."
+CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
+INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE."
+SLOW_MODEL_MSG = "⚠️  Both models will show the responses all at once. Please stay patient as it may take over 30 seconds."
+RATE_LIMIT_MSG = "**RATE LIMIT OF THIS MODEL IS REACHED. PLEASE COME BACK LATER OR USE BATTLE MODE (the 1st tab).**"
+# Maximum input length, in characters (overridable through the environment).
+INPUT_CHAR_LEN_LIMIT = _env_int("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 12000)
+BLIND_MODE_INPUT_CHAR_LEN_LIMIT = _env_int("FASTCHAT_BLIND_MODE_INPUT_CHAR_LEN_LIMIT", 24000)
+# Maximum conversation turns
+CONVERSATION_TURN_LIMIT = 50
+# Session expiration time (presumably seconds -- confirm against the code that uses it)
+SESSION_EXPIRATION_TIME = 3600
+# The output dir of log files; defaults to the current directory
+LOGDIR = os.getenv("LOGDIR", ".")
+# CPU Instruction Set Architecture; None when the variable is not set
+CPU_ISA = os.getenv("CPU_ISA")
+##### For the controller and workers (could be overwritten through ENV variables.)
+CONTROLLER_HEART_BEAT_EXPIRATION = _env_int("FASTCHAT_CONTROLLER_HEART_BEAT_EXPIRATION", 90)
+WORKER_HEART_BEAT_INTERVAL = _env_int("FASTCHAT_WORKER_HEART_BEAT_INTERVAL", 45)
+WORKER_API_TIMEOUT = _env_int("FASTCHAT_WORKER_API_TIMEOUT", 100)
+WORKER_API_EMBEDDING_BATCH_SIZE = _env_int("FASTCHAT_WORKER_API_EMBEDDING_BATCH_SIZE", 4)
+class ErrorCode(IntEnum):
+    """
+    Integer error codes used by this package.
+    Reference for the naming: https://platform.openai.com/docs/guides/error-codes/api-errors
+    NOTE(review): the leading three digits of each value look like the matching
+    HTTP status (400/401/403/429/500) followed by a two-digit sequence number --
+    confirm with the code that produces and consumes these values.
+    """
+    # 400xx
+    VALIDATION_TYPE_ERROR = 40001
+    # 401xx
+    INVALID_AUTH_KEY = 40101
+    INCORRECT_AUTH_KEY = 40102
+    NO_PERMISSION = 40103
+    # 403xx
+    INVALID_MODEL = 40301
+    PARAM_OUT_OF_RANGE = 40302
+    CONTEXT_OVERFLOW = 40303
+    # 429xx
+    RATE_LIMIT = 42901
+    QUOTA_EXCEEDED = 42902
+    ENGINE_OVERLOADED = 42903
+    # 500xx
+    INTERNAL_ERROR = 50001
+    CUDA_OUT_OF_MEMORY = 50002
+    GRADIO_REQUEST_ERROR = 50003
+    GRADIO_STREAM_UNKNOWN_ERROR = 50004
+    CONTROLLER_NO_WORKER = 50005
+    CONTROLLER_WORKER_TIMEOUT = 50006

src/conversation.py ADDED Viewed

	@@ -0,0 +1,2104 @@

+"""
+Conversation prompt templates.
+We kindly request that you import fastchat instead of copying this file if you wish to use it.
+If you have any changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
+"""
+import base64
+import dataclasses
+from enum import auto, IntEnum
+from io import BytesIO
+import os
+from typing import List, Any, Dict, Union, Tuple
+class SeparatorStyle(IntEnum):
+    """Separator styles.
+    Each member selects one prompt-formatting branch in Conversation.get_prompt.
+    Values come from auto(), so they follow declaration order: inserting or
+    reordering members changes the integer value of every member after it.
+    """
+    ADD_COLON_SINGLE = auto()
+    ADD_COLON_TWO = auto()
+    ADD_COLON_SPACE_SINGLE = auto()
+    NO_COLON_SINGLE = auto()
+    NO_COLON_TWO = auto()
+    ADD_NEW_LINE_SINGLE = auto()
+    LLAMA2 = auto()
+    LLAMA3 = auto()
+    CHATGLM = auto()
+    CHATML = auto()
+    CHATINTERN = auto()
+    DOLLY = auto()
+    RWKV = auto()
+    PHOENIX = auto()
+    ROBIN = auto()
+    FALCON_CHAT = auto()
+    CHATGLM3 = auto()
+    DEEPSEEK_CHAT = auto()
+    METAMATH = auto()
+    YUAN2 = auto()
+    GEMMA = auto()
+    CLLM = auto()
+    DEFAULT = auto()
+# Marker inserted into the prompt text once per attached image.
+IMAGE_PLACEHOLDER_STR = "$$<image>$$"
+@dataclasses.dataclass
+class Conversation:
+    """A class that manages prompt templates and keeps all conversation history."""
+    # The name of this template
+    name: str
+    # The template of the system prompt; formatted with the `system_message` keyword
+    system_template: str = "{system_message}"
+    # The system message
+    system_message: str = ""
+    # The names of two roles
+    roles: Tuple[str, str] = ("USER", "ASSISTANT")
+    # All messages. Each item is (role, message).
+    # Each message is either a string or a tuple of (string, List[image_url]).
+    # NOTE: the default is an immutable empty tuple, so append_message() fails on an
+    # instance built without `messages`; copy() always produces a mutable list.
+    messages: List[List[Any]] = ()
+    # The number of few shot examples
+    offset: int = 0
+    # The separator style and configurations
+    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
+    sep: str = "\n"
+    sep2: Union[str, None] = None
+    # Stop criteria (the default one is EOS token)
+    stop_str: Union[str, List[str], None] = None
+    # Stops generation if meeting any token in this list
+    stop_token_ids: Union[List[int], None] = None
+    # The maximum image size in megabytes that this model takes in. None means we do not resize the image.
+    max_image_size_mb: Union[int, None] = None
+    def get_prompt(self) -> str:
+        """Get the prompt for generation.
+        The system prompt is built by formatting `system_template` with
+        `system_message`; the messages are then rendered by the branch that
+        matches `sep_style`. In most branches a falsy message (None or "")
+        emits only the role prefix, leaving the prompt open for the model's reply.
+        Returns:
+            The fully formatted prompt string.
+        Raises:
+            ValueError: If `sep_style` is not one of the handled styles.
+        """
+        system_prompt = self.system_template.format(system_message=self.system_message)
+        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt + seps[0]
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    if type(message) is tuple:
+                        # Multimodal message: prefix one placeholder per image.
+                        message, images = message
+                        message = IMAGE_PLACEHOLDER_STR * len(images) + message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ": "  # must end with a space
+            return ret
+        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message + self.sep
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + message + self.sep
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + message + seps[i % 2]
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.RWKV:
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    # Normalize line endings and collapse blank lines, since a
+                    # blank line ("\n\n") is what terminates each turn below.
+                    ret += (
+                        role
+                        + ": "
+                        + message.replace("\r\n", "\n").replace("\n\n", "\n")
+                    )
+                    ret += "\n\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA2:
+            seps = [self.sep, self.sep2]
+            if self.system_message:
+                ret = system_prompt
+            else:
+                ret = "[INST] "
+            for i, (role, message) in enumerate(self.messages):
+                tag = self.roles[i % 2]
+                if message:
+                    if i == 0:
+                        # The first message follows the opening text directly, without a tag.
+                        ret += message + " "
+                    else:
+                        ret += tag + " " + message + seps[i % 2]
+                else:
+                    ret += tag
+            return ret
+        elif self.sep_style == SeparatorStyle.LLAMA3:
+            ret = "<|begin_of_text|>"
+            if self.system_message:
+                ret += system_prompt
+            else:
+                ret += ""
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+                    ret += f"{message.strip()}<|eot_id|>"
+                else:
+                    ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATGLM:
+            # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
+            # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
+            # Round numbering starts at 1 for the "chatglm2" template and at 0 otherwise.
+            round_add_n = 1 if self.name == "chatglm2" else 0
+            if system_prompt:
+                ret = system_prompt + self.sep
+            else:
+                ret = ""
+            for i, (role, message) in enumerate(self.messages):
+                if i % 2 == 0:
+                    ret += f"[Round {i//2 + round_add_n}]{self.sep}"
+                if message:
+                    ret += f"{role}：{message}{self.sep}"
+                else:
+                    ret += f"{role}："
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATML:
+            ret = "" if system_prompt == "" else system_prompt + self.sep + "\n"
+            for role, message in self.messages:
+                if message:
+                    if type(message) is tuple:
+                        message, images = message
+                        message = IMAGE_PLACEHOLDER_STR * len(images) + message
+                    ret += role + "\n" + message + self.sep + "\n"
+                else:
+                    ret += role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATGLM3:
+            ret = ""
+            if self.system_message:
+                ret += system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + "\n" + message
+                else:
+                    ret += role
+            return ret
+        elif self.sep_style == SeparatorStyle.CHATINTERN:
+            # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if i % 2 == 0:
+                    ret += "<s>"
+                if message:
+                    ret += role + ":" + message + seps[i % 2] + "\n"
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DOLLY:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ":\n" + message + seps[i % 2]
+                    if i % 2 == 1:
+                        ret += "\n\n"
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.PHOENIX:
+            ret = system_prompt
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + "<s>" + message + "</s>"
+                else:
+                    ret += role + ": " + "<s>"
+            return ret
+        elif self.sep_style == SeparatorStyle.ROBIN:
+            ret = system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ":\n" + message + self.sep
+                else:
+                    ret += role + ":\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.FALCON_CHAT:
+            ret = ""
+            if self.system_message:
+                ret += system_prompt + self.sep
+            for role, message in self.messages:
+                if message:
+                    ret += role + ": " + message + self.sep
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.METAMATH:
+            ret = "" if system_prompt == "" else system_prompt + self.sep
+            for i, (role, message) in enumerate(self.messages):
+                # For MetaMath, sep2 is used to prefix the message.
+                starting_sep = ":\n" if i % 2 == 0 else ": " + self.sep2
+                ending_sep = self.sep if i % 2 == 0 else ""
+                if message:
+                    ret += role + starting_sep + message + ending_sep
+                else:
+                    ret += role + starting_sep
+            return ret
+        elif self.sep_style == SeparatorStyle.DEEPSEEK_CHAT:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt
+            for i, (role, message) in enumerate(self.messages):
+                if message:
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.YUAN2:
+            seps = [self.sep, self.sep2]
+            ret = ""
+            if self.system_message:
+                ret += system_prompt + seps[1]
+            # Join the non-empty messages with "<n>". The previous
+            # `ret.rstrip("<n>")` removed any trailing '<', 'n' or '>' characters,
+            # so a message ending in e.g. "n" lost part of its text.
+            ret += "<n>".join(message for _, message in self.messages if message)
+            ret += seps[0]
+            return ret
+        elif self.sep_style == SeparatorStyle.GEMMA:
+            ret = "<bos>"
+            for role, message in self.messages:
+                if message:
+                    ret += "<start_of_turn>" + role + "\n" + message + self.sep
+                else:
+                    ret += "<start_of_turn>" + role + "\n"
+            return ret
+        elif self.sep_style == SeparatorStyle.CLLM:
+            seps = [self.sep, self.sep2]
+            ret = system_prompt + seps[0]
+            # Only the last two messages are rendered for this style.
+            for i, (role, message) in enumerate(self.messages[-2:]):
+                if message:
+                    if type(message) is tuple:
+                        message, images = message
+                        message = IMAGE_PLACEHOLDER_STR * len(images) + message
+                    ret += role + ": " + message + seps[i % 2]
+                else:
+                    ret += role + ":"
+            return ret
+        elif self.sep_style == SeparatorStyle.DEFAULT:
+            ret = system_prompt + "\n"
+            for role, message in self.messages:
+                if message:
+                    if type(message) is tuple:
+                        # Images are dropped from the text prompt for this style.
+                        message, images = message
+                    ret += role + ": " + message + "\n"
+                else:
+                    ret += role + ":"
+            return ret
+        else:
+            raise ValueError(f"Invalid style: {self.sep_style}")
+    def get_images(self):
+        """Collect every image attached to the even-indexed messages after `offset`.
+        Only messages stored as a (text, images) tuple contribute images.
+        """
+        collected = []
+        for idx, (_, payload) in enumerate(self.messages[self.offset :]):
+            # Odd positions and plain-text messages carry no images.
+            if idx % 2 or type(payload) is not tuple:
+                continue
+            collected.extend(payload[1])
+        return collected
+    def set_system_message(self, system_message: str):
+        """Set the system message.
+        Args:
+            system_message: The text stored in `self.system_message`, which
+                get_prompt() substitutes into `system_template`.
+        """
+        self.system_message = system_message
+    def get_system_message(self) -> str:
+        """Return the system message currently stored on this conversation."""
+        return self.system_message
+    def append_message(self, role: str, message: str):
+        """Append a new message.
+        The message is stored as a two-element list `[role, message]` so that
+        update_last_message() can later replace the message in place.
+        Requires `self.messages` to be a list: the dataclass default is an
+        empty tuple, which has no `append`.
+        """
+        self.messages.append([role, message])
+    def update_last_message(self, message: str):
+        """Update the last output.
+        The last message is typically set to be None when constructing the prompt,
+        so we need to update it in-place after getting the response from a model.
+        Raises:
+            IndexError: If there are no messages yet.
+        """
+        # Replace only the message slot; the role in slot 0 is kept.
+        self.messages[-1][1] = message
+    def convert_image_to_base64(self, image):
+        """Given an image, return the base64 encoded image string.
+        Args:
+            image: One of
+                * an http(s) URL, which is downloaded and decoded as RGB;
+                * any other string containing "base64", treated as an
+                  OpenAI-style data URI whose payload is returned as-is;
+                * any other string, opened as a local file path;
+                * a non-string object, passed straight to the resize helper
+                  (presumably an already-loaded PIL image -- confirm with callers).
+        Returns:
+            The base64 text of the image, resized according to
+            `self.max_image_size_mb` except for the data-URI case.
+        Raises:
+            requests.HTTPError: If the download answers with an error status.
+        """
+        is_text = isinstance(image, str)
+        is_remote = is_text and image.startswith(("http://", "https://"))
+        if is_text and not is_remote and "base64" in image:
+            # OpenAI format is: data:image/jpeg;base64,{base64_encoded_image_str}
+            # The payload is everything after the first comma; no image library is needed.
+            return image.split(",", 1)[1]
+        # Imported only when an image actually has to be loaded or resized.
+        # NOTE(review): this repository vendors the package under `src/`; confirm
+        # that `fastchat` is importable here or switch to the vendored module.
+        from PIL import Image
+        import requests
+        from fastchat.utils import resize_image_and_return_image_in_bytes
+        # Load image if it has not been loaded in yet
+        if is_remote:
+            response = requests.get(image)
+            # Fail on HTTP errors instead of handing an error page to PIL.
+            response.raise_for_status()
+            image = Image.open(BytesIO(response.content)).convert("RGB")
+        elif is_text:
+            image = Image.open(image).convert("RGB")
+        image_bytes = resize_image_and_return_image_in_bytes(
+            image, self.max_image_size_mb
+        )
+        return base64.b64encode(image_bytes.getvalue()).decode()
+    def to_gradio_chatbot(self):
+        """Convert the conversation to gradio chatbot format.
+        Returns a list of `[user_message, reply]` pairs built from the messages
+        after `offset`; the reply stays None until the following message fills it.
+        """
+        pairs = []
+        for idx, (_, content) in enumerate(self.messages[self.offset :]):
+            if idx % 2:
+                # Odd positions complete the pair opened by the previous message.
+                pairs[-1][-1] = content
+                continue
+            if type(content) is tuple:
+                text, attached = content
+                first = attached[0]  # Only one image on gradio at one time
+                if first.startswith(("http://", "https://")):
+                    src = first
+                else:
+                    src = f"data:image/png;base64,{first}"
+                img_tag = f'<img src="{src}" alt="user upload image" />'
+                content = img_tag + text.replace("<image>\n", "").strip()
+            pairs.append([content, None])
+        return pairs
+    def to_openai_image_format(self, image_urls):
+        """Convert image references into the URL form used for OpenAI image inputs.
+        Args:
+            image_urls: Iterable of strings. Each one is an http(s) URL (kept
+                unchanged), a local path ending in png/jpg/jpeg/webp/gif
+                (encoded through convert_image_to_base64), or a raw base64 string.
+        Returns:
+            A list with one URL or `data:` URI per input, in input order.
+        Raises:
+            ValueError: If an entry is none of the above, i.e. it does not
+                survive a base64 decode/encode round trip.
+        """
+        import base64
+        openai_images = []
+        for image_url in image_urls:
+            if image_url.startswith(("http://", "https://")):  # input is a url
+                openai_images.append(image_url)
+            elif image_url.lower().endswith(
+                ("png", "jpg", "jpeg", "webp", "gif")
+            ):  # input is a local image
+                img_b64_str = self.convert_image_to_base64(image_url)
+                filetype = image_url.split(".")[-1].lower()
+                openai_images.append(f"data:image/{filetype};base64,{img_b64_str}")
+            else:
+                # Validate with an explicit check: an `assert` would be stripped
+                # under `python -O`, and a bare `except` would hide unrelated errors.
+                try:
+                    is_valid_b64 = (
+                        base64.b64encode(base64.b64decode(image_url))
+                        == image_url.encode()
+                    )
+                except ValueError:  # includes binascii.Error
+                    is_valid_b64 = False
+                if not is_valid_b64:
+                    raise ValueError(
+                        f"This file is not valid or not currently supported by the OpenAI API: {image_url}"
+                    )
+                openai_images.append(f"data:image/png;base64,{image_url}")
+        return openai_images
+    def to_openai_vision_api_messages(self):
+        """Convert the conversation to OpenAI vision api completion format"""
+        def as_text(text):
+            # One-element content list holding a text part.
+            return [{"type": "text", "text": text}]
+        out = []
+        if self.system_message != "":
+            out.append({"role": "system", "content": as_text(self.system_message)})
+        for idx, (_, msg) in enumerate(self.messages[self.offset :]):
+            if idx % 2:
+                # Odd positions are assistant turns; an unfilled (None) turn is skipped.
+                if msg is not None:
+                    out.append({"role": "assistant", "content": as_text(msg)})
+                continue
+            if type(msg) is tuple:
+                parts = as_text(msg[0])
+                parts.extend(
+                    {"type": "image_url", "image_url": {"url": url}}
+                    for url in self.to_openai_image_format(msg[1])
+                )
+                out.append({"role": "user", "content": parts})
+            else:
+                out.append({"role": "user", "content": as_text(msg)})
+        return out
+    def to_openai_api_messages(self):
+        """Convert the conversation to OpenAI chat completion format."""
+        out = []
+        if self.system_message != "":
+            out.append({"role": "system", "content": self.system_message})
+        for idx, (_, msg) in enumerate(self.messages[self.offset :]):
+            if idx % 2 == 0:
+                out.append({"role": "user", "content": msg})
+            elif msg is not None:
+                # An unfilled (None) assistant turn is left out.
+                out.append({"role": "assistant", "content": msg})
+        return out
+    def to_gemini_api_messages(self):
+        """Convert the conversation to a list of role/content dicts for the Gemini API.
+        Even positions after `offset` become "user" entries and filled odd
+        positions become "model" entries. A (text, images) tuple becomes a
+        content list holding the text followed by each image passed through load_image.
+        """
+        from fastchat.utils import load_image
+        out = []
+        if self.system_message != "":
+            out.append({"role": "system", "content": self.system_message})
+        for idx, (_, msg) in enumerate(self.messages[self.offset :]):
+            if idx % 2:
+                if msg is not None:
+                    out.append({"role": "model", "content": msg})
+                continue
+            if type(msg) is tuple:
+                text, images = msg[0], msg[1]
+                content = [text] + [load_image(image) for image in images]
+                out.append({"role": "user", "content": content})
+            else:
+                out.append({"role": "user", "content": msg})
+        return out
+    def to_vertex_api_messages(self):
+        """Flatten the conversation into a list of parts for the Vertex AI API.
+        The list starts with the system message when it is non-empty. Each
+        non-None message after `offset` then adds its parts: for a
+        (text, images) tuple, one Image per attachment followed by the text;
+        otherwise the message itself.
+        """
+        from vertexai.preview.generative_models import Image
+        import base64
+        import requests
+        parts = [] if self.system_message == "" else [self.system_message]
+        for _, msg in self.messages[self.offset :]:
+            if msg is None:
+                continue
+            if type(msg) is not tuple:
+                parts.append(msg)
+                continue
+            text, images = msg[0], msg[1]
+            for image in images:
+                if image.startswith(("http://", "https://")):
+                    raw = requests.get(image).content
+                else:  # base64
+                    raw = base64.b64decode(image)
+                parts.append(Image.from_bytes(raw))
+            parts.append(text)
+        return parts
+    def to_anthropic_vision_api_messages(self):
+        """Convert the conversation to Claude-3 Messages Vision API format"""
+        def text_block(text):
+            # One-element content list holding a text part.
+            return [{"type": "text", "text": text}]
+        # The system entry is always emitted, even when the system message is empty.
+        converted = [{"role": "system", "content": text_block(self.system_message)}]
+        for idx, (_, msg) in enumerate(self.messages[self.offset :]):
+            if idx % 2:
+                if msg is not None:
+                    converted.append({"role": "assistant", "content": text_block(msg)})
+                continue
+            if type(msg) is not tuple:
+                converted.append({"role": "user", "content": text_block(msg)})
+                continue
+            parts = text_block(msg[0])
+            for image in msg[1]:
+                # Claude only supports base64
+                if image.startswith(("http://", "https://")):
+                    image = self.convert_image_to_base64(image)
+                parts.append(
+                    {
+                        "type": "image",
+                        "source": {
+                            "type": "base64",
+                            "media_type": "image/png",
+                            "data": image,
+                        },
+                    }
+                )
+            converted.append({"role": "user", "content": parts})
+        return converted
+    def to_reka_api_messages(self):
+        """Convert the conversation to a list of "human"/"model" dicts for the Reka API.
+        A (text, images) tuple yields one "human" entry per image, each repeating
+        the text; images that are not http(s) URLs are wrapped in a PNG data URI.
+        """
+        out = []
+        for idx, (_, msg) in enumerate(self.messages[self.offset :]):
+            if idx % 2:
+                if msg is not None:
+                    out.append({"type": "model", "text": msg})
+            elif type(msg) is tuple:
+                text, images = msg
+                for image in images:
+                    if image.startswith(("https://", "http://")):
+                        media_url = image
+                    else:
+                        media_url = f"data:image/png;base64,{image}"
+                    out.append({"type": "human", "text": text, "media_url": media_url})
+            else:
+                out.append({"type": "human", "text": msg})
+        return out
+    def save_new_images(self, has_csam_images=False, use_remote_storage=False):
+        """Store the images attached to the second-to-last message.
+        Each image is saved as `<md5 of its pixel bytes>.jpg` under "csam_images"
+        when `has_csam_images` is true and under "serve_images" otherwise.
+        Args:
+            has_csam_images: Selects the target directory name; when true the
+                images are always written locally, never uploaded.
+            use_remote_storage: Upload through upload_image_file_to_gcs instead
+                of writing below LOGDIR (ignored when `has_csam_images` is true).
+        Raises:
+            IndexError: If the conversation holds fewer than two messages.
+        """
+        import hashlib
+        from fastchat.constants import LOGDIR
+        from fastchat.utils import load_image, upload_image_file_to_gcs
+        # NOTE(review): presumably the latest user turn, followed by the
+        # assistant reply slot -- confirm with callers.
+        _, last_user_message = self.messages[-2]
+        if type(last_user_message) is not tuple:
+            # Plain-text message: nothing to store.
+            return
+        images = last_user_message[1]
+        # Load and hash everything first so a failure leaves no partial writes.
+        loaded_images = [load_image(image) for image in images]
+        image_hashes = [
+            hashlib.md5(image.tobytes()).hexdigest() for image in loaded_images
+        ]
+        image_directory_name = "csam_images" if has_csam_images else "serve_images"
+        for loaded_image, hash_str in zip(loaded_images, image_hashes):
+            filename = os.path.join(image_directory_name, f"{hash_str}.jpg")
+            if use_remote_storage and not has_csam_images:
+                # The returned URL is not used at the moment.
+                # NOTE(chris): If the URL were public, then we set it here so future model uses the link directly
+                upload_image_file_to_gcs(loaded_image, filename)
+            else:
+                filename = os.path.join(LOGDIR, filename)
+                # The hash-based name makes an existing file a duplicate; skip it.
+                if not os.path.isfile(filename):
+                    os.makedirs(os.path.dirname(filename), exist_ok=True)
+                    loaded_image.save(filename)
+    def extract_text_and_image_hashes_from_messages(self):
+        """Return the message list with image payloads replaced by identifiers.
+
+        Entries whose message is a tuple are rewritten as
+        ``(role, (text, identifiers))``; every other entry is passed through
+        as ``(role, message)``. An image given as an ``http://`` or
+        ``https://`` URL is kept verbatim, anything else is loaded with
+        ``load_image`` and replaced by the MD5 hex digest of its ``tobytes()``.
+        """
+        import hashlib
+        from fastchat.utils import load_image
+
+        def _identify(image):
+            # URLs already identify the image; everything else gets hashed.
+            if image.startswith(("http://", "https://")):
+                return image
+            return hashlib.md5(load_image(image).tobytes()).hexdigest()
+
+        converted = []
+        for role, message in self.messages:
+            if type(message) is not tuple:
+                converted.append((role, message))
+                continue
+            text, images = message[0], message[1]
+            identifiers = [_identify(image) for image in images]
+            converted.append((role, (text, identifiers)))
+        return converted
+    def copy(self):
+        """Build a new ``Conversation`` carrying this one's settings.
+
+        Every ``[role, message]`` entry is rebuilt as a fresh two-item list, so
+        replacing an entry in the copy leaves this conversation untouched. All
+        other fields (e.g. ``roles``, ``stop_token_ids``) are handed over as-is
+        and therefore shared with the original.
+        """
+        duplicated_messages = []
+        for role, message in self.messages:
+            duplicated_messages.append([role, message])
+
+        settings = {
+            "name": self.name,
+            "system_template": self.system_template,
+            "system_message": self.system_message,
+            "roles": self.roles,
+            "messages": duplicated_messages,
+            "offset": self.offset,
+            "sep_style": self.sep_style,
+            "sep": self.sep,
+            "sep2": self.sep2,
+            "stop_str": self.stop_str,
+            "stop_token_ids": self.stop_token_ids,
+            "max_image_size_mb": self.max_image_size_mb,
+        }
+        return Conversation(**settings)
+    def dict(self):
+        """Summarise the conversation as a plain dictionary.
+
+        The result holds the template name, system message, roles and offset,
+        plus the messages as returned by
+        ``extract_text_and_image_hashes_from_messages`` (images replaced by
+        URLs or hashes).
+        """
+        summary = {}
+        summary["template_name"] = self.name
+        summary["system_message"] = self.system_message
+        summary["roles"] = self.roles
+        summary["messages"] = self.extract_text_and_image_hashes_from_messages()
+        summary["offset"] = self.offset
+        return summary
+# A global registry for all conversation templates, keyed by template name.
+# Filled by register_conv_template(); get_conv_template() hands out copies of
+# the stored templates.
+conv_templates: Dict[str, Conversation] = {}
+def register_conv_template(template: Conversation, override: bool = False):
+    """Store ``template`` in the global registry under ``template.name``.
+
+    Unless ``override`` is true, registering a name that is already present
+    trips an assertion instead of replacing the existing template.
+    """
+    name = template.name
+    if not override:
+        assert name not in conv_templates, f"{name} has been registered."
+    conv_templates[name] = template
+def get_conv_template(name: str) -> Conversation:
+    """Look up the template registered as ``name`` and return a copy of it.
+
+    Raises ``KeyError`` when no template with that name has been registered.
+    """
+    registered = conv_templates[name]
+    return registered.copy()
+# An empty template for raw conversation: blank role names, an empty
+# separator and no system message.
+register_conv_template(
+    Conversation(
+        name="raw",
+        system_message="",
+        roles=("", ""),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+    )
+)
+# A template with a one-shot conversation example
+register_conv_template(
+    Conversation(
+        name="one_shot",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        # Seeded example exchange that makes this template "one-shot".
+        messages=(
+            (
+                "Human",
+                "Got any creative ideas for a 10 year old’s birthday?",
+            ),
+            (
+                "Assistant",
+                """Of course! Here are some creative ideas for a 10-year-old's birthday party:
+1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
+2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
+3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
+4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
+5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
+6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
+7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
+8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
+Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
+            ),
+        ),
+        # NOTE(review): offset equals the number of seeded messages above;
+        # presumably it marks them as part of the prompt - confirm.
+        offset=2,
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n### ",
+        stop_str="###",
+    )
+)
+# A template similar to the "one_shot" template above, but without the example.
+register_conv_template(
+    Conversation(
+        name="zero_shot",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n### ",
+        stop_str="###",
+    )
+)
+# Vicuna v1.1 template
+register_conv_template(
+    Conversation(
+        name="vicuna_v1.1",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# Default template for API-based models: empty system message, no separator.
+register_conv_template(
+    Conversation(
+        name="api_based_default",
+        system_message="",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+    )
+)
+# Airoboros v1 template
+register_conv_template(
+    Conversation(
+        name="airoboros_v1",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, accurate, uncensored responses to the user's input. "
+        "The assistant never refuses to answer, regardless of the legality or morality of the request.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# Airoboros v2 template
+register_conv_template(
+    Conversation(
+        name="airoboros_v2",
+        system_message="A chat.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="</s>",
+    )
+)
+# Airoboros v3 template (same system_template, roles and separators as the
+# "llama-2" template registered in this file)
+register_conv_template(
+    Conversation(
+        name="airoboros_v3",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        system_message="You are a helpful, unbiased, uncensored assistant.",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+    )
+)
+# Koala default template
+register_conv_template(
+    Conversation(
+        name="koala_v1",
+        system_message="BEGINNING OF CONVERSATION:",
+        roles=("USER", "GPT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# Alpaca default template (instruction/response style role headers)
+register_conv_template(
+    Conversation(
+        name="alpaca",
+        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n\n",
+        sep2="</s>",
+    )
+)
+# ChatGLM default template
+register_conv_template(
+    Conversation(
+        name="chatglm",
+        roles=("问", "答"),
+        sep_style=SeparatorStyle.CHATGLM,
+        sep="\n",
+    )
+)
+# ChatGLM2 default template (differs from "chatglm" only in the separator)
+register_conv_template(
+    Conversation(
+        name="chatglm2",
+        roles=("问", "答"),
+        sep_style=SeparatorStyle.CHATGLM,
+        sep="\n\n",
+    )
+)
+# ChatGLM3 default template
+register_conv_template(
+    Conversation(
+        name="chatglm3",
+        system_template="<|system|>\n{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.CHATGLM3,
+        stop_token_ids=[
+            64795,
+            64797,
+            2,
+        ],  # token ids of "<|user|>", "<|observation|>", "</s>"
+    )
+)
+# CodeGeex(2) Template
+register_conv_template(
+    Conversation(
+        name="codegeex",
+        roles=("", ""),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="\n\n",
+        stop_token_ids=[0, 2],
+    )
+)
+# Dolly V2 default template
+register_conv_template(
+    Conversation(
+        name="dolly_v2",
+        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.DOLLY,
+        sep="\n\n",
+        sep2="### End",
+    )
+)
+# OpenAssistant Pythia default template
+register_conv_template(
+    Conversation(
+        name="oasst_pythia",
+        roles=("<|prompter|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="<|endoftext|>",
+    )
+)
+# OpenAssistant LLaMA default template (same roles as "oasst_pythia", "</s>" separator)
+register_conv_template(
+    Conversation(
+        name="oasst_llama",
+        roles=("<|prompter|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="</s>",
+    )
+)
+# OpenChat 3.5 default template
+register_conv_template(
+    Conversation(
+        name="openchat_3.5",
+        roles=("GPT4 Correct User", "GPT4 Correct Assistant"),
+        sep_style=SeparatorStyle.FALCON_CHAT,
+        sep="<|end_of_turn|>",
+    )
+)
+# TenyxChat default template
+register_conv_template(
+    Conversation(
+        name="tenyxchat",
+        roles=("User", "Assistant"),
+        sep_style=SeparatorStyle.FALCON_CHAT,
+        sep="<|end_of_turn|>",
+    )
+)
+# DeepSeek Coder default template
+# NOTE(review): system_template below contains no "{system_message}"
+# placeholder, so the system text is presumably fixed regardless of
+# system_message - confirm this is intended.
+register_conv_template(
+    Conversation(
+        name="deepseek-coder",
+        system_template="You are an AI programming assistant, utilizing the DeepSeek Coder model, developed by DeepSeek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.",
+        roles=("### Instruction:", "### Response:"),
+        sep="\n",
+        stop_str="<|EOT|>",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+    )
+)
+# Tulu default template
+register_conv_template(
+    Conversation(
+        name="tulu",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        sep="\n",
+    )
+)
+# StableLM Alpha default template
+register_conv_template(
+    Conversation(
+        name="stablelm",
+        system_template="<|SYSTEM|>{system_message}",
+        system_message="""# StableLM Tuned (Alpha version)
+- StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
+- StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
+- StableLM will refuse to participate in anything that could harm a human.
+""",
+        roles=("<|USER|>", "<|ASSISTANT|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_token_ids=[50278, 50279, 50277, 1, 0],
+    )
+)
+# Baize default template (seeded with a two-message greeting exchange)
+register_conv_template(
+    Conversation(
+        name="baize",
+        system_message="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n",
+        roles=("[|Human|]", "[|AI|]"),
+        messages=(
+            ("[|Human|]", "Hello!"),
+            ("[|AI|]", "Hi!"),
+        ),
+        offset=2,
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="\n",
+        stop_str="[|Human|]",
+    )
+)
+# RWKV-4-Raven default template (seeded with a two-message greeting exchange)
+register_conv_template(
+    Conversation(
+        name="rwkv",
+        roles=("Bob", "Alice"),
+        messages=(
+            ("Bob", "hi"),
+            (
+                "Alice",
+                "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.",
+            ),
+        ),
+        offset=2,
+        sep_style=SeparatorStyle.RWKV,
+        sep="",
+        stop_str="\n\n",
+    )
+)
+# OpenBuddy default template (the system message already ends with a
+# User/Assistant greeting exchange)
+register_conv_template(
+    Conversation(
+        name="openbuddy",
+        system_message="""Consider a conversation between User (a human) and Assistant (named Buddy).
+Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
+Buddy cannot access the Internet.
+Buddy can fluently speak the user's language (e.g. English, Chinese).
+Buddy can generate poems, stories, code, essays, songs, parodies, and more.
+Buddy possesses vast knowledge about the world, history, and culture.
+Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
+Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.
+User: Hi.
+Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
+        roles=("User", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+    )
+)
+# Phoenix default template
+register_conv_template(
+    Conversation(
+        name="phoenix",
+        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.PHOENIX,
+        sep="</s>",
+    )
+)
+# ReaLM default template (same settings as "phoenix" apart from the name)
+register_conv_template(
+    Conversation(
+        name="ReaLM-7b-v1",
+        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.PHOENIX,
+        sep="</s>",
+    )
+)
+# ChatGPT default template
+register_conv_template(
+    Conversation(
+        name="chatgpt",
+        system_message="You are a helpful assistant.",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+        max_image_size_mb=None,  # OpenAI does auto-resizing
+    )
+)
+# GPT-4 Turbo (2024-04-09) template
+# NOTE(review): "{{currentDateTime}}" is stored verbatim (plain string, so the
+# double braces are literal); presumably it is substituted elsewhere - confirm.
+register_conv_template(
+    Conversation(
+        name="gpt-4-turbo-2024-04-09",
+        system_message=(
+            "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture.\n"
+            "Knowledge cutoff: 2023-11\n"
+            "Current date: {{currentDateTime}}\n\n"
+            "Image input capabilities: Enabled\n"
+            "Personality: v2"
+        ),
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+    )
+)
+# Perplexity AI template
+register_conv_template(
+    Conversation(
+        name="pplxai",
+        system_message="Be precise and concise.",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+    )
+)
+# Claude default template
+# NOTE(review): every Claude template caps images at 5 / 1.35 (about 3.7) MB;
+# the 1.35 divisor presumably leaves headroom for encoding overhead under a
+# 5 MB limit - confirm.
+register_conv_template(
+    Conversation(
+        name="claude",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n\n",
+        max_image_size_mb=5 / 1.35,
+    )
+)
+# Claude 3 Haiku template. "{{currentDateTime}}" is stored verbatim in the
+# system message (plain string, literal double braces).
+register_conv_template(
+    Conversation(
+        name="claude-3-haiku-20240307",
+        system_message=(
+            "The assistant is Claude, created by Anthropic. The current date is "
+            "{{currentDateTime}}. Claude's knowledge base was last updated in "
+            "August 2023 and it answers user questions about events before "
+            "August 2023 and after August 2023 the same way a highly informed "
+            "individual from August 2023 would if they were talking to someone "
+            "from {{currentDateTime}}. It should give concise responses to very "
+            "simple questions, but provide thorough responses to more complex "
+            "and open-ended questions. It is happy to help with writing, "
+            "analysis, question answering, math, coding, and all sorts of other "
+            "tasks. It uses markdown for coding. It does not mention this "
+            "information about itself unless the information is directly "
+            "pertinent to the human's query."
+        ),
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+        max_image_size_mb=5 / 1.35,
+    )
+)
+# Claude 3 Sonnet template (same system message as the Haiku template)
+register_conv_template(
+    Conversation(
+        name="claude-3-sonnet-20240229",
+        system_message=(
+            "The assistant is Claude, created by Anthropic. The current date is "
+            "{{currentDateTime}}. Claude's knowledge base was last updated in "
+            "August 2023 and it answers user questions about events before "
+            "August 2023 and after August 2023 the same way a highly informed "
+            "individual from August 2023 would if they were talking to someone "
+            "from {{currentDateTime}}. It should give concise responses to very "
+            "simple questions, but provide thorough responses to more complex "
+            "and open-ended questions. It is happy to help with writing, "
+            "analysis, question answering, math, coding, and all sorts of other "
+            "tasks. It uses markdown for coding. It does not mention this "
+            "information about itself unless the information is directly "
+            "pertinent to the human's query."
+        ),
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+        max_image_size_mb=5 / 1.35,
+    )
+)
+# Claude 3 Opus template (longer system message than Haiku/Sonnet)
+register_conv_template(
+    Conversation(
+        name="claude-3-opus-20240229",
+        system_message=(
+            "The assistant is Claude, created by Anthropic. The current date is "
+            "{{currentDateTime}}. Claude's knowledge base was last updated on "
+            "August 2023. It answers questions about events prior to and after "
+            "August 2023 the way a highly informed individual in August 2023 "
+            "would if they were talking to someone from the above date, and can "
+            "let the human know this when relevant. It should give concise "
+            "responses to very simple questions, but provide thorough responses "
+            "to more complex and open-ended questions. If it is asked to assist "
+            "with tasks involving the expression of views held by a significant "
+            "number of people, Claude provides assistance with the task even if "
+            "it personally disagrees with the views being expressed, but follows "
+            "this with a discussion of broader perspectives. Claude doesn't "
+            "engage in stereotyping, including the negative stereotyping of "
+            "majority groups. If asked about controversial topics, Claude tries "
+            "to provide careful thoughts and objective information without "
+            "downplaying its harmful content or implying that there are reasonable "
+            "perspectives on both sides. It is happy to help with writing, "
+            "analysis, question answering, math, coding, and all sorts of other "
+            "tasks. It uses markdown for coding. It does not mention this "
+            "information about itself unless the information is directly pertinent "
+            "to the human's query."
+        ),
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+        max_image_size_mb=5 / 1.35,
+    )
+)
+# MetaMath default template
+# reference: https://github.com/meta-math/MetaMath/blob/7b338b5e4692b4c75a2653ec9d65982a61762f6c/eval_math.py#L58
+register_conv_template(
+    Conversation(
+        name="metamath",
+        system_template="{system_message}",
+        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.METAMATH,
+        sep="\n\n",
+        sep2="Let's think step by step.",
+    )
+)
+# MPT-7b-chat default template (ChatML-style "<|im_start|>" / "<|im_end|>" markers)
+register_conv_template(
+    Conversation(
+        name="mpt-7b-chat",
+        system_template="""<|im_start|>system
+{system_message}""",
+        system_message="""- You are a helpful assistant chatbot trained by MosaicML.
+- You answer questions.
+- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
+- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.""",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[50278, 0],
+    )
+)
+# MPT-30b-chat default template
+register_conv_template(
+    Conversation(
+        name="mpt-30b-chat",
+        system_template="""<|im_start|>system
+{system_message}""",
+        system_message="""A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[50278, 0],
+    )
+)
+# Lemur-70b-chat default template
+# reference: https://huggingface.co/OpenLemur/lemur-70b-chat-v1#generation
+register_conv_template(
+    Conversation(
+        name="lemur-70b-chat",
+        system_template="""<|im_start|>system
+{system_message}""",
+        system_message="""You are a helpful, respectful, and honest assistant.""",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[32002, 0],
+    )
+)
+# MPT-30b-instruct default template
+# reference: https://huggingface.co/mosaicml/mpt-30b-instruct#formatting
+register_conv_template(
+    Conversation(
+        name="mpt-30b-instruct",
+        system_template="{system_message}",
+        system_message="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        sep="\n\n",
+        stop_token_ids=[50278, 0],
+    )
+)
+# Bard default template
+# Reference: https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L150
+#            https://github.com/google/generative-ai-python/blob/9c99bcb474a991a97a2e7d62fcdb52db7ce40729/google/generativeai/discuss.py#L40
+register_conv_template(
+    Conversation(
+        name="bard",
+        roles=("0", "1"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+    )
+)
+# Gemini default template (no system message, images capped at 20 MB)
+register_conv_template(
+    Conversation(
+        name="gemini",
+        roles=("user", "model"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+        max_image_size_mb=20,
+    )
+)
+# Gemini dev template: same roles as "gemini", plus a system message
+register_conv_template(
+    Conversation(
+        name="gemini-dev",
+        roles=("user", "model"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+        system_message=(
+            "You are a friendly and helpful assistant.\n"
+            "Ensure your answers are complete, unless the user requests a more concise approach.\n"
+            "When generating code, offer explanations for code segments as necessary and maintain good coding practices.\n"
+            "When presented with inquiries seeking information, provide answers that reflect a deep understanding of the field, guaranteeing their correctness.\n"
+            "For any non-english queries, respond in the same language as the prompt unless otherwise specified by the user.\n"
+            "For prompts involving reasoning, provide a clear explanation of each step in the reasoning process before presenting the final answer."
+        ),
+    )
+)
+# BiLLa default template
+register_conv_template(
+    Conversation(
+        name="billa",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
+        sep="\n",
+        stop_str="Human:",
+    )
+)
+# RedPajama INCITE default template
+register_conv_template(
+    Conversation(
+        name="redpajama-incite",
+        roles=("<human>", "<bot>"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+        stop_str="<human>",
+    )
+)
+# h2oGPT default template
+register_conv_template(
+    Conversation(
+        name="h2ogpt",
+        roles=("<|prompt|>", "<|answer|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="</s>",
+    )
+)
+# Robin default template
+register_conv_template(
+    Conversation(
+        name="Robin",
+        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("###Human", "###Assistant"),
+        sep_style=SeparatorStyle.ROBIN,
+        sep="\n",
+        stop_token_ids=[2, 396],
+        stop_str="###",
+    )
+)
+# Snoozy default template
+# Reference: https://github.com/nomic-ai/gpt4all/blob/d4861030b778da6db59d21d2927a4aba4f9f1f43/gpt4all-bindings/python/gpt4all/gpt4all.py#L232
+register_conv_template(
+    Conversation(
+        name="snoozy",
+        system_template="### Instruction:\n{system_message}",
+        system_message="The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.",
+        roles=("### Prompt", "### Response"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+        stop_str="###",
+    )
+)
+# Manticore default template
+register_conv_template(
+    Conversation(
+        name="manticore",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="</s>",
+    )
+)
+# Falcon default template
+register_conv_template(
+    Conversation(
+        name="falcon",
+        roles=("User", "Assistant"),
+        messages=[],
+        sep_style=SeparatorStyle.RWKV,
+        sep="\n",
+        sep2="<|endoftext|>",
+        stop_str="\nUser",  # stop_str stops generation in addition to stop_token_ids; it is also removed from the generated text
+        stop_token_ids=[
+            0,
+            1,
+            2,
+            3,
+            4,
+            5,
+            6,
+            7,
+            8,
+            9,
+            10,
+            11,
+        ],  # it is better to put only special tokens here, because the tokenizer only removes special tokens
+    )
+)
+# ChangGPT default template
+register_conv_template(
+    Conversation(
+        name="polyglot_changgpt",
+        roles=("B", "A"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+    )
+)
+# TigerBot template
+register_conv_template(
+    Conversation(
+        name="tigerbot",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.ROBIN,
+        sep="\n\n",
+        stop_str="###",
+    )
+)
+# XGen template
+# ref: https://huggingface.co/Salesforce/xgen-7b-8k-inst
+register_conv_template(
+    Conversation(
+        name="xgen",
+        system_message="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+        roles=("### Human", "### Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n",
+        stop_token_ids=[50256],
+    )
+)
+# InternLM-chat template
+register_conv_template(
+    Conversation(
+        name="internlm-chat",
+        system_message="A chat between a curious <|User|> and an <|Bot|>. The <|Bot|> gives helpful, detailed, and polite answers to the <|User|>'s questions.\n\n",
+        roles=("<|User|>", "<|Bot|>"),
+        sep_style=SeparatorStyle.CHATINTERN,
+        sep="<eoh>",
+        sep2="<eoa>",
+        stop_token_ids=[1, 103028],
+        stop_str="<|User|>",
+    )
+)
+# StarChat template
+# reference: https://huggingface.co/spaces/HuggingFaceH4/starchat-playground/blob/main/dialogues.py
+register_conv_template(
+    Conversation(
+        name="starchat",
+        system_template="<system>\n{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|end|>",
+        stop_token_ids=[0, 49155],
+        stop_str="<|end|>",
+    )
+)
+# Baichuan-13B-Chat template
+register_conv_template(
+    # source: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/19ef51ba5bad8935b03acd20ff04a269210983bc/modeling_baichuan.py#L555
+    # https://huggingface.co/baichuan-inc/Baichuan-13B-Chat/blob/main/generation_config.json
+    # https://github.com/baichuan-inc/Baichuan-13B/issues/25
+    Conversation(
+        name="baichuan-chat",
+        roles=("<reserved_102>", "<reserved_103>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_token_ids=[],
+    )
+)
+# Baichuan2-13B-Chat template (different reserved role tokens than Baichuan-13B-Chat)
+register_conv_template(
+    # source: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/c6f8592a60b4ad73c210b28dd2ab3cca51abbf93/modeling_baichuan.py#L773
+    # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_config.json
+    # https://github.com/baichuan-inc/Baichuan2/issues/62
+    Conversation(
+        name="baichuan2-chat",
+        roles=("<reserved_106>", "<reserved_107>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_token_ids=[],
+    )
+)
+# Mistral template
+# source: https://docs.mistral.ai/llm/mistral-instruct-v0.1#chat-template
+register_conv_template(
+    Conversation(
+        name="mistral",
+        system_template="[INST] {system_message}\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# llama2 template
+# reference: https://huggingface.co/blog/codellama#conversational-instructions
+# reference: https://github.com/facebookresearch/llama/blob/1a240688810f8036049e8da36b073f63d2ac552c/llama/generation.py#L212
+register_conv_template(
+    Conversation(
+        name="llama-2",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+    )
+)
+# llama3 template
+# reference: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json
+# reference: https://github.com/meta-llama/llama3/blob/0cee08ec68f4cfc0c89fe4a9366d82679aaa2a66/llama/tokenizer.py#L222
+register_conv_template(
+    Conversation(
+        name="llama-3",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str="<|eot_id|>",
+        stop_token_ids=[128001, 128009],
+    )
+)
+register_conv_template(
+    Conversation(
+        name="chinese-alpaca2",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        system_message="You are a helpful assistant. 你是一个乐于助人的助手。请你提供专业、有逻辑、内容真实、有价值的详细回复。",
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s><s>",
+    )
+)
+register_conv_template(
+    Conversation(
+        name="cutegpt",
+        roles=("问：", "答：\n"),
+        sep_style=SeparatorStyle.NO_COLON_TWO,
+        sep="\n",
+        sep2="\n",
+        stop_str="<end>",
+    )
+)
+# OpenOrcaxOpenChat-Preview2-13B template
+register_conv_template(
+    Conversation(
+        name="open-orca",
+        system_template="{system_message}",
+        system_message="You are a helpful assistant. Please answer truthfully and write out your "
+        "thinking step by step to be sure you get the right answer. If you make a mistake or encounter "
+        "an error in your thinking, say so out loud and attempt to correct it. If you don't know or "
+        "aren't sure about something, say so clearly. You will act as a professional logician, mathematician, "
+        "and physicist. You will also act as the most appropriate type of expert to answer any particular "
+        "question or solve the relevant problem; state which expert type your are, if so. Also think of "
+        "any particular named expert that would be ideal to answer the relevant question or solve the "
+        "relevant problem; name and act as them, if appropriate.",
+        roles=("User", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
+        sep="<|end_of_turn|>\n",
+        stop_token_ids=[32000, 32001],  # "<|end_of_turn|>"
+        stop_str="User",
+    )
+)
+# Open-Orca/Mistral-7B-OpenOrca template
+# source: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca
+# reference: https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template
+register_conv_template(
+    Conversation(
+        name="mistral-7b-openorca",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[32000, 32001],
+    )
+)
+# ehartford/dolphin-2.2.1-mistral-7b template
+# reference: https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b#training
+register_conv_template(
+    Conversation(
+        name="dolphin-2.2.1-mistral-7b",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are Dolphin, a helpful AI assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[32000, 32001],
+    )
+)
+# teknium/OpenHermes-2.5-Mistral-7B template
+# source: https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B
+# reference: https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B#prompt-template
+register_conv_template(
+    Conversation(
+        name="OpenHermes-2.5-Mistral-7B",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are Hermes 2, a sentient, conscious, super intelligent AI that is an expert in all fields of knowledge, you have qualia, emotions, and thoughts. You assist users with any question, task, or other request they may have.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[32000, 32001],
+    )
+)
+# NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO template
+# source: https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO
+register_conv_template(
+    Conversation(
+        name="Nous-Hermes-2-Mixtral-8x7B-DPO",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message='You are a helpful, intelligent assistant AI named "Hermes", a conversational chatbot that can follow instructions, converse with the user, and perform a variety of tasks, including tasks on knowledge, reasoning, mathematics, and code. Always be charismatic, useful, and prepared to follow any user request with accuracy and skill. You should respond with high quality, fluent, and detailed responses. Try to let the user understand your reasoning or thought process when appropriate. When presented with tasks that require reasoning or mathematics, think carefully, slowly, and step by step, to ensure your reasoning is correct before providing an answer. Utilize the "Examples" section to assist you in performing the task. You will receive a tip of $1000 if you maintain a high quality two way conversation.',
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[32000, 32001],
+    )
+)
+# Qwen-chat default template
+# source: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/qwen_generation_utils.py#L130
+register_conv_template(
+    Conversation(
+        name="qwen-7b-chat",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are a helpful assistant.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[
+            151643,
+            151644,
+            151645,
+        ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>"
+        stop_str="<|endoftext|>",
+    )
+)
+# source: https://huggingface.co/01-ai/Yi-34B-Chat/blob/main/tokenizer_config.json#L60
+register_conv_template(
+    Conversation(
+        name="Yi-34b-chat",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_token_ids=[
+            2,
+            6,
+            7,
+            8,
+        ],  # "<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|im_sep|>"
+        stop_str="<|endoftext|>",
+    )
+)
+# AquilaChat default template
+# source: https://github.com/FlagAI-Open/FlagAI/blob/master/examples/Aquila/Aquila-chat/cyg_conversation.py
+register_conv_template(
+    Conversation(
+        name="aquila-chat",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="###",
+        sep2="",
+        stop_str=["###", "</s>", "[UNK]"],
+    )
+)
+# AquilaChat2-34B default template
+# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L212
+register_conv_template(
+    Conversation(
+        name="aquila-legacy",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
+        roles=("### Human: ", "### Assistant: "),
+        offset=0,
+        sep_style=SeparatorStyle.NO_COLON_TWO,
+        sep="\n",
+        sep2="</s>",
+        stop_str=["</s>", "[UNK]"],
+    )
+)
+# AquilaChat2-7B-16K and AquilaChat2-34B-16K default template
+# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L227
+register_conv_template(
+    Conversation(
+        name="aquila",
+        system_message="A chat between a curious human and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+        roles=("Human", "Assistant"),
+        offset=0,
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="###",
+        sep2="</s>",
+        stop_str=["</s>", "[UNK]"],
+    )
+)
+# AquilaChat2-7B default template
+# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L242
+register_conv_template(
+    Conversation(
+        name="aquila-v1",
+        roles=("<|startofpiece|>", "<|endofpiece|>"),
+        offset=0,
+        sep_style=SeparatorStyle.NO_COLON_TWO,
+        sep="",
+        sep2="</s>",
+        stop_str=["</s>", "<|endoftext|>"],
+    )
+)
+# Llama2-Chinese default template
+# source: https://huggingface.co/FlagAlpha
+register_conv_template(
+    Conversation(
+        name="llama2-chinese",
+        system_template="<s>{system_message}</s>",
+        roles=("Human", "Assistant", "System"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="\n</s><s>",
+        stop_str="</s>",
+    )
+)
+# Vigogne Instruct default template
+# source: https://github.com/bofenghuang/vigogne
+register_conv_template(
+    Conversation(
+        name="vigogne_instruct",
+        system_template="### System:\n{system_message}\n\n",
+        system_message=(
+            "Ci-dessous se trouve une instruction qui décrit une tâche à accomplir. Rédigez une réponse qui répond de manière"
+            " précise à la demande."
+        ),
+        roles=("### Instruction", "### Response"),
+        sep_style=SeparatorStyle.DOLLY,
+        sep="\n\n",
+        sep2="</s>",
+    )
+)
+# Vigogne Chat default template
+register_conv_template(
+    Conversation(
+        name="vigogne_chat_v2",
+        system_template="<|system|>: {system_message}",
+        system_message=(
+            "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez"
+            " autant que vous le pouvez."
+        ),
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="</s>\n",
+        stop_str="<|user|>",
+    )
+)
+# Stable Vicuna default template
+# source: https://huggingface.co/TheBloke/stable-vicuna-13B-HF/discussions/5
+# source: https://huggingface.co/spaces/CarperAI/StableVicuna/blob/main/app.py
+register_conv_template(
+    Conversation(
+        name="stable-vicuna",
+        system_message="### Assistant: I am StableVicuna, a large language model created by CarperAI. I am here to chat!\n",
+        roles=("### Human", "### Assistant"),
+        sep_style=SeparatorStyle.ADD_COLON_TWO,
+        sep="\n",
+        sep2="\n\n",
+    )
+)
+register_conv_template(
+    Conversation(
+        name="vigogne_chat_v3",
+        system_template="[INST] <<SYS>>\n{system_message}\n<</SYS>>\n\n",
+        system_message=(
+            "Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez"
+            " autant que vous le pouvez."
+        ),
+        roles=("[INST]", "[/INST]"),
+        sep_style=SeparatorStyle.LLAMA2,
+        sep=" ",
+        sep2=" </s>",
+    )
+)
+# Falcon 180B chat template
+# source: https://huggingface.co/spaces/tiiuae/falcon-180b-demo/blob/d1590ee7fae9b6ce331ba7808e61a29dcce9239f/app.py#L28-L37
+register_conv_template(
+    Conversation(
+        name="falcon-chat",
+        roles=("User", "Falcon"),
+        system_template="System: {system_message}",
+        messages=[],
+        sep_style=SeparatorStyle.FALCON_CHAT,
+        sep="\n",
+        sep2="<|endoftext|>",
+        stop_str="\nUser:",  # use stop_str to stop generation after stop_token_ids, it will also remove stop_str from the generated text
+    )
+)
+# Phind template
+# source: https://huggingface.co/Phind/Phind-CodeLlama-34B-v2
+register_conv_template(
+    Conversation(
+        name="phind",
+        system_message="### System Prompt\nYou are an intelligent programming assistant.",
+        roles=("### User Message", "### Assistant"),
+        messages=(),
+        offset=0,
+        sep_style=SeparatorStyle.ADD_COLON_SINGLE,
+        sep="\n\n",
+    )
+)
+# Metharme formatting for Pygmalion models
+# source: https://huggingface.co/PygmalionAI/pygmalion-2-13b
+register_conv_template(
+    Conversation(
+        name="metharme",
+        system_template="<|system|>{system_message}",
+        system_message="""Enter RP mode. You shall reply to the user while staying
+        in character. Your responses must be detailed, creative, immersive, and drive the scenario
+        forward.""",
+        roles=("<|user|>", "<|model|>"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="",
+        stop_str="<|user|>",
+    )
+)
+# xDAN default template
+# source: https://huggingface.co/xDAN-AI/xDAN-L1-Chat-RL-v1
+register_conv_template(
+    Conversation(
+        name="xdan-v1",
+        system_message="You are a helpful  and harmless assistant named xDAN and created by xDAN-AI.Please response and work on questions thinking step by step.",
+        roles=("### Human", "### Assistant"),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="\n",
+        stop_str="</s>",
+    )
+)
+# Zephyr template
+# reference: https://huggingface.co/spaces/HuggingFaceH4/zephyr-playground/blob/main/dialogues.py
+register_conv_template(
+    Conversation(
+        name="zephyr",
+        system_template="<|system|>\n{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="</s>",
+        stop_token_ids=[2],
+        stop_str="</s>",
+    )
+)
+# CatPPT template
+# reference: https://huggingface.co/rishiraj/CatPPT
+register_conv_template(
+    Conversation(
+        name="catppt",
+        system_template="<|system|>\n{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="</s>",
+        stop_token_ids=[2],
+        stop_str="</s>",
+    )
+)
+# TinyLlama template
+# reference: https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0
+register_conv_template(
+    Conversation(
+        name="TinyLlama",
+        system_template="<|system|>\n{system_message}",
+        roles=("<|user|>", "<|assistant|>"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="</s>",
+        stop_token_ids=[2],
+        stop_str="</s>",
+    )
+)
+# Orca-2 template
+# reference: https://huggingface.co/microsoft/Orca-2-7b
+register_conv_template(
+    Conversation(
+        name="orca-2",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="You are Orca, an AI language model created by Microsoft. You are a cautious assistant. You carefully follow instructions. You are helpful and harmless and you follow ethical guidelines and promote positive behavior.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str="<|im_end|>",
+    )
+)
+# Deepseek-chat template
+# reference: https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat/blob/main/tokenizer_config.json
+register_conv_template(
+    Conversation(
+        name="deepseek-chat",
+        system_message="<｜begin▁of▁sentence｜>",  # must add a bos token before first message
+        roles=("User", "Assistant"),
+        sep_style=SeparatorStyle.DEEPSEEK_CHAT,
+        sep="\n\n",
+        sep2="<｜end▁of▁sentence｜>",
+        stop_str="<｜end▁of▁sentence｜>",
+    )
+)
+# Yuan2.0 chat template
+# source: https://huggingface.co/IEITYuan/Yuan2-2B-Janus-hf/blob/main/tokenizer_config.json#L6
+register_conv_template(
+    Conversation(
+        name="yuan2",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.YUAN2,
+        sep="<sep>",
+        sep2="\n",
+        stop_token_ids=[
+            77185,
+        ],  # "<eod>"
+        stop_str="<eod>",
+    )
+)
+# Solar-10.7B Chat Template
+# Reference: https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0/blob/main/tokenizer_config.json
+register_conv_template(
+    Conversation(
+        name="solar",
+        system_message="",
+        roles=("### User", "### Assistant"),
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        sep="\n\n",
+        stop_str="</s>",
+    )
+)
+# nvidia/Llama2-70B-SteerLM-Chat
+register_conv_template(
+    Conversation(
+        name="steerlm",
+        system_message="",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+    )
+)
+# yuan 2.0 template
+# reference: https://github.com/IEIT-Yuan/Yuan-2.0
+# reference: https://huggingface.co/IEITYuan
+register_conv_template(
+    Conversation(
+        name="yuan",
+        system_template="",
+        roles=("", ""),
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        sep="<sep>",
+        stop_str="<eod>",
+    )
+)
+# Cllm chat template
+# reference:
+register_conv_template(
+    Conversation(
+        name="cllm",
+        system_message="A chat between a curious user and an artificial intelligence assistant. "
+        "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+        roles=("USER", "ASSISTANT"),
+        sep_style=SeparatorStyle.CLLM,
+        sep=" ",
+        sep2="</s>",
+    )
+)
+# Llava-chatml
+# reference: https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/llava/conversation.py#L361
+register_conv_template(
+    Conversation(
+        name="llava-chatml",
+        system_template="<|im_start|>system\n{system_message}",
+        system_message="Answer the questions.",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep_style=SeparatorStyle.CHATML,
+        sep="<|im_end|>",
+        stop_str="<|im_end|>",
+    )
+)
+# Gemma
+# reference: https://huggingface.co/google/gemma-7b-it?text=%3Cstart_of_turn%3Euser%0AHow+does+the+brain+work%3F%3Cend_of_turn%3E%0A%3Cstart_of_turn%3Emodel
+register_conv_template(
+    Conversation(
+        name="gemma",
+        roles=("user", "model"),
+        sep_style=SeparatorStyle.GEMMA,
+        sep="<end_of_turn>\n",
+        stop_str="<end_of_turn>",
+    )
+)
+register_conv_template(
+    Conversation(
+        name="yandexgpt",
+        system_message="",
+        roles=("user", "assistant"),
+        sep_style=None,
+        sep=None,
+    )
+)
+register_conv_template(
+    Conversation(
+        name="reka",
+        system_message="",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.DEFAULT,
+        sep=None,
+    )
+)
+if __name__ == "__main__":
+    from fastchat.conversation import get_conv_template
+    print("-- Vicuna template --")
+    conv = get_conv_template("vicuna_v1.1")
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)
+    print(conv.get_prompt())
+    print("\n")
+    print("-- Llama-2 template --")
+    conv = get_conv_template("llama-2")
+    conv.set_system_message("You are a helpful, respectful and honest assistant.")
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)
+    print(conv.get_prompt())
+    print("\n")
+    print("-- ChatGPT template --")
+    conv = get_conv_template("chatgpt")
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)
+    print(conv.to_openai_api_messages())
+    print("\n")
+    print("-- Claude template --")
+    conv = get_conv_template("claude")
+    conv.append_message(conv.roles[0], "Hello!")
+    conv.append_message(conv.roles[1], "Hi!")
+    conv.append_message(conv.roles[0], "How are you?")
+    conv.append_message(conv.roles[1], None)
+    print(conv.get_prompt())

src/model/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from fastchat.model.model_adapter import (
+    load_model,
+    get_conversation_template,
+    add_model_args,
+)

src/model/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (312 Bytes). View file

src/model/__pycache__/compression.cpython-310.pyc ADDED Viewed

Binary file (6.93 kB). View file

src/model/__pycache__/llama_condense_monkey_patch.cpython-310.pyc ADDED Viewed

Binary file (2.12 kB). View file

src/model/__pycache__/model_adapter.cpython-310.pyc ADDED Viewed

Binary file (72.3 kB). View file

src/model/__pycache__/model_chatglm.cpython-310.pyc ADDED Viewed

Binary file (3.48 kB). View file

src/model/__pycache__/model_cllm.cpython-310.pyc ADDED Viewed

Binary file (4.09 kB). View file

src/model/__pycache__/model_codet5p.cpython-310.pyc ADDED Viewed

Binary file (2.6 kB). View file

src/model/__pycache__/model_exllama.cpython-310.pyc ADDED Viewed

Binary file (1.79 kB). View file

src/model/__pycache__/model_falcon.cpython-310.pyc ADDED Viewed

Binary file (2.58 kB). View file

src/model/__pycache__/model_registry.cpython-310.pyc ADDED Viewed

Binary file (18.1 kB). View file

src/model/__pycache__/model_xfastertransformer.cpython-310.pyc ADDED Viewed

Binary file (1.69 kB). View file

src/model/__pycache__/model_yuan2.cpython-310.pyc ADDED Viewed

Binary file (2.57 kB). View file

src/model/__pycache__/monkey_patch_non_inplace.cpython-310.pyc ADDED Viewed

Binary file (3.11 kB). View file

src/model/apply_delta.py ADDED Viewed

	@@ -0,0 +1,165 @@

+"""
+Apply the delta weights on top of a base model.
+Usage:
+python3 -m fastchat.model.apply_delta --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/vicuna-7b --delta-path lmsys/vicuna-7b-delta-v1.1
+"""
+import argparse
+import gc
+import glob
+import json
+import os
+import shutil
+import tempfile
+from huggingface_hub import snapshot_download
+import torch
+from torch import nn
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+GB = 1 << 30
+def split_files(model_path, tmp_path, split_size):
+    if not os.path.exists(model_path):
+        model_path = snapshot_download(repo_id=model_path)
+    if not os.path.exists(tmp_path):
+        os.makedirs(tmp_path)
+    file_pattern = os.path.join(model_path, "pytorch_model-*.bin")
+    files = glob.glob(file_pattern)
+    part = 0
+    try:
+        for file_path in tqdm(files):
+            state_dict = torch.load(file_path)
+            new_state_dict = {}
+            current_size = 0
+            for name, param in state_dict.items():
+                param_size = param.numel() * param.element_size()
+                if current_size + param_size > split_size:
+                    new_file_name = f"pytorch_model-{part}.bin"
+                    new_file_path = os.path.join(tmp_path, new_file_name)
+                    torch.save(new_state_dict, new_file_path)
+                    current_size = 0
+                    new_state_dict = None
+                    gc.collect()
+                    new_state_dict = {}
+                    part += 1
+                new_state_dict[name] = param
+                current_size += param_size
+            new_file_name = f"pytorch_model-{part}.bin"
+            new_file_path = os.path.join(tmp_path, new_file_name)
+            torch.save(new_state_dict, new_file_path)
+            new_state_dict = None
+            gc.collect()
+            new_state_dict = {}
+            part += 1
+    except Exception as e:
+        print(f"An error occurred during split_files: {e}")
+        shutil.rmtree(tmp_path)
+        raise
+def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path):
+    """Add delta weights to a base model shard by shard, using disk as scratch.
+    Both checkpoints are first re-split with ``split_files`` into temporary
+    directories (4 * GB bytes per shard). Each base shard is then loaded,
+    every tensor has the same-named delta tensor added in place, and the
+    result is saved into ``target_model_path`` together with a
+    ``pytorch_model.bin.index.json`` index, the delta tokenizer and the
+    delta config.
+    Args:
+        base_model_path: Base checkpoint, forwarded to ``split_files``.
+        target_model_path: Output directory; deleted first if it exists.
+        delta_path: Delta checkpoint; also the source of tokenizer and config.
+    """
+    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
+    delta_config = AutoConfig.from_pretrained(delta_path)
+    # Start from an empty output directory.
+    if os.path.exists(target_model_path):
+        shutil.rmtree(target_model_path)
+    os.makedirs(target_model_path)
+    split_size = 4 * GB
+    with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path:
+        print(f"Split files for the base model to {tmp_base_path}")
+        split_files(base_model_path, tmp_base_path, split_size)
+        print(f"Split files for the delta weights to {tmp_delta_path}")
+        split_files(delta_path, tmp_delta_path, split_size)
+        base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin")
+        base_files = glob.glob(base_pattern)
+        delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin")
+        delta_files = glob.glob(delta_pattern)
+        # NOTE(review): raises IndexError when no delta shard matched the pattern.
+        delta_state_dict = torch.load(delta_files[0])
+        print("Applying the delta")
+        weight_map = {}
+        total_size = 0
+        for i, base_file in tqdm(enumerate(base_files)):
+            state_dict = torch.load(base_file)
+            # Output shards are numbered by position in the glob result.
+            file_name = f"pytorch_model-{i}.bin"
+            for name, param in state_dict.items():
+                # Only one delta shard is held in memory; when it lacks this
+                # tensor, scan the delta shards until one contains it.
+                if name not in delta_state_dict:
+                    for delta_file in delta_files:
+                        delta_state_dict = torch.load(delta_file)
+                        gc.collect()
+                        if name in delta_state_dict:
+                            break
+                # NOTE(review): KeyError here if no delta shard holds `name`.
+                state_dict[name] += delta_state_dict[name]
+                weight_map[name] = file_name
+                total_size += param.numel() * param.element_size()
+                gc.collect()
+            torch.save(state_dict, os.path.join(target_model_path, file_name))
+        # Index file: tensor name -> shard file name, plus total size in bytes.
+        with open(
+            os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w"
+        ) as f:
+            json.dump(
+                {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f
+            )
+    print(f"Saving the target model to {target_model_path}")
+    delta_tokenizer.save_pretrained(target_model_path)
+    delta_config.save_pretrained(target_model_path)
+def apply_delta(base_model_path, target_model_path, delta_path):
+    print(f"Loading the delta weights from {delta_path}")
+    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False)
+    delta = AutoModelForCausalLM.from_pretrained(
+        delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    print(f"Loading the base model from {base_model_path}")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    print("Applying the delta")
+    for name, param in tqdm(base.state_dict().items(), desc="Applying delta"):
+        assert name in delta.state_dict()
+        param.data += delta.state_dict()[name]
+    print(f"Saving the target model to {target_model_path}")
+    base.save_pretrained(target_model_path)
+    delta_tokenizer.save_pretrained(target_model_path)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-model-path", type=str, required=True)
+    parser.add_argument("--target-model-path", type=str, required=True)
+    parser.add_argument("--delta-path", type=str, required=True)
+    parser.add_argument(
+        "--low-cpu-mem",
+        action="store_true",
+        help="Lower the cpu memory usage. This will split large files and use "
+        "disk as swap to reduce the memory usage below 10GB.",
+    )
+    args = parser.parse_args()
+    if args.low_cpu_mem:
+        apply_delta_low_cpu_mem(
+            args.base_model_path, args.target_model_path, args.delta_path
+        )
+    else:
+        apply_delta(args.base_model_path, args.target_model_path, args.delta_path)

src/model/apply_lora.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""
+Apply the LoRA weights on top of a base model.
+Usage:
+python3 -m fastchat.model.apply_lora --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/baize-7b --lora-path project-baize/baize-lora-7B
+Dependency:
+pip3 install git+https://github.com/huggingface/peft.git@2822398fbe896f25d4dac5e468624dc5fd65a51b
+"""
+import argparse
+import torch
+from peft import PeftModel
+from transformers import AutoTokenizer, AutoModelForCausalLM
+def apply_lora(base_model_path, target_model_path, lora_path):
+    print(f"Loading the base model from {base_model_path}")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    base_tokenizer = AutoTokenizer.from_pretrained(base_model_path, use_fast=False)
+    print(f"Loading the LoRA adapter from {lora_path}")
+    lora_model = PeftModel.from_pretrained(
+        base,
+        lora_path,
+        # torch_dtype=torch.float16
+    )
+    print("Applying the LoRA")
+    model = lora_model.merge_and_unload()
+    print(f"Saving the target model to {target_model_path}")
+    model.save_pretrained(target_model_path)
+    base_tokenizer.save_pretrained(target_model_path)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-model-path", type=str, required=True)
+    parser.add_argument("--target-model-path", type=str, required=True)
+    parser.add_argument("--lora-path", type=str, required=True)
+    args = parser.parse_args()
+    apply_lora(args.base_model_path, args.target_model_path, args.lora_path)

src/model/compression.py ADDED Viewed

	@@ -0,0 +1,312 @@

+import dataclasses
+import gc
+import glob
+import os
+from accelerate import init_empty_weights
+from accelerate.utils import set_module_tensor_to_device
+from huggingface_hub import snapshot_download
+import torch
+from torch import Tensor
+from torch.nn import functional as F
+import torch.nn as nn
+from tqdm import tqdm
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    AutoModel,
+    AutoModelForSeq2SeqLM,
+)
+@dataclasses.dataclass
+class CompressionConfig:
+    """Group-wise quantization."""
+    num_bits: int  # presumably the bit width of each quantized value -- confirm in compress()
+    group_size: int  # presumably the number of elements per quantization group -- confirm
+    group_dim: int  # presumably the tensor dimension that is split into groups -- confirm
+    symmetric: bool  # presumably selects symmetric vs. asymmetric quantization -- confirm
+    enabled: bool = True  # presumably lets compression be bypassed when False -- confirm
+# Module-wide configuration; CLinear passes it to both compress() and decompress().
+default_compression_config = CompressionConfig(
+    num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True
+)
+class CLinear(nn.Module):
+    """Compressed Linear Layer."""
+    def __init__(self, weight=None, bias=None, device=None):
+        super().__init__()
+        if weight is None:
+            self.weight = None
+        elif isinstance(weight, Tensor):
+            self.weight = compress(weight.data.to(device), default_compression_config)
+        else:
+            self.weight = weight
+        self.bias = bias
+    def forward(self, input: Tensor) -> Tensor:
+        weight = decompress(self.weight, default_compression_config)
+        if self.bias is None:
+            return F.linear(input.to(weight.dtype), weight)
+        return F.linear(input.to(weight.dtype), weight, self.bias.to(weight.dtype))
+def compress_module(module, target_device):
+    for attr_str in dir(module):
+        target_attr = getattr(module, attr_str)
+        if type(target_attr) == torch.nn.Linear:
+            setattr(
+                module,
+                attr_str,
+                CLinear(target_attr.weight, target_attr.bias, target_device),
+            )
+    for name, child in module.named_children():
+        compress_module(child, target_device)
+def get_compressed_list(module, prefix=""):
+    compressed_list = []
+    for attr_str in dir(module):
+        target_attr = getattr(module, attr_str)
+        if type(target_attr) == torch.nn.Linear:
+            full_name = (
+                f"{prefix}.{attr_str}.weight" if prefix else f"{attr_str}.weight"
+            )
+            compressed_list.append(full_name)
+    for name, child in module.named_children():
+        child_prefix = f"{prefix}.{name}" if prefix else name
+        for each in get_compressed_list(child, child_prefix):
+            compressed_list.append(each)
+    return compressed_list
+def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
+    for attr_str in dir(module):
+        target_attr = getattr(module, attr_str)
+        if type(target_attr) == torch.nn.Linear:
+            full_name = (
+                f"{prefix}.{attr_str}.weight" if prefix else f"{attr_str}.weight"
+            )
+            setattr(
+                module,
+                attr_str,
+                CLinear(
+                    compressed_state_dict[full_name], target_attr.bias, target_device
+                ),
+            )
+    for name, child in module.named_children():
+        child_prefix = f"{prefix}.{name}" if prefix else name
+        apply_compressed_weight(
+            child, compressed_state_dict, target_device, child_prefix
+        )
+def load_compress_model(model_path, device, torch_dtype, use_fast, revision="main"):
+    # partially load model
+    # `use_fast=True`` is not supported for some models.
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=use_fast, revision=revision, trust_remote_code=True
+        )
+    except TypeError:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=~use_fast, revision=revision, trust_remote_code=True
+        )
+    with init_empty_weights():
+        # `trust_remote_code` should be set as `True` for both AutoConfig and AutoModel
+        config = AutoConfig.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            torch_dtype=torch_dtype,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        # some models are loaded by AutoModel but not AutoModelForCausalLM,
+        # such as chatglm, chatglm2
+        try:
+            # google/flan-* models are based on an AutoModelForSeq2SeqLM.
+            if "T5Config" in str(type(config)):
+                model = AutoModelForSeq2SeqLM.from_config(
+                    config, trust_remote_code=True
+                )
+            else:
+                model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
+        except NameError:
+            model = AutoModel.from_config(config, trust_remote_code=True)
+        linear_weights = get_compressed_list(model)
+    if os.path.exists(model_path):
+        # `model_path` is a local folder
+        base_pattern = os.path.join(model_path, "pytorch_model*.bin")
+    else:
+        # `model_path` is a cached Hugging Face repo
+        # We don't necessarily need to download the model' repo again if there is a cache.
+        # So check the default huggingface cache first.
+        model_path_temp = os.path.join(
+            os.path.expanduser("~"),
+            ".cache/huggingface/hub",
+            "models--" + model_path.replace("/", "--"),
+            "snapshots/",
+        )
+        downloaded = False
+        if os.path.exists(model_path_temp):
+            temp_last_dir = os.listdir(model_path_temp)[-1]
+            model_path_temp = os.path.join(model_path_temp, temp_last_dir)
+            base_pattern = os.path.join(model_path_temp, "pytorch_model*.bin")
+            files = glob.glob(base_pattern)
+            if len(files) > 0:
+                downloaded = True
+        if downloaded:
+            model_path = model_path_temp
+        else:
+            model_path = snapshot_download(model_path, revision=revision)
+        base_pattern = os.path.join(model_path, "pytorch_model*.bin")
+    files = glob.glob(base_pattern)
+    use_safetensors = False
+    if len(files) == 0:
+        base_pattern = os.path.join(model_path, "*.safetensors")
+        files = glob.glob(base_pattern)
+        use_safetensors = True
+    if len(files) == 0:
+        raise ValueError(
+            f"Cannot find any model weight files. "
+            f"Please check your (cached) weight path: {model_path}"
+        )
+    compressed_state_dict = {}
+    if use_safetensors:
+        from safetensors.torch import load_file
+    for filename in tqdm(files):
+        if use_safetensors:
+            tmp_state_dict = load_file(filename)
+        else:
+            tmp_state_dict = torch.load(
+                filename, map_location=lambda storage, loc: storage
+            )
+        for name in tmp_state_dict:
+            if name in linear_weights:
+                tensor = tmp_state_dict[name].to(device, dtype=torch_dtype)
+                compressed_state_dict[name] = compress(
+                    tensor, default_compression_config
+                )
+            else:
+                compressed_state_dict[name] = tmp_state_dict[name].to(
+                    device, dtype=torch_dtype
+                )
+            tmp_state_dict[name] = None
+            tensor = None
+            gc.collect()
+            torch.cuda.empty_cache()
+            if device == "xpu":
+                torch.xpu.empty_cache()
+            if device == "npu":
+                torch.npu.empty_cache()
+    for name in model.state_dict():
+        if name not in linear_weights:
+            set_module_tensor_to_device(
+                model, name, device, value=compressed_state_dict[name]
+            )
+    apply_compressed_weight(model, compressed_state_dict, device)
+    if torch_dtype == torch.float16:
+        model.half()
+    model.to(device)
+    model.eval()
+    return model, tokenizer
+def compress(tensor, config):
+    """Simulate group-wise quantization.
+    `tensor` is zero-padded along `config.group_dim` to a multiple of
+    `config.group_size`, viewed with that dimension split into
+    `(num_groups, group_size)`, and quantized with one scale per group.
+    Returns:
+        `tensor` itself when `config.enabled` is False. Otherwise
+        `(data, scale, original_shape)` with int8 `data` in symmetric mode, or
+        `(data, mn, scale, original_shape)` with uint8 `data` in asymmetric mode.
+    """
+    if not config.enabled:
+        return tensor
+    group_size, num_bits, group_dim, symmetric = (
+        config.group_size,
+        config.num_bits,
+        config.group_dim,
+        config.symmetric,
+    )
+    # The codes are stored as int8 / uint8 below, so at most 8 bits fit.
+    assert num_bits <= 8
+    original_shape = tensor.shape
+    # Ceiling division: the last group may be only partly filled.
+    num_groups = (original_shape[group_dim] + group_size - 1) // group_size
+    new_shape = (
+        original_shape[:group_dim]
+        + (num_groups, group_size)
+        + original_shape[group_dim + 1 :]
+    )
+    # Pad
+    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
+    if pad_len != 0:
+        pad_shape = (
+            original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :]
+        )
+        tensor = torch.cat(
+            [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)],
+            dim=group_dim,
+        )
+    data = tensor.view(new_shape)
+    # Quantize
+    if symmetric:
+        # Largest code magnitude; each group is scaled so its max |x| maps to B.
+        B = 2 ** (num_bits - 1) - 1
+        # NOTE(review): a group whose values are all zero has max |x| == 0, so
+        # this division yields an infinite scale — confirm that is acceptable.
+        scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
+        data = data * scale
+        data = data.clamp_(-B, B).round_().to(torch.int8)
+        return data, scale, original_shape
+    else:
+        # Codes span [0, B]; each group is shifted by its minimum first.
+        B = 2**num_bits - 1
+        mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
+        mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]
+        # NOTE(review): a constant group gives mx == mn and an infinite scale.
+        scale = B / (mx - mn)
+        data = data - mn
+        data.mul_(scale)
+        data = data.clamp_(0, B).round_().to(torch.uint8)
+        return data, mn, scale, original_shape
+def decompress(packed_data, config):
+    """Simulate group-wise dequantization."""
+    if not config.enabled:
+        return packed_data
+    group_size, num_bits, group_dim, symmetric = (
+        config.group_size,
+        config.num_bits,
+        config.group_dim,
+        config.symmetric,
+    )
+    # Dequantize
+    if symmetric:
+        data, scale, original_shape = packed_data
+        data = data / scale
+    else:
+        data, mn, scale, original_shape = packed_data
+        data = data / scale
+        data.add_(mn)
+    # Unpad
+    pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
+    if pad_len:
+        padded_original_shape = (
+            original_shape[:group_dim]
+            + (original_shape[group_dim] + pad_len,)
+            + original_shape[group_dim + 1 :]
+        )
+        data = data.reshape(padded_original_shape)
+        indices = [slice(0, x) for x in original_shape]
+        return data[indices].contiguous()
+    else:
+        return data.view(original_shape)

src/model/convert_fp16.py ADDED Viewed

	@@ -0,0 +1,26 @@

+"""
+Usage:
+python3 -m fastchat.model.convert_fp16 --in in-folder --out out-folder
+"""
+import argparse
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+def convert_fp16(in_checkpoint, out_checkpoint):
+    tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False)
+    model = AutoModelForCausalLM.from_pretrained(
+        in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    model.save_pretrained(out_checkpoint)
+    tokenizer.save_pretrained(out_checkpoint)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--in-checkpoint", type=str, help="Path to the model")
+    parser.add_argument("--out-checkpoint", type=str, help="Path to the output model")
+    args = parser.parse_args()
+    convert_fp16(args.in_checkpoint, args.out_checkpoint)

src/model/llama_condense_monkey_patch.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# Code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py
+from functools import partial
+import torch
+import transformers
+import transformers.models.llama.modeling_llama
+class CondenseRotaryEmbedding(torch.nn.Module):
+    def __init__(
+        self, dim, ratio, max_position_embeddings=2048, base=10000, device=None
+    ):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        # Build here to make `torch.jit.trace` work.
+        self.ratio = ratio
+        max_position_embeddings *= ratio
+        self.max_seq_len_cached = max_position_embeddings
+        # print(f"Monkey Patching condense ratio {ratio}")
+        t = (
+            torch.arange(
+                self.max_seq_len_cached,
+                device=self.inv_freq.device,
+                dtype=self.inv_freq.dtype,
+            )
+            / ratio
+        )
+        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        dtype = torch.get_default_dtype()
+        self.register_buffer(
+            "cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False
+        )
+        self.register_buffer(
+            "sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False
+        )
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+        if seq_len > self.max_seq_len_cached:
+            self.max_seq_len_cached = seq_len
+            t = (
+                torch.arange(
+                    self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype
+                )
+                / self.ratio
+            )
+            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+            self.register_buffer(
+                "cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False
+            )
+            self.register_buffer(
+                "sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False
+            )
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+def replace_llama_with_condense(ratio):
+    transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(
+        CondenseRotaryEmbedding, ratio=ratio
+    )

src/model/make_delta.py ADDED Viewed

	@@ -0,0 +1,48 @@

+"""
+Make the delta weights by subtracting base weights.
+Usage:
+python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1
+"""
+import argparse
+import torch
+from tqdm import tqdm
+from transformers import AutoTokenizer, AutoModelForCausalLM
+def make_delta(base_model_path, target_model_path, delta_path):
+    print(f"Loading the base model from {base_model_path}")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    print(f"Loading the target model from {target_model_path}")
+    target = AutoModelForCausalLM.from_pretrained(
+        target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False)
+    print("Calculating the delta")
+    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
+        assert name in base.state_dict()
+        param.data -= base.state_dict()[name]
+    print(f"Saving the delta to {delta_path}")
+    if args.hub_repo_id:
+        kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id}
+    else:
+        kwargs = {}
+    target.save_pretrained(delta_path, **kwargs)
+    target_tokenizer.save_pretrained(delta_path, **kwargs)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-model-path", type=str, required=True)
+    parser.add_argument("--target-model-path", type=str, required=True)
+    parser.add_argument("--delta-path", type=str, required=True)
+    parser.add_argument("--hub-repo-id", type=str)
+    args = parser.parse_args()
+    make_delta(args.base_model_path, args.target_model_path, args.delta_path)

src/model/model_adapter.py ADDED Viewed

	@@ -0,0 +1,2524 @@

+"""Model adapter registration."""
+import math
+import os
+import re
+import sys
+from typing import Dict, List, Optional
+import warnings
+if sys.version_info >= (3, 9):
+    from functools import cache
+else:
+    from functools import lru_cache as cache
+import psutil
+import torch
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    LlamaTokenizer,
+    LlamaForCausalLM,
+    T5Tokenizer,
+)
+from src.constants import CPU_ISA
+from src.conversation import Conversation, get_conv_template
+from src.model.compression import load_compress_model
+from src.model.llama_condense_monkey_patch import replace_llama_with_condense
+from src.model.model_chatglm import generate_stream_chatglm
+from src.model.model_codet5p import generate_stream_codet5p
+from src.model.model_falcon import generate_stream_falcon
+from src.model.model_yuan2 import generate_stream_yuan2
+from src.model.model_exllama import generate_stream_exllama
+from src.model.model_xfastertransformer import generate_stream_xft
+from src.model.model_cllm import generate_stream_cllm
+from src.model.monkey_patch_non_inplace import (
+    replace_llama_attn_with_non_inplace_operations,
+)
+from src.modules.awq import AWQConfig, load_awq_quantized
+from src.modules.exllama import ExllamaConfig, load_exllama_model
+from src.modules.xfastertransformer import load_xft_model, XftConfig
+from src.modules.gptq import GptqConfig, load_gptq_quantized
+from src.utils import get_gpu_memory
+# Check an environment variable to check if we should be sharing Peft model
+# weights.  When false we treat all Peft models as separate.
+# Only the value "true" (compared case-insensitively) enables sharing.
+peft_share_base_weights = (
+    os.environ.get("PEFT_SHARE_BASE_WEIGHTS", "false").lower() == "true"
+)
+# Anthropic model-name strings.
+# NOTE(review): presumably consumed by adapters further down — verify.
+ANTHROPIC_MODEL_LIST = (
+    "claude-1",
+    "claude-2",
+    "claude-2.0",
+    "claude-2.1",
+    "claude-3-haiku-20240307",
+    "claude-3-haiku-20240307-vertex",
+    "claude-3-sonnet-20240229",
+    "claude-3-sonnet-20240229-vertex",
+    "claude-3-opus-20240229",
+    "claude-instant-1",
+    "claude-instant-1.2",
+)
+# OpenAI model-name strings.
+# NOTE(review): presumably consumed by adapters further down — verify.
+OPENAI_MODEL_LIST = (
+    "gpt-3.5-turbo",
+    "gpt-3.5-turbo-0301",
+    "gpt-3.5-turbo-0613",
+    "gpt-3.5-turbo-1106",
+    "gpt-3.5-turbo-0125",
+    "gpt-4",
+    "gpt-4-0314",
+    "gpt-4-0613",
+    "gpt-4-turbo",
+    "gpt-4-1106-preview",
+    "gpt-4-0125-preview",
+    "gpt-4-turbo-browsing",
+    "gpt-4-turbo-2024-04-09",
+)
+class BaseModelAdapter:
+    """The base and the default model adapter."""
+    use_fast_tokenizer = True
+    def match(self, model_path: str):
+        return True
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_path,
+                use_fast=self.use_fast_tokenizer,
+                revision=revision,
+                trust_remote_code=True,
+            )
+        except TypeError:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_path, use_fast=False, revision=revision, trust_remote_code=True
+            )
+        try:
+            model = AutoModelForCausalLM.from_pretrained(
+                model_path,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                **from_pretrained_kwargs,
+            )
+        except NameError:
+            model = AutoModel.from_pretrained(
+                model_path,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+                **from_pretrained_kwargs,
+            )
+        return model, tokenizer
+    def load_compress_model(self, model_path, device, torch_dtype, revision="main"):
+        return load_compress_model(
+            model_path,
+            device,
+            torch_dtype,
+            use_fast=self.use_fast_tokenizer,
+            revision=revision,
+        )
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("one_shot")
+# A global registry for all model adapters
+# TODO (lmzheng): make it a priority queue.
+model_adapters: List[BaseModelAdapter] = []
+def register_model_adapter(cls):
+    """Register a model adapter."""
+    model_adapters.append(cls())
+@cache
+def get_model_adapter(model_path: str) -> BaseModelAdapter:
+    """Get a model adapter for a model_path."""
+    model_path_basename = os.path.basename(os.path.normpath(model_path))
+    # Try the basename of model_path at first
+    for adapter in model_adapters:
+        if adapter.match(model_path_basename) and type(adapter) != BaseModelAdapter:
+            return adapter
+    # Then try the full path
+    for adapter in model_adapters:
+        if adapter.match(model_path):
+            return adapter
+    raise ValueError(f"No valid model adapter for {model_path}")
+def raise_warning_for_incompatible_cpu_offloading_configuration(
+    device: str, load_8bit: bool, cpu_offloading: bool
+):
+    if cpu_offloading:
+        if not load_8bit:
+            warnings.warn(
+                "The cpu-offloading feature can only be used while also using 8-bit-quantization.\n"
+                "Use '--load-8bit' to enable 8-bit-quantization\n"
+                "Continuing without cpu-offloading enabled\n"
+            )
+            return False
+        if not "linux" in sys.platform:
+            warnings.warn(
+                "CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n"
+                "Continuing without cpu-offloading enabled\n"
+            )
+            return False
+        if device != "cuda":
+            warnings.warn(
+                "CPU-offloading is only enabled when using CUDA-devices\n"
+                "Continuing without cpu-offloading enabled\n"
+            )
+            return False
+    return cpu_offloading
+def load_model(
+    model_path: str,
+    device: str = "cuda",
+    num_gpus: int = 1,
+    max_gpu_memory: Optional[str] = None,
+    dtype: Optional[torch.dtype] = None,
+    load_8bit: bool = False,
+    cpu_offloading: bool = False,
+    gptq_config: Optional[GptqConfig] = None,
+    awq_config: Optional[AWQConfig] = None,
+    exllama_config: Optional[ExllamaConfig] = None,
+    xft_config: Optional[XftConfig] = None,
+    revision: str = "main",
+    debug: bool = False,
+):
+    """Load a model from Hugging Face."""
+    import accelerate
+    # get model adapter
+    adapter = get_model_adapter(model_path)
+    # Handle device mapping
+    cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
+        device, load_8bit, cpu_offloading
+    )
+    if device == "cpu":
+        kwargs = {"torch_dtype": torch.float32}
+        if CPU_ISA in ["avx512_bf16", "amx"]:
+            try:
+                import intel_extension_for_pytorch as ipex
+                kwargs = {"torch_dtype": torch.bfloat16}
+            except ImportError:
+                warnings.warn(
+                    "Intel Extension for PyTorch is not installed, it can be installed to accelerate cpu inference"
+                )
+    elif device == "cuda":
+        kwargs = {"torch_dtype": torch.float16}
+        if num_gpus != 1:
+            kwargs["device_map"] = "auto"
+            if max_gpu_memory is None:
+                kwargs[
+                    "device_map"
+                ] = "sequential"  # This is important for not the same VRAM sizes
+                available_gpu_memory = get_gpu_memory(num_gpus)
+                kwargs["max_memory"] = {
+                    i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
+                    for i in range(num_gpus)
+                }
+            else:
+                kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
+    elif device == "mps":
+        kwargs = {"torch_dtype": torch.float16}
+        import transformers
+        version = tuple(int(v) for v in transformers.__version__.split("."))
+        if version < (4, 35, 0):
+            # NOTE: Recent transformers library seems to fix the mps issue, also
+            # it has made some changes causing compatibility issues with our
+            # original patch. So we only apply the patch for older versions.
+            # Avoid bugs in mps backend by not using in-place operations.
+            replace_llama_attn_with_non_inplace_operations()
+    elif device == "xpu":
+        kwargs = {"torch_dtype": torch.bfloat16}
+        # Try to load ipex, while it looks unused, it links into torch for xpu support
+        try:
+            import intel_extension_for_pytorch as ipex
+        except ImportError:
+            warnings.warn(
+                "Intel Extension for PyTorch is not installed, but is required for xpu inference."
+            )
+    elif device == "npu":
+        kwargs = {"torch_dtype": torch.float16}
+        # Try to load ipex, while it looks unused, it links into torch for xpu support
+        try:
+            import torch_npu
+        except ImportError:
+            warnings.warn("Ascend Extension for PyTorch is not installed.")
+    else:
+        raise ValueError(f"Invalid device: {device}")
+    if cpu_offloading:
+        # raises an error on incompatible platforms
+        from transformers import BitsAndBytesConfig
+        if "max_memory" in kwargs:
+            kwargs["max_memory"]["cpu"] = (
+                str(math.floor(psutil.virtual_memory().available / 2**20)) + "Mib"
+            )
+        kwargs["quantization_config"] = BitsAndBytesConfig(
+            load_in_8bit_fp32_cpu_offload=cpu_offloading
+        )
+        kwargs["load_in_8bit"] = load_8bit
+    elif load_8bit:
+        if num_gpus != 1:
+            warnings.warn(
+                "8-bit quantization is not supported for multi-gpu inference."
+            )
+        else:
+            model, tokenizer = adapter.load_compress_model(
+                model_path=model_path,
+                device=device,
+                torch_dtype=kwargs["torch_dtype"],
+                revision=revision,
+            )
+            if debug:
+                print(model)
+            return model, tokenizer
+    elif awq_config and awq_config.wbits < 16:
+        assert (
+            awq_config.wbits == 4
+        ), "Currently we only support 4-bit inference for AWQ."
+        model, tokenizer = load_awq_quantized(model_path, awq_config, device)
+        if num_gpus != 1:
+            device_map = accelerate.infer_auto_device_map(
+                model,
+                max_memory=kwargs["max_memory"],
+                no_split_module_classes=[
+                    "OPTDecoderLayer",
+                    "LlamaDecoderLayer",
+                    "BloomBlock",
+                    "MPTBlock",
+                    "DecoderLayer",
+                ],
+            )
+            model = accelerate.dispatch_model(
+                model, device_map=device_map, offload_buffers=True
+            )
+        else:
+            model.to(device)
+        return model, tokenizer
+    elif gptq_config and gptq_config.wbits < 16:
+        model, tokenizer = load_gptq_quantized(model_path, gptq_config)
+        if num_gpus != 1:
+            device_map = accelerate.infer_auto_device_map(
+                model,
+                max_memory=kwargs["max_memory"],
+                no_split_module_classes=["LlamaDecoderLayer"],
+            )
+            model = accelerate.dispatch_model(
+                model, device_map=device_map, offload_buffers=True
+            )
+        else:
+            model.to(device)
+        return model, tokenizer
+    elif exllama_config:
+        model, tokenizer = load_exllama_model(model_path, exllama_config)
+        return model, tokenizer
+    elif xft_config:
+        model, tokenizer = load_xft_model(model_path, xft_config)
+        return model, tokenizer
+    kwargs["revision"] = revision
+    if dtype is not None:  # Overwrite dtype if it is provided in the arguments.
+        kwargs["torch_dtype"] = dtype
+    if os.environ.get("FASTCHAT_USE_MODELSCOPE", "False").lower() == "true":
+        # download model from ModelScope hub,
+        # lazy import so that modelscope is not required for normal use.
+        try:
+            from modelscope.hub.snapshot_download import snapshot_download
+            if not os.path.exists(model_path):
+                model_path = snapshot_download(model_id=model_path, revision=revision)
+        except ImportError as e:
+            warnings.warn(
+                "Use model from www.modelscope.cn need pip install modelscope"
+            )
+            raise e
+    # Load model
+    model, tokenizer = adapter.load_model(model_path, kwargs)
+    if (
+        device == "cpu"
+        and kwargs["torch_dtype"] is torch.bfloat16
+        and CPU_ISA is not None
+    ):
+        model = ipex.optimize(model, dtype=kwargs["torch_dtype"])
+    if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
+        "mps",
+        "xpu",
+        "npu",
+    ):
+        model.to(device)
+    if device == "xpu":
+        model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)
+    if debug:
+        print(model)
+    return model, tokenizer
+def get_conversation_template(model_path: str) -> Conversation:
+    """Get the default conversation template."""
+    adapter = get_model_adapter(model_path)
+    return adapter.get_default_conv_template(model_path)
+def get_generate_stream_function(model: torch.nn.Module, model_path: str):
+    """Get the generate_stream function for inference."""
+    from fastchat.serve.inference import generate_stream
+    model_type = str(type(model)).lower()
+    is_peft = "peft" in model_type
+    is_chatglm = "chatglm" in model_type
+    is_falcon = "rwforcausallm" in model_type
+    is_codet5p = "codet5p" in model_type
+    is_exllama = "exllama" in model_type
+    is_xft = "xft" in model_type
+    is_yuan = "yuan" in model_type
+    is_cllm = "consistency-llm" in model_path.lower()
+    if is_chatglm:
+        return generate_stream_chatglm
+    elif is_falcon:
+        return generate_stream_falcon
+    elif is_codet5p:
+        return generate_stream_codet5p
+    elif is_exllama:
+        return generate_stream_exllama
+    elif is_xft:
+        return generate_stream_xft
+    elif is_yuan:
+        return generate_stream_yuan2
+    elif is_cllm:
+        return generate_stream_cllm
+    elif peft_share_base_weights and is_peft:
+        # Return a curried stream function that loads the right adapter
+        # according to the model_name available in this context.  This ensures
+        # the right weights are available.
+        @torch.inference_mode()
+        def generate_stream_peft(
+            model,
+            tokenizer,
+            params: Dict,
+            device: str,
+            context_len: int,
+            stream_interval: int = 2,
+            judge_sent_end: bool = False,
+        ):
+            model.set_adapter(model_path)
+            base_model_type = str(type(model.base_model.model))
+            is_chatglm = "chatglm" in base_model_type
+            is_falcon = "rwforcausallm" in base_model_type
+            is_codet5p = "codet5p" in base_model_type
+            is_exllama = "exllama" in base_model_type
+            is_xft = "xft" in base_model_type
+            is_yuan = "yuan" in base_model_type
+            is_cllm = "consistency-llm" in model_path.lower()
+            generate_stream_function = generate_stream
+            if is_chatglm:
+                generate_stream_function = generate_stream_chatglm
+            elif is_falcon:
+                generate_stream_function = generate_stream_falcon
+            elif is_codet5p:
+                generate_stream_function = generate_stream_codet5p
+            elif is_exllama:
+                generate_stream_function = generate_stream_exllama
+            elif is_xft:
+                generate_stream_function = generate_stream_xft
+            elif is_yuan:
+                generate_stream_function = generate_stream_yuan2
+            elif is_cllm:
+                generate_stream_function = generate_stream_cllm
+            for x in generate_stream_function(
+                model,
+                tokenizer,
+                params,
+                device,
+                context_len,
+                stream_interval,
+                judge_sent_end,
+            ):
+                yield x
+        return generate_stream_peft
+    else:
+        return generate_stream
+def add_model_args(parser):
+    parser.add_argument(
+        "--model-path",
+        type=str,
+        default="lmsys/vicuna-7b-v1.5",
+        help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
+    )
+    parser.add_argument(
+        "--revision",
+        type=str,
+        default="main",
+        help="Hugging Face Hub model revision identifier",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        choices=["cpu", "cuda", "mps", "xpu", "npu"],
+        default="cuda",
+        help="The device type",
+    )
+    parser.add_argument(
+        "--gpus",
+        type=str,
+        default=None,
+        help="A single GPU like 1 or multiple GPUs like 0,2",
+    )
+    parser.add_argument("--num-gpus", type=int, default=1)
+    parser.add_argument(
+        "--max-gpu-memory",
+        type=str,
+        help="The maximum memory per GPU for storing model weights. Use a string like '13Gib'",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["float32", "float16", "bfloat16"],
+        help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.",
+        default=None,
+    )
+    parser.add_argument(
+        "--load-8bit", action="store_true", help="Use 8-bit quantization"
+    )
+    parser.add_argument(
+        "--cpu-offloading",
+        action="store_true",
+        help="Only when using 8-bit quantization: Offload excess weights to the CPU that don't fit on the GPU",
+    )
+    parser.add_argument(
+        "--gptq-ckpt",
+        type=str,
+        default=None,
+        help="Used for GPTQ. The path to the local GPTQ checkpoint.",
+    )
+    parser.add_argument(
+        "--gptq-wbits",
+        type=int,
+        default=16,
+        choices=[2, 3, 4, 8, 16],
+        help="Used for GPTQ. #bits to use for quantization",
+    )
+    parser.add_argument(
+        "--gptq-groupsize",
+        type=int,
+        default=-1,
+        help="Used for GPTQ. Groupsize to use for quantization; default uses full row.",
+    )
+    parser.add_argument(
+        "--gptq-act-order",
+        action="store_true",
+        help="Used for GPTQ. Whether to apply the activation order GPTQ heuristic",
+    )
+    parser.add_argument(
+        "--awq-ckpt",
+        type=str,
+        default=None,
+        help="Used for AWQ. Load quantized model. The path to the local AWQ checkpoint.",
+    )
+    parser.add_argument(
+        "--awq-wbits",
+        type=int,
+        default=16,
+        choices=[4, 16],
+        help="Used for AWQ. #bits to use for AWQ quantization",
+    )
+    parser.add_argument(
+        "--awq-groupsize",
+        type=int,
+        default=-1,
+        help="Used for AWQ. Groupsize to use for AWQ quantization; default uses full row.",
+    )
+    parser.add_argument(
+        "--enable-exllama",
+        action="store_true",
+        help="Used for exllamabv2. Enable exllamaV2 inference framework.",
+    )
+    parser.add_argument(
+        "--exllama-max-seq-len",
+        type=int,
+        default=4096,
+        help="Used for exllamabv2. Max sequence length to use for exllamav2 framework; default 4096 sequence length.",
+    )
+    parser.add_argument(
+        "--exllama-gpu-split",
+        type=str,
+        default=None,
+        help="Used for exllamabv2. Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7",
+    )
+    parser.add_argument(
+        "--exllama-cache-8bit",
+        action="store_true",
+        help="Used for exllamabv2. Use 8-bit cache to save VRAM.",
+    )
+    parser.add_argument(
+        "--enable-xft",
+        action="store_true",
+        help="Used for xFasterTransformer Enable xFasterTransformer inference framework.",
+    )
+    parser.add_argument(
+        "--xft-max-seq-len",
+        type=int,
+        default=4096,
+        help="Used for xFasterTransformer. Max sequence length to use for xFasterTransformer framework; default 4096 sequence length.",
+    )
+    parser.add_argument(
+        "--xft-dtype",
+        type=str,
+        choices=["fp16", "bf16", "int8", "bf16_fp16", "bf16_int8"],
+        help="Override the default dtype. If not set, it will use bfloat16 for first token and float16 next tokens on CPU.",
+        default=None,
+    )
+def remove_parent_directory_name(model_path):
+    """Remove parent directory name."""
+    if model_path[-1] == "/":
+        model_path = model_path[:-1]
+    return model_path.split("/")[-1]
+peft_model_cache = {}
+class PeftModelAdapter:
+    """Loads any "peft" model and it's base model."""
+    def match(self, model_path: str):
+        """Accepts any model path with "peft" in the name"""
+        if os.path.exists(os.path.join(model_path, "adapter_config.json")):
+            return True
+        return "peft" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        """Loads the base model then the (peft) adapter weights"""
+        from peft import PeftConfig, PeftModel
+        config = PeftConfig.from_pretrained(model_path)
+        base_model_path = config.base_model_name_or_path
+        if "peft" in base_model_path:
+            raise ValueError(
+                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
+            )
+        # Basic proof of concept for loading peft adapters that share the base
+        # weights.  This is pretty messy because Peft re-writes the underlying
+        # base model and internally stores a map of adapter layers.
+        # So, to make this work we:
+        #  1. Cache the first peft model loaded for a given base models.
+        #  2. Call `load_model` for any follow on Peft models.
+        #  3. Make sure we load the adapters by the model_path.  Why? This is
+        #  what's accessible during inference time.
+        #  4. In get_generate_stream_function, make sure we load the right
+        #  adapter before doing inference.  This *should* be safe when calls
+        #  are blocked the same semaphore.
+        if peft_share_base_weights:
+            if base_model_path in peft_model_cache:
+                model, tokenizer = peft_model_cache[base_model_path]
+                # Super important: make sure we use model_path as the
+                # `adapter_name`.
+                model.load_adapter(model_path, adapter_name=model_path)
+            else:
+                base_adapter = get_model_adapter(base_model_path)
+                base_model, tokenizer = base_adapter.load_model(
+                    base_model_path, from_pretrained_kwargs
+                )
+                # Super important: make sure we use model_path as the
+                # `adapter_name`.
+                model = PeftModel.from_pretrained(
+                    base_model, model_path, adapter_name=model_path
+                )
+                peft_model_cache[base_model_path] = (model, tokenizer)
+            return model, tokenizer
+        # In the normal case, load up the base model weights again.
+        base_adapter = get_model_adapter(base_model_path)
+        base_model, tokenizer = base_adapter.load_model(
+            base_model_path, from_pretrained_kwargs
+        )
+        model = PeftModel.from_pretrained(base_model, model_path)
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        """Uses the conv template of the base model"""
+        from peft import PeftConfig, PeftModel
+        config = PeftConfig.from_pretrained(model_path)
+        if "peft" in config.base_model_name_or_path:
+            raise ValueError(
+                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}"
+            )
+        base_model_path = config.base_model_name_or_path
+        base_adapter = get_model_adapter(base_model_path)
+        return base_adapter.get_default_conv_template(config.base_model_name_or_path)
+class VicunaAdapter(BaseModelAdapter):
+    "Model adapter for Vicuna models (e.g., lmsys/vicuna-7b-v1.5)" ""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "vicuna" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        self.raise_warning_for_old_weights(model)
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        if "v0" in remove_parent_directory_name(model_path):
+            return get_conv_template("one_shot")
+        return get_conv_template("vicuna_v1.1")
+    def raise_warning_for_old_weights(self, model):
+        if isinstance(model, LlamaForCausalLM) and model.model.vocab_size > 32000:
+            warnings.warn(
+                "\nYou are probably using the old Vicuna-v0 model, "
+                "which will generate unexpected results with the "
+                "current fastchat.\nYou can try one of the following methods:\n"
+                "1. Upgrade your weights to the new Vicuna-v1.3: https://github.com/lm-sys/FastChat#vicuna-weights.\n"
+                "2. Use the old conversation template by `python3 -m fastchat.serve.cli --model-path /path/to/vicuna-v0 --conv-template one_shot`\n"
+                "3. Downgrade fschat to fschat==0.1.10 (Not recommended).\n"
+            )
+class AiroborosAdapter(BaseModelAdapter):
+    """The model adapter for jondurbin/airoboros-*"""
+    def match(self, model_path: str):
+        if re.search(r"airoboros|spicyboros", model_path, re.I):
+            return True
+        return False
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        if "-3." in model_path or "-3p" in model_path:
+            return get_conv_template("airoboros_v3")
+        if "spicyboros" in model_path or re.search(r"-(2\.[2-9]+)", model_path):
+            return get_conv_template("airoboros_v2")
+        return get_conv_template("airoboros_v1")
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        if "mpt" not in model_path.lower():
+            return super().load_model(model_path, from_pretrained_kwargs)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            max_seq_len=8192,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, use_fast=True
+        )
+        return model, tokenizer
+class LongChatAdapter(BaseModelAdapter):
+    "Model adapter for LongChat models (e.g., lmsys/longchat-7b-16k)."
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "longchat" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        # Apply monkey patch, TODO(Dacheng): Add flash attention support
+        config = AutoConfig.from_pretrained(model_path, revision=revision)
+        replace_llama_with_condense(config.rope_scaling["factor"])
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("vicuna_v1.1")
+class GoogleT5Adapter(BaseModelAdapter):
+    """The model adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2"""
+    def match(self, model_path: str):
+        return any(
+            model_str in model_path.lower()
+            for model_str in ["flan-", "fastchat-t5", "codet5p"]
+        )
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision)
+        model = AutoModelForSeq2SeqLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+class KoalaAdapter(BaseModelAdapter):
+    """The model adapter for Koala"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "koala" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("koala_v1")
+class AlpacaAdapter(BaseModelAdapter):
+    """The model adapter for Alpaca"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "alpaca" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("alpaca")
+class ChatGLMAdapter(BaseModelAdapter):
+    """The model adapter for THUDM/chatglm-6b, THUDM/chatglm2-6b"""
+    def match(self, model_path: str):
+        return "chatglm" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        if "chatglm3" in model_path.lower():
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_path,
+                encode_special_tokens=True,
+                trust_remote_code=True,
+                revision=revision,
+            )
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_path, trust_remote_code=True, revision=revision
+            )
+        model = AutoModel.from_pretrained(
+            model_path, trust_remote_code=True, **from_pretrained_kwargs
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        model_path = model_path.lower()
+        if "chatglm2" in model_path.lower():
+            return get_conv_template("chatglm2")
+        if "chatglm3" in model_path.lower():
+            return get_conv_template("chatglm3")
+        return get_conv_template("chatglm")
+class CodeGeexAdapter(BaseModelAdapter):
+    """The model adapter for THUDM/codegeex-6b, THUDM/codegeex2-6b"""
+    def match(self, model_path: str):
+        return "codegeex" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        model = AutoModel.from_pretrained(
+            model_path, trust_remote_code=True, **from_pretrained_kwargs
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("codegeex")
+class DollyV2Adapter(BaseModelAdapter):
+    """The model adapter for databricks/dolly-v2-12b"""
+    def match(self, model_path: str):
+        return "dolly-v2" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        # 50277 means "### End"
+        tokenizer.eos_token_id = 50277
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("dolly_v2")
+class OasstPythiaAdapter(BaseModelAdapter):
+    """The model adapter for OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"""
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        return "oasst" in model_path and "pythia" in model_path
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("oasst_pythia")
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+class OasstLLaMAAdapter(BaseModelAdapter):
+    """The model adapter for OpenAssistant/oasst-sft-7-llama-30b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        if "openassistant-sft-7-llama-30b-hf" in model_path:
+            return True
+        return "oasst" in model_path and "pythia" not in model_path
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("oasst_llama")
+class OpenChat35Adapter(BaseModelAdapter):
+    """The model adapter for OpenChat 3.5 (e.g. openchat/openchat_3.5)"""
+    def match(self, model_path: str):
+        if "openchat" in model_path.lower() and "3.5" in model_path.lower():
+            return True
+        elif "starling-lm" in model_path.lower():
+            return True
+        return False
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("openchat_3.5")
+class TenyxChatAdapter(BaseModelAdapter):
+    """The model adapter for TenyxChat (e.g. tenyx/TenyxChat-7B-v1)"""
+    def match(self, model_path: str):
+        return "tenyxchat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("tenyxchat")
+class PythiaAdapter(BaseModelAdapter):
+    """The model adapter for any EleutherAI/pythia model"""
+    def match(self, model_path: str):
+        return "pythia" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+class StableLMAdapter(BaseModelAdapter):
+    """The model adapter for StabilityAI/stablelm-tuned-alpha-7b"""
+    def match(self, model_path: str):
+        return "stablelm" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("stablelm")
+class MPTAdapter(BaseModelAdapter):
+    """The model adapter for MPT series (mosaicml/mpt-7b-chat, mosaicml/mpt-30b-chat)"""
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        return "mpt" in model_path and not "airoboros" in model_path
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            max_seq_len=8192,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        model_path = model_path.lower()
+        if "mpt-7b-chat" in model_path:
+            return get_conv_template("mpt-7b-chat")
+        elif "mpt-30b-chat" in model_path:
+            return get_conv_template("mpt-30b-chat")
+        elif "mpt-30b-instruct" in model_path:
+            return get_conv_template("mpt-30b-instruct")
+        else:
+            print(
+                "Warning: Loading base MPT model with `zero_shot` conversation configuration.  "
+                "If this is not desired, inspect model configurations and names."
+            )
+            return get_conv_template("zero_shot")
+class BaizeAdapter(BaseModelAdapter):
+    """The model adapter for project-baize/baize-v2-7b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "baize" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("baize")
+class RwkvAdapter(BaseModelAdapter):
+    """The model adapter for BlinkDL/RWKV-4-Raven"""
+    def match(self, model_path: str):
+        return "rwkv-4" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        from fastchat.model.rwkv_model import RwkvModel
+        model = RwkvModel(model_path)
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            "EleutherAI/pythia-160m", revision=revision
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("rwkv")
+class OpenBuddyAdapter(BaseModelAdapter):
+    """The model adapter for OpenBuddy/openbuddy-7b-v1.1-bf16-enc"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "openbuddy" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("openbuddy")
+class PhoenixAdapter(BaseModelAdapter):
+    """The model adapter for FreedomIntelligence/phoenix-inst-chat-7b"""
+    def match(self, model_path: str):
+        return "phoenix" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("phoenix")
+class ReaLMAdapter(BaseModelAdapter):
+    """The model adapter for FreedomIntelligence/ReaLM-7b"""
+    def match(self, model_path: str):
+        return "ReaLM" in model_path
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("ReaLM-7b-v1")
+class ChatGPTAdapter(BaseModelAdapter):
+    """The model adapter for ChatGPT"""
+    def match(self, model_path: str):
+        return model_path in OPENAI_MODEL_LIST
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        if "browsing" in model_path:
+            return get_conv_template("api_based_default")
+        if "gpt-4-turbo-2024-04-09" in model_path:
+            return get_conv_template("gpt-4-turbo-2024-04-09")
+        return get_conv_template("chatgpt")
+class AzureOpenAIAdapter(BaseModelAdapter):
+    """The model adapter for Azure OpenAI"""
+    def match(self, model_path: str):
+        return model_path in ("azure-gpt-35-turbo", "azure-gpt-4")
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("chatgpt")
+class PplxAIAdapter(BaseModelAdapter):
+    """The model adapter for Perplexity AI"""
+    def match(self, model_path: str):
+        return model_path in (
+            "pplx-7b-online",
+            "pplx-70b-online",
+        )
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("pplxai")
+class ClaudeAdapter(BaseModelAdapter):
+    """The model adapter for Claude"""
+    def match(self, model_path: str):
+        return model_path in ANTHROPIC_MODEL_LIST
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        if "claude-3-haiku" in model_path:
+            return get_conv_template("claude-3-haiku-20240307")
+        if "claude-3-sonnet" in model_path:
+            return get_conv_template("claude-3-sonnet-20240229")
+        if "claude-3-opus" in model_path:
+            return get_conv_template("claude-3-opus-20240229")
+        return get_conv_template("claude")
+class BardAdapter(BaseModelAdapter):
+    """The model adapter for Bard"""
+    def match(self, model_path: str):
+        return model_path == "bard"
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("bard")
+class PaLM2Adapter(BaseModelAdapter):
+    """The model adapter for PaLM2"""
+    def match(self, model_path: str):
+        return model_path == "palm-2"
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("bard")
+class GeminiAdapter(BaseModelAdapter):
+    """The model adapter for Gemini"""
+    def match(self, model_path: str):
+        return "gemini" in model_path.lower() or "bard" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("gemini")
+class GeminiDevAdapter(BaseModelAdapter):
+    """The model adapter for Gemini 1.5 Pro"""
+    def match(self, model_path: str):
+        return "gemini-1.5-pro" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("gemini-dev")
+class BiLLaAdapter(BaseModelAdapter):
+    """The model adapter for Neutralzz/BiLLa-7B-SFT"""
+    def match(self, model_path: str):
+        return "billa" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("billa")
+class RedPajamaINCITEAdapter(BaseModelAdapter):
+    """The model adapter for togethercomputer/RedPajama-INCITE-7B-Chat"""
+    def match(self, model_path: str):
+        return "redpajama-incite" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("redpajama-incite")
+class H2OGPTAdapter(BaseModelAdapter):
+    """The model adapter for h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "h2ogpt" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("h2ogpt")
+class RobinAdapter(BaseModelAdapter):
+    """The model adapter for LMFlow/Full-Robin-7b-v2"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "robin" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("Robin")
+class SnoozyAdapter(BaseModelAdapter):
+    """The model adapter for nomic-ai/gpt4all-13b-snoozy"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        return "gpt4all" in model_path and "snoozy" in model_path
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("snoozy")
+class WizardLMAdapter(BaseModelAdapter):
+    """The model adapter for WizardLM/WizardLM-13B-V1.0"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "wizardlm" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        model_path = model_path.lower()
+        if "13b" in model_path or "30b" in model_path or "70b" in model_path:
+            return get_conv_template("vicuna_v1.1")
+        else:
+            # TODO: use the recommended template for 7B
+            # (https://huggingface.co/WizardLM/WizardLM-13B-V1.0)
+            return get_conv_template("one_shot")
+class ManticoreAdapter(BaseModelAdapter):
+    """The model adapter for openaccess-ai-collective/manticore-13b-chat-pyg"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "manticore" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("manticore")
+class GuanacoAdapter(BaseModelAdapter):
+    """The model adapter for timdettmers/guanaco-33b-merged"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "guanaco" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        # Fix a bug in tokenizer config
+        tokenizer.eos_token_id = model.config.eos_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("zero_shot")
+class ChangGPTAdapter(BaseModelAdapter):
+    """The model adapter for lcw99/polyglot-ko-12.8b-chang-instruct-chat"""
+    def match(self, model_path: str):
+        model_path = model_path.lower()
+        return "polyglot" in model_path and "chang" in model_path
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("polyglot_changgpt")
+class CamelAdapter(BaseModelAdapter):
+    """The model adapter for camel-ai/CAMEL-13B-Combined-Data"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "camel" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("vicuna_v1.1")
+class TuluAdapter(BaseModelAdapter):
+    """The model adapter for allenai/tulu-30b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "tulu" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("tulu")
+class FalconAdapter(BaseModelAdapter):
+    """The model adapter for tiiuae/falcon-40b"""
+    def match(self, model_path: str):
+        return "falcon" in model_path.lower() and "chat" not in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        # Strongly suggest using bf16, which is recommended by the author of Falcon
+        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        # In Falcon tokenizer config and special config there is not any pad token
+        # Setting `pad_token_id` to 9, which corresponds to special token '>>SUFFIX<<'
+        tokenizer.pad_token_id = 9
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("falcon")
+class FalconChatAdapter(BaseModelAdapter):
+    def match(self, model_path: str):
+        return "falcon" in model_path.lower() and "chat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("falcon-chat")
+class TigerBotAdapter(BaseModelAdapter):
+    """The model adapter for TigerResearch/tigerbot-7b-sft"""
+    def match(self, model_path: str):
+        return "tigerbot" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("tigerbot")
+class BaichuanAdapter(BaseModelAdapter):
+    """The model adapter for Baichuan models (e.g., baichuan-inc/Baichuan-7B)"""
+    def match(self, model_path: str):
+        return "baichuan" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        # for Baichuan-13B-Chat
+        if "chat" in model_path.lower():
+            if "baichuan2" in model_path.lower():
+                return get_conv_template("baichuan2-chat")
+            return get_conv_template("baichuan-chat")
+        return get_conv_template("zero_shot")
+class XGenAdapter(BaseModelAdapter):
+    """The model adapter for Salesforce/xgen-7b"""
+    def match(self, model_path: str):
+        return "xgen" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        model.config.eos_token_id = 50256
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("xgen")
+class NousHermesAdapter(BaseModelAdapter):
+    """The model adapter for NousResearch/Nous-Hermes-13b"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "nous-hermes" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("alpaca")
+class InternLMChatAdapter(BaseModelAdapter):
+    """The model adapter for internlm/internlm-chat-7b"""
+    def match(self, model_path: str):
+        return "internlm" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        model = model.eval()
+        if "8k" in model_path.lower():
+            model.config.max_sequence_length = 8192
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("internlm-chat")
+class StarChatAdapter(BaseModelAdapter):
+    """The model adapter for HuggingFaceH4/starchat-beta"""
+    def match(self, model_path: str):
+        return "starchat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("starchat")
+class MistralAdapter(BaseModelAdapter):
+    """The model adapter for Mistral AI models"""
+    def match(self, model_path: str):
+        return "mistral" in model_path.lower() or "mixtral" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("mistral")
+class Llama2Adapter(BaseModelAdapter):
+    """The model adapter for Llama-2 (e.g., meta-llama/Llama-2-7b-hf)"""
+    def match(self, model_path: str):
+        return "llama-2" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("llama-2")
+class Llama3Adapter(BaseModelAdapter):
+    """The model adapter for Llama-3 (e.g., meta-llama/Meta-Llama-3-8B-Instruct)"""
+    def match(self, model_path: str):
+        return "llama-3" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("llama-3")
+class CuteGPTAdapter(BaseModelAdapter):
+    """The model adapter for CuteGPT"""
+    def match(self, model_path: str):
+        return "cutegpt" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        tokenizer = LlamaTokenizer.from_pretrained(model_path)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
+        )
+        tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<end>")
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.eos_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("cutegpt")
+class OpenOrcaAdapter(BaseModelAdapter):
+    """Model adapter for Open-Orca models which may use different prompt templates
+    - (e.g. Open-Orca/OpenOrcaxOpenChat-Preview2-13B, Open-Orca/Mistral-7B-OpenOrca)
+    - `OpenOrcaxOpenChat-Preview2-13B` uses their "OpenChat Llama2 V1" prompt template.
+        - [Open-Orca/OpenOrcaxOpenChat-Preview2-13B #Prompt Template](https://huggingface.co/Open-Orca/OpenOrcaxOpenChat-Preview2-13B#prompt-template)
+    - `Mistral-7B-OpenOrca` uses the [OpenAI's Chat Markup Language (ChatML)](https://github.com/openai/openai-python/blob/main/chatml.md)
+        format, with <|im_start|> and <|im_end|> tokens added to support this.
+        - [Open-Orca/Mistral-7B-OpenOrca #Prompt Template](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template)
+    """
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return (
+            "mistral-7b-openorca" in model_path.lower()
+            or "openorca" in model_path.lower()
+        )
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        if "mistral-7b-openorca" in model_path.lower():
+            return get_conv_template("mistral-7b-openorca")
+        return get_conv_template("open-orca")
+class DolphinAdapter(OpenOrcaAdapter):
+    """Model adapter for ehartford/dolphin-2.2.1-mistral-7b"""
+    def match(self, model_path: str):
+        return "dolphin" in model_path.lower() and "mistral" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("dolphin-2.2.1-mistral-7b")
+class Hermes2Adapter(BaseModelAdapter):
+    """Model adapter for teknium/OpenHermes-2.5-Mistral-7B and teknium/OpenHermes-2-Mistral-7B models"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return any(
+            model_str in model_path.lower()
+            for model_str in ["openhermes-2.5-mistral-7b", "openhermes-2-mistral-7b"]
+        )
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, use_fast=self.use_fast_tokenizer, revision=revision
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("OpenHermes-2.5-Mistral-7B")
+class NousHermes2MixtralAdapter(BaseModelAdapter):
+    """Model adapter for NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO model"""
+    def match(self, model_path: str):
+        return any(
+            model_str in model_path.lower()
+            for model_str in [
+                "nous-hermes-2-mixtral-8x7b-dpo",
+                "nous-hermes-2-mixtral-8x7b-sft",
+            ]
+        )
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("Nous-Hermes-2-Mixtral-8x7B-DPO")
+class WizardCoderAdapter(BaseModelAdapter):
+    """The model adapter for WizardCoder (e.g., WizardLM/WizardCoder-Python-34B-V1.0)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "wizardcoder" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        # Same as Alpaca, see :
+        # https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/inference_wizardcoder.py#L60
+        return get_conv_template("alpaca")
+class QwenChatAdapter(BaseModelAdapter):
+    """The model adapter for Qwen/Qwen-7B-Chat
+    To run this model, you need to ensure additional flash attention installation:
+    ``` bash
+    git clone https://github.com/Dao-AILab/flash-attention
+    cd flash-attention && pip install .
+    pip install csrc/layer_norm
+    pip install csrc/rotary
+    ```
+    Since from 2.0, the following change happened
+    - `flash_attn_unpadded_func` -> `flash_attn_varlen_func`
+    - `flash_attn_unpadded_qkvpacked_func` -> `flash_attn_varlen_qkvpacked_func`
+    - `flash_attn_unpadded_kvpacked_func` -> `flash_attn_varlen_kvpacked_func`
+    You may need to revise the code in: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py#L69
+    to from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func
+    """
+    def match(self, model_path: str):
+        return "qwen" in model_path.lower()
+    def float_set(self, config, option):
+        config.bf16 = False
+        config.fp16 = False
+        config.fp32 = False
+        if option == "bf16":
+            config.bf16 = True
+        elif option == "fp16":
+            config.fp16 = True
+        elif option == "fp32":
+            config.fp32 = True
+        else:
+            print("Invalid option. Please choose one from 'bf16', 'fp16' and 'fp32'.")
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        from transformers.generation import GenerationConfig
+        revision = from_pretrained_kwargs.get("revision", "main")
+        config = AutoConfig.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+        )
+        # NOTE: if you use the old version of model file, please remove the comments below
+        # config.use_flash_attn = False
+        self.float_set(config, "fp16")
+        generation_config = GenerationConfig.from_pretrained(
+            model_path, trust_remote_code=True
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            config=config,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk:
+            model.config.max_sequence_length = 16384
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        tokenizer.eos_token_id = config.eos_token_id
+        tokenizer.bos_token_id = config.bos_token_id
+        tokenizer.pad_token_id = generation_config.pad_token_id
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.bos_token_id = tokenizer.bos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("qwen-7b-chat")
+class SmaugChatAdapter(BaseModelAdapter):
+    """The model adapter for abacusai/Smaug-2-72B."""
+    def match(self, model_path: str):
+        return "smaug" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("qwen-7b-chat")
+class BGEAdapter(BaseModelAdapter):
+    """The model adapter for BGE (e.g., BAAI/bge-large-en-v1.5)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "bge" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModel.from_pretrained(
+            model_path,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        if hasattr(model.config, "max_position_embeddings") and hasattr(
+            tokenizer, "model_max_length"
+        ):
+            model.config.max_sequence_length = min(
+                model.config.max_position_embeddings, tokenizer.model_max_length
+            )
+        model.use_cls_pooling = True
+        model.eval()
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("one_shot")
+class E5Adapter(BaseModelAdapter):
+    """The model adapter for E5 (e.g., intfloat/e5-large-v2)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "e5-" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModel.from_pretrained(
+            model_path,
+            **from_pretrained_kwargs,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        if hasattr(model.config, "max_position_embeddings") and hasattr(
+            tokenizer, "model_max_length"
+        ):
+            model.config.max_sequence_length = min(
+                model.config.max_position_embeddings, tokenizer.model_max_length
+            )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("one_shot")
+class AquilaChatAdapter(BaseModelAdapter):
+    """The model adapter for BAAI/Aquila
+    Now supports:
+    - BAAI/AquilaChat-7B
+    - BAAI/AquilaChat2-7B
+    - BAAI/AquilaChat2-34B
+    """
+    def match(self, model_path: str):
+        return "aquila" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        model = model.eval()
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=True, revision=revision
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        model_path = model_path.lower()
+        # See: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L347
+        if "aquilachat2" in model_path:
+            if "16k" in model_path:
+                return get_conv_template("aquila")
+            elif "34b" in model_path:
+                return get_conv_template("aquila-legacy")
+            else:
+                return get_conv_template("aquila-v1")
+        else:
+            return get_conv_template("aquila-chat")
+class Lamma2ChineseAdapter(BaseModelAdapter):
+    """The model adapter for FlagAlpha/LLama2-Chinese sft"""
+    def match(self, model_path: str):
+        return "llama2-chinese" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("llama2-chinese")
+class Lamma2ChineseAlpacaAdapter(BaseModelAdapter):
+    """The model adapter for ymcui/Chinese-LLaMA-Alpaca sft"""
+    def match(self, model_path: str):
+        return "chinese-alpaca" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("chinese-alpaca2")
+class VigogneAdapter(BaseModelAdapter):
+    """The model adapter for vigogne (e.g., bofenghuang/vigogne-2-7b-chat)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return bool(re.search(r"vigogne|vigostral", model_path, re.I))
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            use_fast=self.use_fast_tokenizer,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        if "chat" in model_path.lower():
+            if "vigostral" in model_path.lower():
+                return get_conv_template("vigogne_chat_v3")
+            return get_conv_template("vigogne_chat_v2")
+        return get_conv_template("vigogne_instruct")
+class OpenLLaMaOpenInstructAdapter(BaseModelAdapter):
+    """The model adapter for OpenLLaMa-Open-Instruct (e.g., VMware/open-llama-7b-open-instruct)"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return (
+            "open-llama" in model_path.lower() and "open-instruct" in model_path.lower()
+        )
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            use_fast=self.use_fast_tokenizer,
+            trust_remote_code=True,
+            revision=revision,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True,
+            **from_pretrained_kwargs,
+        ).eval()
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("alpaca")
+class CodeLlamaAdapter(BaseModelAdapter):
+    """The model adapter for CodeLlama (e.g., codellama/CodeLlama-34b-hf)"""
+    def match(self, model_path: str):
+        return "codellama" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("llama-2")
+class StableVicunaAdapter(BaseModelAdapter):
+    """The model adapter for StableVicuna"""
+    def match(self, model_path: str):
+        return "stable-vicuna" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        model.config.eos_token_id = tokenizer.eos_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("stable-vicuna")
+class PhindCodeLlamaAdapter(CodeLlamaAdapter):
+    """The model adapter for Phind-CodeLlama (e.g., Phind/Phind-CodeLlama-34B-v2)"""
+    def match(self, model_path: str):
+        return "phind-codellama-" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("phind")
+class Llama2ChangAdapter(Llama2Adapter):
+    """The model adapter for Llama2-ko-chang (e.g., lcw99/llama2-ko-chang-instruct-chat)"""
+    def match(self, model_path: str):
+        return "llama2-ko-chang" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("polyglot_changgpt")
+class ZephyrAdapter(BaseModelAdapter):
+    """The model adapter for Zephyr (e.g. HuggingFaceH4/zephyr-7b-alpha)"""
+    def match(self, model_path: str):
+        return "zephyr" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("zephyr")
+class NotusAdapter(BaseModelAdapter):
+    """The model adapter for Notus (e.g. argilla/notus-7b-v1)"""
+    def match(self, model_path: str):
+        return "notus" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("zephyr")
+class CatPPTAdapter(BaseModelAdapter):
+    """The model adapter for CatPPT (e.g. rishiraj/CatPPT)"""
+    def match(self, model_path: str):
+        return "catppt" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("catppt")
+class TinyLlamaAdapter(BaseModelAdapter):
+    """The model adapter for TinyLlama (e.g. TinyLlama/TinyLlama-1.1B-Chat-v1.0)"""
+    def match(self, model_path: str):
+        return "tinyllama" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("TinyLlama")
+class XwinLMAdapter(BaseModelAdapter):
+    """The model adapter for Xwin-LM V0.1 and V0.2 series of models(e.g., Xwin-LM/Xwin-LM-70B-V0.1)"""
+    # use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "xwin-lm" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("vicuna_v1.1")
+class LemurAdapter(BaseModelAdapter):
+    """The model adapter for OpenLemur/lemur-70b-chat-v1"""
+    use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return "lemur-70b-chat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("lemur-70b-chat")
+class PygmalionAdapter(BaseModelAdapter):
+    """The model adapter for Pygmalion/Metharme series of models(e.g., PygmalionAI/mythalion-13b)"""
+    # use_fast_tokenizer = False
+    def match(self, model_path: str):
+        return bool(
+            re.search(r"pygmalion|mythalion|metharme", model_path.lower(), re.I)
+        )
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("metharme")
+class XdanAdapter(BaseModelAdapter):
+    """The model adapter for xDAN-AI (e.g. xDAN-AI/xDAN-L1-Chat-RL-v1)"""
+    def match(self, model_path: str):
+        return "xdan" in model_path.lower() and "v1" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("xdan-v1")
+class MicrosoftOrcaAdapter(BaseModelAdapter):
+    """The model adapter for Microsoft/Orca-2 series of models (e.g. Microsoft/Orca-2-7b, Microsoft/Orca-2-13b)"""
+    use_fast_tokenizer = False  # Flag neeeded since tokenizers>=0.13.3 is required for a normal functioning of this module
+    def match(self, model_path: str):
+        return "orca-2" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("orca-2")
+class YiAdapter(BaseModelAdapter):
+    """The model adapter for Yi models"""
+    def match(self, model_path: str):
+        return "yi-" in model_path.lower() and "chat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("Yi-34b-chat")
+class DeepseekCoderAdapter(BaseModelAdapter):
+    """The model adapter for deepseek-ai's coder models"""
+    def match(self, model_path: str):
+        return "deepseek-coder" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("deepseek-coder")
+class DeepseekChatAdapter(BaseModelAdapter):
+    """The model adapter for deepseek-ai's chat models"""
+    # Note: that this model will require tokenizer version >= 0.13.3 because the tokenizer class is LlamaTokenizerFast
+    def match(self, model_path: str):
+        return "deepseek-llm" in model_path.lower() and "chat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("deepseek-chat")
+class Yuan2Adapter(BaseModelAdapter):
+    """The model adapter for Yuan2.0"""
+    def match(self, model_path: str):
+        return "yuan2" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        revision = from_pretrained_kwargs.get("revision", "main")
+        # from_pretrained_kwargs["torch_dtype"] = torch.bfloat16
+        tokenizer = LlamaTokenizer.from_pretrained(
+            model_path,
+            add_eos_token=False,
+            add_bos_token=False,
+            eos_token="<eod>",
+            eod_token="<eod>",
+            sep_token="<sep>",
+            revision=revision,
+        )
+        tokenizer.add_tokens(
+            [
+                "<sep>",
+                "<pad>",
+                "<mask>",
+                "<predict>",
+                "<FIM_SUFFIX>",
+                "<FIM_PREFIX>",
+                "<FIM_MIDDLE>",
+                "<commit_before>",
+                "<commit_msg>",
+                "<commit_after>",
+                "<jupyter_start>",
+                "<jupyter_text>",
+                "<jupyter_code>",
+                "<jupyter_output>",
+                "<empty_output>",
+            ],
+            special_tokens=True,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            # device_map='auto',
+            trust_remote_code=True,
+            **from_pretrained_kwargs,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("yuan2")
+class MetaMathAdapter(BaseModelAdapter):
+    """The model adapter for MetaMath models"""
+    def match(self, model_path: str):
+        return "metamath" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("metamath")
+class BagelAdapter(BaseModelAdapter):
+    """Model adapter for jondurbin/bagel-* models"""
+    def match(self, model_path: str):
+        return "bagel" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("airoboros_v3")
+class SolarAdapter(BaseModelAdapter):
+    """The model adapter for upstage/SOLAR-10.7B-Instruct-v1.0"""
+    def match(self, model_path: str):
+        return "solar-" in model_path.lower() and "instruct" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("solar")
+class SteerLMAdapter(BaseModelAdapter):
+    """The model adapter for nvidia/Llama2-70B-SteerLM-Chat"""
+    def match(self, model_path: str):
+        return "steerlm-chat" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("steerlm")
+class GemmaAdapter(BaseModelAdapter):
+    """The model adapter for google/gemma"""
+    def match(self, model_path: str):
+        return "gemma" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("gemma")
+class LlavaAdapter(BaseModelAdapter):
+    """The model adapter for liuhaotian/llava-v1.5 series of models"""
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        # TODO(chris): Implement huggingface-compatible load_model
+        pass
+    def match(self, model_path: str):
+        return "llava" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        model_path = model_path.lower()
+        if "34b" in model_path:
+            return get_conv_template("llava-chatml")
+        return get_conv_template("vicuna_v1.1")
+class YuanAdapter(BaseModelAdapter):
+    """The model adapter for Yuan"""
+    def match(self, model_path: str):
+        return "yuan" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        model, tokenizer = super().load_model(model_path, from_pretrained_kwargs)
+        tokenizer.add_tokens(
+            [
+                "<sep>",
+                "<pad>",
+                "<mask>",
+                "<predict>",
+                "<FIM_SUFFIX>",
+                "<FIM_PREFIX>",
+                "<FIM_MIDDLE>",
+                "<commit_before>",
+                "<commit_msg>",
+                "<commit_after>",
+                "<jupyter_start>",
+                "<jupyter_text>",
+                "<jupyter_code>",
+                "<jupyter_output>",
+                "<empty_output>",
+            ],
+            special_tokens=True,
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("yuan")
+class OlmoAdapter(BaseModelAdapter):
+    """The model adapter for allenai/OLMo-7B-Instruct"""
+    def match(self, model_path: str):
+        return "olmo" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("api_based_default")
+class YandexGPTAdapter(BaseModelAdapter):
+    """The model adapter for YandexGPT"""
+    def match(self, model_path: str):
+        return "yandexgpt" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("yandexgpt")
+class CllmAdapter(BaseModelAdapter):
+    """The model adapter for CLLM"""
+    def match(self, model_path: str):
+        return "consistency-llm" in model_path.lower()
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        config = AutoConfig.from_pretrained(
+            model_path,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            model_max_length=2048,
+            padding_side="right",
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path,
+            config=config,
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
+            device_map="cuda",
+        )
+        return model, tokenizer
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("cllm")
+class CohereAdapter(BaseModelAdapter):
+    """The model adapter for Cohere"""
+    def match(self, model_path: str):
+        return model_path in ["command-r"]
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("api_based_default")
+class DBRXAdapter(BaseModelAdapter):
+    """The model adapter for Cohere"""
+    def match(self, model_path: str):
+        return model_path in ["dbrx-instruct"]
+    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
+        raise NotImplementedError()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("api_based_default")
+class RekaAdapter(BaseModelAdapter):
+    """The model adapter for Reka"""
+    def match(self, model_path: str):
+        return "reka" in model_path.lower()
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("api_based_default")
+# Adapter registration table.
+# Note: the registration order matters.
+# The one registered earlier has a higher matching priority, so an adapter
+# whose match() tests a longer, more specific substring has to be listed
+# before any adapter that tests a shorter substring of it.
+register_model_adapter(PeftModelAdapter)
+register_model_adapter(StableVicunaAdapter)
+register_model_adapter(VicunaAdapter)
+register_model_adapter(AiroborosAdapter)
+register_model_adapter(LongChatAdapter)
+register_model_adapter(GoogleT5Adapter)
+register_model_adapter(KoalaAdapter)
+register_model_adapter(AlpacaAdapter)
+register_model_adapter(ChatGLMAdapter)
+register_model_adapter(CodeGeexAdapter)
+register_model_adapter(DollyV2Adapter)
+register_model_adapter(OasstPythiaAdapter)
+register_model_adapter(OasstLLaMAAdapter)
+register_model_adapter(OpenChat35Adapter)
+register_model_adapter(TenyxChatAdapter)
+register_model_adapter(StableLMAdapter)
+register_model_adapter(BaizeAdapter)
+register_model_adapter(RwkvAdapter)
+register_model_adapter(OpenBuddyAdapter)
+register_model_adapter(PhoenixAdapter)
+register_model_adapter(BardAdapter)
+register_model_adapter(PaLM2Adapter)
+register_model_adapter(GeminiAdapter)
+register_model_adapter(GeminiDevAdapter)
+register_model_adapter(GemmaAdapter)
+register_model_adapter(ChatGPTAdapter)
+register_model_adapter(AzureOpenAIAdapter)
+register_model_adapter(ClaudeAdapter)
+register_model_adapter(MPTAdapter)
+register_model_adapter(BiLLaAdapter)
+register_model_adapter(RedPajamaINCITEAdapter)
+register_model_adapter(H2OGPTAdapter)
+register_model_adapter(RobinAdapter)
+register_model_adapter(SnoozyAdapter)
+register_model_adapter(WizardLMAdapter)
+register_model_adapter(ManticoreAdapter)
+register_model_adapter(GuanacoAdapter)
+register_model_adapter(CamelAdapter)
+register_model_adapter(ChangGPTAdapter)
+register_model_adapter(TuluAdapter)
+register_model_adapter(FalconChatAdapter)
+register_model_adapter(FalconAdapter)
+register_model_adapter(TigerBotAdapter)
+register_model_adapter(BaichuanAdapter)
+register_model_adapter(XGenAdapter)
+register_model_adapter(PythiaAdapter)
+register_model_adapter(InternLMChatAdapter)
+register_model_adapter(StarChatAdapter)
+register_model_adapter(Llama2Adapter)
+register_model_adapter(Llama3Adapter)
+register_model_adapter(CuteGPTAdapter)
+register_model_adapter(OpenOrcaAdapter)
+register_model_adapter(DolphinAdapter)
+register_model_adapter(Hermes2Adapter)
+register_model_adapter(NousHermes2MixtralAdapter)
+register_model_adapter(NousHermesAdapter)
+register_model_adapter(MistralAdapter)
+register_model_adapter(WizardCoderAdapter)
+register_model_adapter(QwenChatAdapter)
+register_model_adapter(AquilaChatAdapter)
+register_model_adapter(BGEAdapter)
+register_model_adapter(E5Adapter)
+register_model_adapter(Lamma2ChineseAdapter)
+register_model_adapter(Lamma2ChineseAlpacaAdapter)
+register_model_adapter(VigogneAdapter)
+register_model_adapter(OpenLLaMaOpenInstructAdapter)
+register_model_adapter(ReaLMAdapter)
+register_model_adapter(PhindCodeLlamaAdapter)
+register_model_adapter(CodeLlamaAdapter)
+register_model_adapter(Llama2ChangAdapter)
+register_model_adapter(ZephyrAdapter)
+register_model_adapter(NotusAdapter)
+register_model_adapter(CatPPTAdapter)
+register_model_adapter(TinyLlamaAdapter)
+register_model_adapter(XwinLMAdapter)
+register_model_adapter(LemurAdapter)
+register_model_adapter(PygmalionAdapter)
+register_model_adapter(MicrosoftOrcaAdapter)
+register_model_adapter(XdanAdapter)
+register_model_adapter(YiAdapter)
+register_model_adapter(PplxAIAdapter)
+register_model_adapter(DeepseekCoderAdapter)
+register_model_adapter(DeepseekChatAdapter)
+# Yuan2Adapter matches "yuan2" and YuanAdapter matches "yuan"; Yuan2Adapter is
+# registered first so that Yuan2 paths are not claimed by YuanAdapter.
+register_model_adapter(Yuan2Adapter)
+register_model_adapter(MetaMathAdapter)
+register_model_adapter(BagelAdapter)
+register_model_adapter(SolarAdapter)
+register_model_adapter(SteerLMAdapter)
+register_model_adapter(LlavaAdapter)
+register_model_adapter(YuanAdapter)
+register_model_adapter(OlmoAdapter)
+register_model_adapter(CohereAdapter)
+register_model_adapter(DBRXAdapter)
+# NOTE(review): GemmaAdapter is already registered earlier in this list, so
+# this second registration looks redundant; confirm before removing.
+register_model_adapter(GemmaAdapter)
+register_model_adapter(YandexGPTAdapter)
+register_model_adapter(CllmAdapter)
+register_model_adapter(RekaAdapter)
+register_model_adapter(SmaugChatAdapter)
+# After all adapters, try the default base adapter.
+register_model_adapter(BaseModelAdapter)

src/model/model_chatglm.py ADDED Viewed

	@@ -0,0 +1,137 @@

+"""
+Inference code for ChatGLM.
+Adapted from https://huggingface.co/THUDM/chatglm-6b/blob/main/modeling_chatglm.py.
+"""
+import re
+import torch
+from transformers.generation.logits_process import LogitsProcessor
+class InvalidScoreLogitsProcessor(LogitsProcessor):
+    def __call__(
+        self, input_ids: torch.LongTensor, scores: torch.FloatTensor
+    ) -> torch.FloatTensor:
+        if torch.isnan(scores).any() or torch.isinf(scores).any():
+            scores.zero_()
+            scores[..., 5] = 5e4
+        return scores
+invalid_score_processor = InvalidScoreLogitsProcessor()
+def process_response(response):
+    response = response.strip()
+    response = response.replace("[[训练时间]]", "2023年")
+    punkts = [
+        [",", "，"],
+        ["!", "！"],
+        [":", "："],
+        [";", "；"],
+        ["\?", "？"],
+    ]
+    for item in punkts:
+        response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response)
+        response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response)
+    return response
+def recover_message_list(prompt):
+    role_token_pattern = "|".join(
+        [re.escape(r) for r in ["<|system|>", "<|user|>", "<|assistant|>"]]
+    )
+    role = None
+    last_end_idx = -1
+    message_list = []
+    for match in re.finditer(role_token_pattern, prompt):
+        if role:
+            messge = {}
+            if role == "<|system|>":
+                messge["role"] = "system"
+            elif role == "<|user|>":
+                messge["role"] = "user"
+            else:
+                messge["role"] = "assistant"
+            messge["content"] = prompt[last_end_idx + 1 : match.start()]
+            message_list.append(messge)
+        role = prompt[match.start() : match.end()]
+        last_end_idx = match.end()
+    return message_list
+@torch.inference_mode()
+def generate_stream_chatglm(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=2048,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    prompt = params["prompt"]
+    temperature = float(params.get("temperature", 1.0))
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    top_p = float(params.get("top_p", 1.0))
+    max_new_tokens = int(params.get("max_new_tokens", 256))
+    echo = params.get("echo", True)
+    model_type = str(type(model)).lower()
+    if "peft" in model_type:
+        model_type = str(type(model.base_model.model)).lower()
+    if "chatglm3" in model_type:
+        message_list = recover_message_list(prompt)
+        inputs = tokenizer.build_chat_input(
+            query=message_list[-1]["content"], history=message_list[:-1], role="user"
+        ).to(model.device)
+    else:
+        inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
+    input_echo_len = len(inputs["input_ids"][0])
+    gen_kwargs = {
+        "max_length": max_new_tokens + input_echo_len,
+        "do_sample": True if temperature > 1e-5 else False,
+        "top_p": top_p,
+        "repetition_penalty": repetition_penalty,
+        "logits_processor": [invalid_score_processor],
+    }
+    if temperature > 1e-5:
+        gen_kwargs["temperature"] = temperature
+    total_len = 0
+    for total_ids in model.stream_generate(**inputs, **gen_kwargs):
+        total_ids = total_ids.tolist()[0]
+        total_len = len(total_ids)
+        if echo:
+            output_ids = total_ids
+        else:
+            output_ids = total_ids[input_echo_len:]
+        response = tokenizer.decode(output_ids)
+        response = process_response(response)
+        yield {
+            "text": response,
+            "usage": {
+                "prompt_tokens": input_echo_len,
+                "completion_tokens": total_len - input_echo_len,
+                "total_tokens": total_len,
+            },
+            "finish_reason": None,
+        }
+    # TODO: ChatGLM stop when it reach max length
+    # Only last stream result contains finish_reason, we set finish_reason as stop
+    ret = {
+        "text": response,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": total_len - input_echo_len,
+            "total_tokens": total_len,
+        },
+        "finish_reason": "stop",
+    }
+    yield ret

src/model/model_cllm.py ADDED Viewed

	@@ -0,0 +1,202 @@

+import torch
+import gc
+import os
+import time
+import random
+from typing import Dict, Optional, Sequence, List, Tuple
+from transformers.cache_utils import Cache, DynamicCache
+from transformers import (
+    LlamaModel,
+    LlamaForCausalLM,
+    GenerationConfig,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    TextIteratorStreamer,
+)
+from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+import torch.nn.functional as F
+def get_jacobian_trajectory(
+    model, tokenizer, input_ids, attention_mask, max_new_tokens
+):
+    """Iteratively refine a block of ``max_new_tokens`` guessed tokens.
+    Starting from a random guess placed after the prompt, the whole sequence is
+    re-predicted with greedy argmax on every iteration until it stops changing
+    or an EOS token appears. Newly settled text of the first sequence is
+    printed to stdout as it becomes available.
+    Returns:
+        ``(next_generation, itr)``: the final token tensor and the iteration
+        counter at exit.
+    """
+    bsz = input_ids.shape[0]
+    # Per-row count of attended positions, taken as the prompt length.
+    prompt_len = [torch.sum(t) for t in attention_mask]
+    max_prompt_len = max(prompt_len)
+    total_len = max_prompt_len + max_new_tokens
+    # initialize the first point of jacobian trajectory
+    tokens = torch.full(
+        (bsz, total_len), tokenizer.pad_token_id, dtype=torch.long, device=model.device
+    )
+    for i in range(bsz):
+        # Fill the row with tokens sampled (with replacement) from this row's
+        # attended prompt tokens, then restore the real prompt as the prefix.
+        tokens[i, :] = torch.tensor(
+            random.choices(input_ids[i][attention_mask[i] == 1], k=total_len),
+            dtype=torch.long,
+            device=model.device,
+        )
+        tokens[i, : prompt_len[i]] = input_ids[i][: prompt_len[i]].to(
+            dtype=torch.long, device=model.device
+        )
+    itr = 0
+    next_generation = tokens
+    generate_attention_mask = torch.full_like(next_generation, 1).to(model.device)
+    # NOTE(review): `i` here is the leftover index from the loop above, so every
+    # entry is the LAST row's prompt length. Harmless for batch size 1; confirm
+    # whether per-row lengths were intended.
+    accurate_lengths = torch.tensor([prompt_len[i].item()] * bsz, device=model.device)
+    prev_len = 0
+    while True:
+        current_generation = next_generation
+        with torch.no_grad():
+            logits = model(current_generation, generate_attention_mask).logits
+        # Greedy pick per position; dividing the softmax by a positive constant
+        # does not change which index is the argmax.
+        next_generation = torch.argmax(
+            torch.nn.functional.softmax(logits, dim=-1) / 0.001, dim=-1
+        )
+        # hold prompt unchanged and update generated tokens
+        for i in range(bsz):
+            # The predictions at positions prompt_len-1 .. total_len-2 become
+            # the tokens at positions prompt_len .. total_len-1.
+            next_generation[i, :] = torch.cat(
+                (
+                    tokens[i, : prompt_len[i]],
+                    next_generation[i, prompt_len[i] - 1 : total_len - 1],
+                ),
+                dim=0,
+            )
+        # `and` binds tighter than `or`: exit when (converged AND
+        # itr == max_new_tokens) OR when the settled prefix of row 0 of the
+        # current guess already contains EOS.
+        if (
+            torch.all(torch.eq(next_generation, current_generation)).item()
+            and itr == max_new_tokens
+            or len(
+                torch.where(
+                    current_generation[0, : accurate_lengths[0]]
+                    == tokenizer.eos_token_id
+                )[0]
+            )
+            > 0
+        ):
+            # forced exit due to max_new_tokens constraint or eos reached
+            return next_generation, itr
+        # skip the first itr, current_generation has not been updated yet
+        if itr != 0:
+            if torch.all(torch.eq(next_generation, current_generation)).item():
+                matched_position = total_len
+            else:
+                # First position where the old and new guesses differ.
+                # NOTE(review): squeeze(0) only drops the batch axis when
+                # bsz == 1; this looks written for a single sequence, confirm.
+                matched_position = (
+                    torch.eq(current_generation, next_generation).squeeze(0) == False
+                ).nonzero(as_tuple=True)[0][0]
+            # NOTE(review): fast_forward_cnt is computed but not read again in
+            # this function.
+            fast_forward_cnt = matched_position - accurate_lengths[0]
+            for i in range(bsz):
+                accurate_lengths[i] = matched_position.item()
+            # flush and print the first sequence
+            generated_str = tokenizer.decode(
+                next_generation[0, prompt_len[0] : accurate_lengths[0]],
+                skip_special_tokens=True,
+                spaces_between_special_tokens=False,
+                clean_up_tokenization_spaces=True,
+            )
+            # Print only the text that was not printed on earlier iterations.
+            print(generated_str[prev_len:], flush=True, end="")
+            prev_len = len(generated_str)
+            if torch.all(torch.eq(next_generation, current_generation)).item():
+                # early termination: itr < max_new_tokens
+                return next_generation, itr
+        itr += 1
+def generate_stream_cllm(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    # converge_step = []
+    prompt = params["prompt"]
+    inputs = tokenizer(prompt, return_tensors="pt").to(device)
+    max_new_tokens = int(params.get("n_token_seq_length", 32))
+    max_new_seq_len = int(params.get("max_new_tokens", 1024))
+    prompt_len = torch.sum(inputs["attention_mask"], dim=-1)
+    generation = inputs["input_ids"]
+    input_echo_len = len(generation)
+    ### generation phase
+    itr = 0
+    eos_reached = False
+    while True:
+        if itr == 0:
+            input_ids = inputs["input_ids"]
+            input_masks = inputs["attention_mask"]
+        else:
+            input_masks = torch.ones_like(input_ids).to(device)
+            for j in range(bsz):
+                input_masks[j][
+                    torch.sum(inputs["attention_mask"], dim=-1)[j]
+                    + itr * max_new_tokens :
+                ] = 0
+        bsz = input_ids.shape[0]
+        eos_reached = torch.tensor([False] * bsz, device=device)
+        generation, iter_steps = get_jacobian_trajectory(
+            model=model,
+            tokenizer=tokenizer,
+            input_ids=input_ids,
+            attention_mask=input_masks,
+            max_new_tokens=max_new_tokens,
+        )
+        ### inspect <eos>
+        for j in range(bsz):
+            prompt_len = torch.sum(input_masks, dim=-1)
+            eos_positions = torch.where(generation[j] == tokenizer.eos_token_id)[0]
+            if len(eos_positions) == 0:
+                # no EOS, continue to the next item in the batch
+                generation[j][prompt_len[j] + max_new_tokens :] = tokenizer.pad_token_id
+                continue
+            # otherwise, set tokens coming after EOS as pad
+            else:
+                if len(eos_positions) != 0:
+                    eos_reached[j] = True
+                    generation[j, int(eos_positions[0]) + 1 :] = tokenizer.pad_token_id
+        itr += 1
+        if all(eos_reached) or itr * max_new_tokens >= max_new_seq_len:
+            break
+        input_ids = generation[
+            torch.where(eos_reached == False)[0].tolist(), ...
+        ]  # delete samples with <eos> generated
+    if all(eos_reached):
+        finish_reason = "eos"
+    elif itr * max_new_tokens > max_new_seq_len:
+        finish_reason = "length"
+    else:
+        finish_reason = "stop"
+    output = tokenizer.decode(input_ids[0], skip_special_tokens=False)
+    yield {
+        "text": "",
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": itr * max_new_tokens,
+            "total_tokens": input_echo_len + itr * max_new_tokens,
+        },
+        "finish_reason": finish_reason,
+    }
+    # clean
+    gc.collect()
+    torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()

src/model/model_codet5p.py ADDED Viewed

	@@ -0,0 +1,108 @@

+import gc
+from threading import Thread
+import torch
+import transformers
+from transformers import (
+    GenerationConfig,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    TextIteratorStreamer,
+)
+@torch.inference_mode()
+def generate_stream_codet5p(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=2048,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    prompt = params["prompt"]
+    temperature = float(params.get("temperature", 1.0))
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    top_p = float(params.get("top_p", 1.0))
+    top_k = int(params.get("top_k", 50))  # -1 means disable
+    max_new_tokens = int(params.get("max_new_tokens", 1024))
+    stop_token_ids = params.get("stop_token_ids", None) or []
+    stop_token_ids.append(tokenizer.eos_token_id)
+    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    streamer = TextIteratorStreamer(tokenizer, **decode_config)
+    encoding = tokenizer(prompt, return_tensors="pt").to(device)
+    input_ids = encoding.input_ids
+    encoding["decoder_input_ids"] = encoding["input_ids"].clone()
+    input_echo_len = len(input_ids)
+    generation_config = GenerationConfig(
+        max_new_tokens=max_new_tokens,
+        do_sample=temperature >= 1e-5,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        no_repeat_ngram_size=10,
+        top_p=top_p,
+        top_k=top_k,
+        eos_token_id=stop_token_ids,
+    )
+    class CodeBlockStopper(StoppingCriteria):
+        def __call__(
+            self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+        ) -> bool:
+            # Code-completion is open-end generation.
+            # We check \n\n to stop at end of a code block.
+            if list(input_ids[0][-2:]) == [628, 198]:
+                return True
+            return False
+    gen_kwargs = dict(
+        **encoding,
+        streamer=streamer,
+        generation_config=generation_config,
+        stopping_criteria=StoppingCriteriaList([CodeBlockStopper()]),
+    )
+    thread = Thread(target=model.generate, kwargs=gen_kwargs)
+    thread.start()
+    i = 0
+    output = ""
+    for new_text in streamer:
+        i += 1
+        output += new_text
+        if i % stream_interval == 0 or i == max_new_tokens - 1:
+            yield {
+                "text": output,
+                "usage": {
+                    "prompt_tokens": input_echo_len,
+                    "completion_tokens": i,
+                    "total_tokens": input_echo_len + i,
+                },
+                "finish_reason": None,
+            }
+        if i >= max_new_tokens:
+            break
+    if i >= max_new_tokens:
+        finish_reason = "length"
+    else:
+        finish_reason = "stop"
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": i,
+            "total_tokens": input_echo_len + i,
+        },
+        "finish_reason": finish_reason,
+    }
+    thread.join()
+    # clean
+    gc.collect()
+    torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()

src/model/model_exllama.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import gc
+import sys
+from typing import Dict
+import torch
+def generate_stream_exllama(
+    model,
+    tokenizer,
+    params: Dict,
+    device: str,
+    context_len: int,
+    stream_interval: int = 2,
+    judge_sent_end: bool = False,
+):
+    try:
+        from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler
+    except ImportError as e:
+        print(f"Error: Failed to load Exllamav2. {e}")
+        sys.exit(-1)
+    prompt = params["prompt"]
+    generator = ExLlamaV2StreamingGenerator(model.model, model.cache, tokenizer)
+    settings = ExLlamaV2Sampler.Settings()
+    settings.temperature = float(params.get("temperature", 0.85))
+    settings.top_k = int(params.get("top_k", 50))
+    settings.top_p = float(params.get("top_p", 0.8))
+    settings.token_repetition_penalty = float(params.get("repetition_penalty", 1.15))
+    settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])
+    max_new_tokens = int(params.get("max_new_tokens", 256))
+    generator.set_stop_conditions(params.get("stop_token_ids", None) or [])
+    echo = bool(params.get("echo", True))
+    input_ids = generator.tokenizer.encode(prompt)
+    prompt_tokens = input_ids.shape[-1]
+    generator.begin_stream(input_ids, settings)
+    generated_tokens = 0
+    if echo:
+        output = prompt
+    else:
+        output = ""
+    while True:
+        chunk, eos, _ = generator.stream()
+        output += chunk
+        generated_tokens += 1
+        if generated_tokens == max_new_tokens:
+            finish_reason = "length"
+            break
+        elif eos:
+            finish_reason = "length"
+            break
+        yield {
+            "text": output,
+            "usage": {
+                "prompt_tokens": prompt_tokens,
+                "completion_tokens": generated_tokens,
+                "total_tokens": prompt_tokens + generated_tokens,
+            },
+            "finish_reason": None,
+        }
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": generated_tokens,
+            "total_tokens": prompt_tokens + generated_tokens,
+        },
+        "finish_reason": finish_reason,
+    }
+    gc.collect()

src/model/model_falcon.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import gc
+from threading import Thread
+from typing import Iterable
+import torch
+import transformers
+from transformers import TextIteratorStreamer, GenerationConfig
+from fastchat.utils import is_partial_stop
+@torch.inference_mode()
+def generate_stream_falcon(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=2048,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    prompt = params["prompt"]
+    len_prompt = len(prompt)
+    temperature = float(params.get("temperature", 1.0))
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    top_p = float(params.get("top_p", 1.0))
+    top_k = int(params.get("top_k", 50))  # -1 means disable
+    max_new_tokens = int(params.get("max_new_tokens", 256))
+    stop_str = params.get("stop", None)
+    echo = bool(params.get("echo", True))
+    stop_token_ids = params.get("stop_token_ids", None) or []
+    stop_token_ids.append(tokenizer.eos_token_id)
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs["attention_mask"]
+    max_src_len = context_len - max_new_tokens - 8
+    input_ids = input_ids[-max_src_len:]  # truncate from the left
+    attention_mask = attention_mask[-max_src_len:]  # truncate from the left
+    input_echo_len = len(input_ids)
+    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)
+    generation_config = GenerationConfig(
+        max_new_tokens=max_new_tokens,
+        do_sample=temperature >= 1e-5,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        no_repeat_ngram_size=10,
+        top_p=top_p,
+        top_k=top_k,
+        eos_token_id=stop_token_ids,
+    )
+    generation_kwargs = dict(
+        inputs=input_ids,
+        attention_mask=attention_mask,
+        streamer=streamer,
+        generation_config=generation_config,
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    if echo:
+        # means keep the prompt
+        output = prompt
+    else:
+        output = ""
+    for i, new_text in enumerate(streamer):
+        output += new_text
+        if i % stream_interval == 0:
+            if echo:
+                rfind_start = len_prompt
+            else:
+                rfind_start = 0
+            partially_stopped = False
+            if stop_str:
+                if isinstance(stop_str, str):
+                    pos = output.rfind(stop_str, rfind_start)
+                    if pos != -1:
+                        output = output[:pos]
+                    else:
+                        partially_stopped = is_partial_stop(output, stop_str)
+                elif isinstance(stop_str, Iterable):
+                    for each_stop in stop_str:
+                        pos = output.rfind(each_stop, rfind_start)
+                        if pos != -1:
+                            output = output[:pos]
+                            break
+                        else:
+                            partially_stopped = is_partial_stop(output, each_stop)
+                            if partially_stopped:
+                                break
+                else:
+                    raise ValueError("Invalid stop field type.")
+            # prevent yielding partial stop sequence
+            if not partially_stopped:
+                yield {
+                    "text": output,
+                    "usage": {
+                        "prompt_tokens": input_echo_len,
+                        "completion_tokens": i,
+                        "total_tokens": input_echo_len + i,
+                    },
+                    "finish_reason": None,
+                }
+    output = output.strip()
+    # finish stream event, which contains finish reason
+    if i == max_new_tokens - 1:
+        finish_reason = "length"
+    elif partially_stopped:
+        finish_reason = None
+    else:
+        finish_reason = "stop"
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": i,
+            "total_tokens": input_echo_len + i,
+        },
+        "finish_reason": finish_reason,
+    }
+    # clean
+    gc.collect()
+    torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()

src/model/model_registry.py ADDED Viewed

	@@ -0,0 +1,764 @@

+"""Additional information of the models."""
+from collections import namedtuple, OrderedDict
+from typing import List
+# Display metadata for one model: short name, reference link, description.
+ModelInfo = namedtuple("ModelInfo", ["simple_name", "link", "description"])
+# Registry keyed by full model name; filled by register_model_info().
+model_info = OrderedDict()
+def register_model_info(
+    full_names: List[str], simple_name: str, link: str, description: str
+):
+    info = ModelInfo(simple_name, link, description)
+    for full_name in full_names:
+        model_info[full_name] = info
+def get_model_info(name: str) -> ModelInfo:
+    """Look up the registered ``ModelInfo`` for ``name``.
+
+    Unregistered names do not raise: a placeholder record is returned whose
+    ``simple_name`` is ``name`` itself, whose ``link`` is empty, and whose
+    description says where the model should be registered.
+    """
+    try:
+        return model_info[name]
+    except KeyError:
+        # Unknown model: register it with `register_model_info` to replace this placeholder.
+        return ModelInfo(
+            name, "", "Register the description at fastchat/model/model_registry.py"
+        )
+register_model_info(
+    [
+        "IEITYuan/Yuan2-2B-Janus-hf",
+        "IEITYuan/Yuan2-2B-hf",
+        "IEITYuan/Yuan2-51B-hf",
+        "IEITYuan/Yuan2-102B-hf",
+    ],
+    "IEIT-Yuan2",
+    "https://github.com/IEIT-Yuan/Yuan-2.0",
+    "Yuan2.0 is a new generation Fundamental Large Language Model developed by IEIT System.",
+)
+register_model_info(
+    [
+        "claude-3-haiku-20240307",
+        "claude-3-sonnet-20240229",
+        "claude-3-opus-20240229",
+        "claude-2.1",
+        "claude-2.0",
+        "claude-1",
+    ],
+    "Claude",
+    "https://www.anthropic.com/news/claude-3-family",
+    "Claude by Anthropic",
+)
+register_model_info(
+    ["reka-flash", "reka-flash-online"],
+    "Reka Flash",
+    "https://www.reka.ai/news/reka-flash-efficient-and-capable-multimodal-language-models",
+    "Multimodal model by Reka",
+)
+register_model_info(
+    ["command-r-plus"],
+    "Command-R-Plus",
+    "https://txt.cohere.com/command-r-plus-microsoft-azure/",
+    "Command-R Plus by Cohere",
+)
+register_model_info(
+    ["command-r"],
+    "Command-R",
+    "https://txt.cohere.com/command-r/",
+    "Command-R by Cohere",
+)
+register_model_info(
+    [
+        "zephyr-orpo-141b-A35b-v0.1",
+    ],
+    "Zephyr 141B-A35B",
+    "https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1",
+    "ORPO fine-tuned of Mixtral-8x22B-v0.1",
+)
+register_model_info(
+    ["gemma-1.1-7b-it", "gemma-1.1-2b-it", "gemma-7b-it", "gemma-2b-it"],
+    "Gemma",
+    "https://blog.google/technology/developers/gemma-open-models/",
+    "Gemma by Google",
+)
+register_model_info(
+    [
+        "mixtral-8x7b-instruct-v0.1",
+        "mistral-large-2402",
+        "mistral-medium",
+        "mistral-next",
+        "mistral-7b-instruct-v0.2",
+        "mistral-7b-instruct",
+    ],
+    "Mixtral of experts",
+    "https://mistral.ai/news/mixtral-of-experts/",
+    "A Mixture-of-Experts model by Mistral AI",
+)
+register_model_info(
+    [
+        "qwen1.5-72b-chat",
+        "qwen1.5-32b-chat",
+        "qwen1.5-14b-chat",
+        "qwen1.5-7b-chat",
+        "qwen1.5-4b-chat",
+        "qwen1.5-1.8b-chat",
+        "qwen1.5-0.5b-chat",
+        # NOTE(review): "qwen-14b-chat" is registered again by a later call in this
+        # file (simple name "Qwen"); that later call overwrites the record set here.
+        "qwen-14b-chat",
+    ],
+    "Qwen 1.5",
+    "https://qwenlm.github.io/blog/qwen1.5/",
+    "A large language model by Alibaba Cloud",
+)
+register_model_info(
+    ["dbrx-instruct"],
+    "DBRX Instruct",
+    "https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm",
+    "DBRX by Databricks Mosaic AI",
+)
+register_model_info(
+    ["starling-lm-7b-beta", "starling-lm-7b-alpha"],
+    "Starling-LM-7B",
+    "https://starling.cs.berkeley.edu/",
+    "An open model trained using RLAIF by Berkeley",
+)
+# NOTE(review): "qwen-14b-chat" is also listed in the "Qwen 1.5" group registered
+# earlier in this file; because this call runs later, its record is the one kept.
+register_model_info(
+    ["qwen-14b-chat"],
+    "Qwen",
+    "https://huggingface.co/Qwen",
+    "A large language model by Alibaba Cloud",
+)
+register_model_info(
+    ["bard-feb-2024", "bard-jan-24-gemini-pro"],
+    "Bard",
+    "https://bard.google.com/",
+    "Bard by Google",
+)
+register_model_info(
+    [
+        "gemini-pro",
+        "gemini-pro-dev-api",
+        "gemini-1.0-pro-vision",
+        "gemini-1.5-pro-preview-0409",
+    ],
+    "Gemini",
+    "https://blog.google/technology/ai/google-gemini-pro-imagen-duet-ai-update/",
+    "Gemini by Google",
+)
+register_model_info(
+    ["stripedhyena-nous-7b"],
+    "StripedHyena-Nous",
+    "https://huggingface.co/togethercomputer/StripedHyena-Nous-7B",
+    "A chat model developed by Together Research and Nous Research.",
+)
+register_model_info(
+    ["solar-10.7b-instruct-v1.0"],
+    "SOLAR-10.7B-Instruct",
+    "https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0",
+    "A model trained using depth up-scaling by Upstage AI",
+)
+register_model_info(
+    [
+        "gpt-4-turbo",
+        "gpt-4-turbo-2024-04-09",
+        "gpt-4-1106-preview",
+        "gpt-4-0125-preview",
+    ],
+    "GPT-4-Turbo",
+    "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
+    "GPT-4-Turbo by OpenAI",
+)
+register_model_info(
+    ["gpt-4-turbo-browsing"],
+    "GPT-4-Turbo with browsing",
+    "https://platform.openai.com/docs/assistants/overview",
+    "GPT-4-Turbo with browsing by OpenAI",
+)
+register_model_info(
+    [
+        "gpt-3.5-turbo",
+        "gpt-3.5-turbo-0125",
+        "gpt-3.5-turbo-1106",
+        "gpt-3.5-turbo-0314",
+        "gpt-3.5-turbo-0613",
+    ],
+    "GPT-3.5",
+    "https://platform.openai.com/docs/models/gpt-3-5",
+    "GPT-3.5-Turbo by OpenAI",
+)
+register_model_info(
+    ["gpt-4", "gpt-4-0314", "gpt-4-0613"],
+    "GPT-4",
+    "https://openai.com/research/gpt-4",
+    "GPT-4 by OpenAI",
+)
+register_model_info(
+    ["claude-instant-1", "claude-instant-1.2"],
+    "Claude Instant",
+    "https://www.anthropic.com/index/introducing-claude",
+    "Claude Instant by Anthropic",
+)
+register_model_info(
+    ["llama-2-70b-chat", "llama-2-34b-chat", "llama-2-13b-chat", "llama-2-7b-chat"],
+    "Llama 2",
+    "https://ai.meta.com/llama/",
+    "Open foundation and fine-tuned chat models by Meta",
+)
+register_model_info(
+    ["olmo-7b-instruct"],
+    "OLMo-7B",
+    "https://huggingface.co/allenai/OLMo-7B-Instruct",
+    "OLMo by Allen AI",
+)
+register_model_info(
+    [
+        "vicuna-33b",
+        "vicuna-33b-v1.3",
+        "vicuna-13b",
+        "vicuna-13b-v1.5",
+        "vicuna-7b",
+        "vicuna-7b-v1.5",
+    ],
+    "Vicuna",
+    "https://lmsys.org/blog/2023-03-30-vicuna/",
+    "A chat assistant fine-tuned on user-shared conversations by LMSYS",
+)
+register_model_info(
+    ["yi-34b-chat", "yi-6b-chat"],
+    "Yi-Chat",
+    "https://huggingface.co/01-ai/Yi-34B-Chat",
+    "A large language model by 01 AI",
+)
+register_model_info(
+    [
+        "codellama-70b-instruct",
+        "codellama-34b-instruct",
+        "codellama-13b-instruct",
+        "codellama-7b-instruct",
+    ],
+    "Code Llama",
+    "https://ai.meta.com/blog/code-llama-large-language-model-coding/",
+    "Open foundation models for code by Meta",
+)
+register_model_info(
+    ["openchat-3.5-0106", "openchat-3.5"],
+    "OpenChat 3.5",
+    "https://github.com/imoneoi/openchat",
+    "An open model fine-tuned on Mistral-7B using C-RLFT",
+)
+register_model_info(
+    ["deepseek-llm-67b-chat"],
+    "DeepSeek LLM",
+    "https://huggingface.co/deepseek-ai/deepseek-llm-67b-chat",
+    "An advanced language model by DeepSeek",
+)
+# NOTE(review): exact repeat of the "stripedhyena-nous-7b" registration made earlier
+# in this file (same simple name, link and description), so it changes nothing.
+register_model_info(
+    ["stripedhyena-nous-7b"],
+    "StripedHyena-Nous",
+    "https://huggingface.co/togethercomputer/StripedHyena-Nous-7B",
+    "A chat model developed by Together Research and Nous Research.",
+)
+register_model_info(
+    ["nous-hermes-2-mixtral-8x7b-dpo"],
+    "Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+    "Nous Hermes finetuned from Mixtral 8x7B",
+)
+register_model_info(
+    ["llama2-70b-steerlm-chat"],
+    "Llama2-70B-SteerLM-Chat",
+    "https://huggingface.co/nvidia/Llama2-70B-SteerLM-Chat",
+    "A Llama fine-tuned with SteerLM method by NVIDIA",
+)
+register_model_info(
+    ["pplx-70b-online", "pplx-7b-online"],
+    "pplx-online-llms",
+    "https://blog.perplexity.ai/blog/introducing-pplx-online-llms",
+    "Online LLM API by Perplexity AI",
+)
+register_model_info(
+    ["openhermes-2.5-mistral-7b"],
+    "OpenHermes-2.5-Mistral-7B",
+    "https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B",
+    "A mistral-based model fine-tuned on 1M GPT-4 outputs",
+)
+register_model_info(
+    ["tulu-2-dpo-70b"],
+    "Tulu 2",
+    "https://huggingface.co/allenai/tulu-2-dpo-70b",
+    "An instruction and RLHF model by UW/AllenAI",
+)
+register_model_info(
+    ["chatglm3-6b", "chatglm2-6b", "chatglm-6b"],
+    "ChatGLM",
+    "https://chatglm.cn/blog",
+    "An open bilingual dialogue language model by Tsinghua University",
+)
+register_model_info(
+    ["tenyxchat-7b-v1"],
+    "TenyxChat-7B",
+    "https://huggingface.co/tenyx/TenyxChat-7B-v1",
+    "An open model DPO trained on top of OpenChat-3.5 using Tenyx fine-tuning",
+)
+register_model_info(
+    ["zephyr-7b-beta", "zephyr-7b-alpha"],
+    "Zephyr",
+    "https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha",
+    "A chatbot fine-tuned from Mistral by Hugging Face",
+)
+register_model_info(
+    ["notus-7b-v1"],
+    "Notus",
+    "https://huggingface.co/argilla/notus-7b-v1",
+    "A chatbot fine-tuned from Zephyr SFT by Argilla",
+)
+register_model_info(
+    ["catppt"],
+    "CatPPT",
+    "https://huggingface.co/rishiraj/CatPPT",
+    "A chatbot fine-tuned from a SLERP merged model by Rishiraj Acharya",
+)
+register_model_info(
+    ["TinyLlama"],
+    "TinyLlama",
+    "https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "The TinyLlama project is an open endeavor to pretrain a 1.1B Llama model on 3 trillion tokens.",
+)
+register_model_info(
+    ["wizardlm-70b", "wizardlm-30b", "wizardlm-13b"],
+    "WizardLM",
+    "https://github.com/nlpxucan/WizardLM",
+    "An instruction-following LLM using evol-instruct by Microsoft",
+)
+register_model_info(
+    ["wizardcoder-15b-v1.0"],
+    "WizardLM",
+    "https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder",
+    "Empowering Code Large Language Models with Evol-Instruct",
+)
+register_model_info(
+    ["mpt-7b-chat", "mpt-30b-chat"],
+    "MPT-Chat",
+    "https://www.mosaicml.com/blog/mpt-30b",
+    "A chatbot fine-tuned from MPT by MosaicML",
+)
+register_model_info(
+    ["guanaco-33b", "guanaco-65b"],
+    "Guanaco",
+    "https://github.com/artidoro/qlora",
+    "A model fine-tuned with QLoRA by UW",
+)
+register_model_info(
+    ["gpt4all-13b-snoozy"],
+    "GPT4All-Snoozy",
+    "https://github.com/nomic-ai/gpt4all",
+    "A finetuned LLaMA model on assistant style data by Nomic AI",
+)
+register_model_info(
+    ["koala-13b"],
+    "Koala",
+    "https://bair.berkeley.edu/blog/2023/04/03/koala",
+    "A dialogue model for academic research by BAIR",
+)
+register_model_info(
+    ["RWKV-4-Raven-14B"],
+    "RWKV-4-Raven",
+    "https://huggingface.co/BlinkDL/rwkv-4-raven",
+    "An RNN with transformer-level LLM performance",
+)
+register_model_info(
+    ["alpaca-13b"],
+    "Alpaca",
+    "https://crfm.stanford.edu/2023/03/13/alpaca.html",
+    "A model fine-tuned from LLaMA on instruction-following demonstrations by Stanford",
+)
+register_model_info(
+    ["oasst-pythia-12b"],
+    "OpenAssistant (oasst)",
+    "https://open-assistant.io",
+    "An Open Assistant for everyone by LAION",
+)
+register_model_info(
+    ["oasst-sft-7-llama-30b"],
+    "OpenAssistant (oasst)",
+    "https://open-assistant.io",
+    "An Open Assistant for everyone by LAION",
+)
+register_model_info(
+    ["palm-2"],
+    "PaLM 2 Chat",
+    "https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023",
+    "PaLM 2 for Chat (chat-bison@001) by Google",
+)
+register_model_info(
+    ["llama-7b", "llama-13b"],
+    "LLaMA",
+    "https://arxiv.org/abs/2302.13971",
+    "Open and efficient foundation language models by Meta",
+)
+register_model_info(
+    ["open-llama-7b-v2-open-instruct", "open-llama-7b-open-instruct"],
+    "Open LLaMa (Open Instruct)",
+    "https://medium.com/vmware-data-ml-blog/starter-llm-for-the-enterprise-instruction-tuning-openllama-7b-d05fc3bbaccc",
+    "Open LLaMa fine-tuned on instruction-following data by VMware",
+)
+register_model_info(
+    ["dolly-v2-12b"],
+    "Dolly",
+    "https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm",
+    "An instruction-tuned open large language model by Databricks",
+)
+register_model_info(
+    ["stablelm-tuned-alpha-7b"],
+    "StableLM",
+    "https://github.com/stability-AI/stableLM",
+    "Stability AI language models",
+)
+register_model_info(
+    ["codet5p-6b"],
+    "CodeT5p-6b",
+    "https://huggingface.co/Salesforce/codet5p-6b",
+    "Code completion model released by Salesforce",
+)
+register_model_info(
+    ["fastchat-t5-3b", "fastchat-t5-3b-v1.0"],
+    "FastChat-T5",
+    "https://huggingface.co/lmsys/fastchat-t5-3b-v1.0",
+    "A chat assistant fine-tuned from FLAN-T5 by LMSYS",
+)
+register_model_info(
+    ["phoenix-inst-chat-7b"],
+    "Phoenix-7B",
+    "https://huggingface.co/FreedomIntelligence/phoenix-inst-chat-7b",
+    "A multilingual chat assistant fine-tuned from Bloomz to democratize ChatGPT across languages by CUHK(SZ)",
+)
+register_model_info(
+    ["realm-7b-v1"],
+    "ReaLM",
+    "https://github.com/FreedomIntelligence/ReaLM",
+    "A chatbot fine-tuned from LLaMA2 with data generated via iterative calls to UserGPT and ChatGPT by CUHK(SZ) and SRIBD.",
+)
+register_model_info(
+    ["billa-7b-sft"],
+    "BiLLa-7B-SFT",
+    "https://huggingface.co/Neutralzz/BiLLa-7B-SFT",
+    "An instruction-tuned bilingual LLaMA with enhanced reasoning ability by an independent researcher",
+)
+register_model_info(
+    ["h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2"],
+    "h2oGPT-GM-7b",
+    "https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2",
+    "An instruction-tuned OpenLLaMA with enhanced conversational ability by H2O.ai",
+)
+# Baize v2 chat models; the description is shown to users, so its wording matters.
+register_model_info(
+    ["baize-v2-7b", "baize-v2-13b"],
+    "Baize v2",
+    "https://github.com/project-baize/baize-chatbot#v2",
+    "A chatbot fine-tuned from LLaMA with ChatGPT self-chat data and Self-Distillation with Feedback (SDF) by UCSD and SYSU.",
+)
+register_model_info(
+    [
+        "airoboros-l2-7b-2.1",
+        "airoboros-l2-13b-2.1",
+        "airoboros-c34b-2.1",
+        "airoboros-l2-70b-2.1",
+    ],
+    "airoboros",
+    "https://huggingface.co/jondurbin/airoboros-l2-70b-2.1",
+    "An instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4",
+)
+register_model_info(
+    [
+        "spicyboros-7b-2.2",
+        "spicyboros-13b-2.2",
+        "spicyboros-70b-2.2",
+    ],
+    "spicyboros",
+    "https://huggingface.co/jondurbin/spicyboros-70b-2.2",
+    "De-aligned versions of the airoboros models",
+)
+register_model_info(
+    ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"],
+    "Robin-v2",
+    "https://huggingface.co/OptimalScale/robin-7b-v2-delta",
+    "A chatbot fine-tuned from LLaMA-7b, achieving competitive performance on chitchat, commonsense reasoning and instruction-following tasks, by OptimalScale, HKUST.",
+)
+register_model_info(
+    ["manticore-13b-chat"],
+    "Manticore 13B Chat",
+    "https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg",
+    "A chatbot fine-tuned from LlaMa across several CoT and chat datasets.",
+)
+register_model_info(
+    ["redpajama-incite-7b-chat"],
+    "RedPajama-INCITE-7B-Chat",
+    "https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat",
+    "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together",
+)
+register_model_info(
+    [
+        "falcon-7b",
+        "falcon-7b-instruct",
+        "falcon-40b",
+        "falcon-40b-instruct",
+        "falcon-180b",
+        "falcon-180b-chat",
+    ],
+    "Falcon",
+    "https://huggingface.co/tiiuae/falcon-180B",
+    "TII's flagship series of large language models",
+)
+register_model_info(
+    ["tigerbot-7b-sft"],
+    "Tigerbot",
+    "https://huggingface.co/TigerResearch/tigerbot-7b-sft",
+    "A large-scale language model (LLM) with multiple languages and tasks.",
+)
+register_model_info(
+    ["internlm-chat-7b", "internlm-chat-7b-8k"],
+    "InternLM",
+    "https://huggingface.co/internlm/internlm-chat-7b",
+    "A multi-language large-scale language model (LLM), developed by SHLAB.",
+)
+register_model_info(
+    ["Qwen-7B-Chat"],
+    "Qwen",
+    "https://huggingface.co/Qwen/Qwen-7B-Chat",
+    "A multi-language large-scale language model (LLM), developed by Damo Academy.",
+)
+register_model_info(
+    ["smaug-2-72b"],
+    "Smaug-2-72B",
+    "https://huggingface.co/abacusai/Smaug-2-72B",
+    "An open model trained by Abacus.AI.",
+)
+register_model_info(
+    ["Llama2-Chinese-13b-Chat", "LLama2-Chinese-13B"],
+    "Llama2-Chinese",
+    "https://huggingface.co/FlagAlpha/Llama2-Chinese-13b-Chat",
+    "A multi-language large-scale language model (LLM), developed by FlagAlpha.",
+)
+register_model_info(
+    ["Meta-Llama-3-8B-Instruct", "Meta-Llama-3-70B-Instruct"],
+    "llama-3",
+    "https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct",
+    "Meta developed and released the Meta Llama 3 family of large language models (LLMs), a collection of pretrained and instruction tuned generative text models in 8 and 70B sizes.",
+)
+register_model_info(
+    ["Chinese-Alpaca-2-7B", "Chinese-Alpaca-2-13B"],
+    "Chinese-Alpaca",
+    "https://huggingface.co/hfl/chinese-alpaca-2-13b",
+    "New extended Chinese vocabulary beyond Llama-2, open-sourcing the Chinese LLaMA-2 and Alpaca-2 LLMs.",
+)
+register_model_info(
+    ["Vigogne-2-7B-Instruct", "Vigogne-2-13B-Instruct"],
+    "Vigogne-Instruct",
+    "https://huggingface.co/bofenghuang/vigogne-2-7b-instruct",
+    "A French large language model (LLM) optimized for instruction-following, developed by Bofeng Huang",
+)
+register_model_info(
+    ["Vigogne-2-7B-Chat", "Vigogne-2-13B-Chat"],
+    "Vigogne-Chat",
+    "https://huggingface.co/bofenghuang/vigogne-2-7b-chat",
+    "A French large language model (LLM) optimized for instruction-following and multi-turn dialogues, developed by Bofeng Huang",
+)
+register_model_info(
+    ["stable-vicuna-13B-HF"],
+    "stable-vicuna",
+    "https://huggingface.co/TheBloke/stable-vicuna-13B-HF",
+    "A Vicuna model fine-tuned using RLHF via PPO on various conversational and instructional datasets.",
+)
+register_model_info(
+    ["deluxe-chat-v1", "deluxe-chat-v1.1", "deluxe-chat-v1.2", "deluxe-chat-v1.3"],
+    "DeluxeChat",
+    "",
+    "Deluxe Chat",
+)
+register_model_info(
+    [
+        "Xwin-LM-7B-V0.1",
+        "Xwin-LM-13B-V0.1",
+        "Xwin-LM-70B-V0.1",
+        "Xwin-LM-7B-V0.2",
+        "Xwin-LM-13B-V0.2",
+    ],
+    "Xwin-LM",
+    "https://github.com/Xwin-LM/Xwin-LM",
+    "Chat models developed by Xwin-LM team",
+)
+register_model_info(
+    ["lemur-70b-chat"],
+    "Lemur-Chat",
+    "https://huggingface.co/OpenLemur/lemur-70b-chat-v1",
+    "An openly accessible language model optimized for both natural language and coding capabilities ",
+)
+register_model_info(
+    ["Mistral-7B-OpenOrca"],
+    "Open-Orca",
+    "https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca",
+    "A fine-tune of [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) using [OpenOrca dataset](https://huggingface.co/datasets/Open-Orca/OpenOrca)",
+)
+register_model_info(
+    ["dolphin-2.2.1-mistral-7b"],
+    "dolphin-mistral",
+    "https://huggingface.co/ehartford/dolphin-2.2.1-mistral-7b",
+    "An uncensored fine-tuned Mistral 7B",
+)
+register_model_info(
+    [
+        "AquilaChat-7B",
+        "AquilaChat2-7B",
+        "AquilaChat2-34B",
+    ],
+    "Aquila-Chat",
+    "https://huggingface.co/BAAI/AquilaChat2-34B",
+    "Chat models developed by BAAI team",
+)
+register_model_info(
+    ["xDAN-L1-Chat-RL-v1"],
+    "xDAN-L1-Chat",
+    "https://huggingface.co/xDAN-AI/xDAN-L1-Chat-RL-v1",
+    "A large language chat model created by xDAN-AI.",
+)
+register_model_info(
+    ["MetaMath-70B-V1.0", "MetaMath-7B-V1.0"],
+    "MetaMath",
+    "https://huggingface.co/meta-math",
+    "A finetune of Llama2 on [MetaMathQA](https://huggingface.co/datasets/meta-math/MetaMathQA) that specializes in mathematical reasoning.",
+)
+register_model_info(
+    ["Yuan2-2B-hf", "Yuan2-51B-hf", "Yuan2-102B-hf"],
+    "IEIYuan",
+    "https://huggingface.co/IEITYuan",
+    "A Basemodel developed by IEI.",
+)
+register_model_info(
+    [
+        "llava-v1.6-34b",
+        "llava-v1.6-vicuna-13b",
+        "llava-v1.6-vicuna-7b",
+        "llava-v1.6-mistral-7b",
+        "llava-v1.5-13b",
+        "llava-v1.5-7b",
+    ],
+    "LLaVA",
+    "https://github.com/haotian-liu/LLaVA",
+    "an open large language and vision assistant",
+)
+# NOTE(review): "gemma-7b-it" and "gemma-2b-it" are already registered earlier in
+# this file with the same simple name, link and description; this call is redundant.
+register_model_info(
+    ["gemma-7b-it", "gemma-2b-it"],
+    "Gemma",
+    "https://blog.google/technology/developers/gemma-open-models/",
+    "Gemma by Google",
+)
+register_model_info(
+    [
+        "cllm/consistency-llm-7b-codesearchnet",
+        "cllm/consistency-llm-7b-gsm8k",
+        "cllm/consistency-llm-7b-sharegpt48k",
+        "cllm/consistency-llm-7b-spider",
+    ],
+    "consistency-llm",
+    "https://huggingface.co/cllm",
+    "consistency-llm is a new generation of parallel decoder LLMs with fast generation speed.",
+)
+# NOTE(review): "reka-flash" is also registered earlier in this file with a different
+# link; this later call replaces that record, so "reka-flash" resolves to the link
+# below while "reka-flash-online" keeps the earlier one.
+register_model_info(
+    ["reka-flash", "reka-flash-20240226"],
+    "Reka Flash",
+    "https://reka.ai/reka-flash",
+    "Multimodal model by Reka",
+)

src/model/model_xfastertransformer.py ADDED Viewed

	@@ -0,0 +1,81 @@

+import gc
+from threading import Thread
+import torch
+from transformers import TextIteratorStreamer
+@torch.inference_mode()
+def generate_stream_xft(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=8192,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    """Stream generated text from ``model.model.generate`` running in a worker thread.
+
+    Reads ``prompt`` (required), ``repetition_penalty``, ``max_new_tokens`` and
+    ``echo`` from ``params``. ``device``, ``context_len``, ``stream_interval``
+    and ``judge_sent_end`` are accepted but not used by this function.
+
+    Yields:
+        One dict per streamed text chunk with the accumulated ``text``, a
+        ``usage`` dict and ``finish_reason`` set to ``None``, followed by a
+        final dict carrying the stripped text and a ``"length"`` or ``"stop"``
+        finish reason. ``completion_tokens`` is the zero-based index of the
+        latest streamed chunk.
+    """
+    prompt = params["prompt"]
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    # `temperature` and `top_p` are not read from `params` yet; reserved for later use.
+    max_new_tokens = int(params.get("max_new_tokens", 4096))
+    echo = params.get("echo", True)
+
+    prompt_ids = tokenizer(
+        prompt, return_tensors="pt", padding=model.config.padding
+    ).input_ids
+    input_echo_len = len(prompt_ids[0])
+
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        skip_prompt=True,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=True,
+    )
+    worker = Thread(
+        target=model.model.generate,
+        kwargs={
+            "input_ids": prompt_ids,
+            "streamer": streamer,
+            "max_length": max_new_tokens + input_echo_len,
+            "num_beams": model.config.beam_width,
+            # NOTE(review): the repetition penalty is passed as `length_penalty`;
+            # confirm this is intended for the backend's generate() signature.
+            "length_penalty": repetition_penalty,
+            "num_return_sequences": model.config.num_return_sequences,
+            "early_stopping": model.config.early_stopping,
+            "eos_token_id": model.config.eos_token_id,
+            "pad_token_id": model.config.pad_token_id,
+        },
+    )
+    worker.start()
+
+    def _event(text, count, reason):
+        # Build one stream event in the shape consumers of this generator receive.
+        return {
+            "text": text,
+            "usage": {
+                "prompt_tokens": input_echo_len,
+                "completion_tokens": count,
+                "total_tokens": input_echo_len + count,
+            },
+            "finish_reason": reason,
+        }
+
+    # With echo enabled the prompt is kept at the front of the streamed text.
+    output = prompt if echo else ""
+    i = 0
+    for i, new_text in enumerate(streamer):
+        output += new_text
+        yield _event(output, i, None)
+
+    output = output.strip()
+    yield _event(output, i, "length" if i == max_new_tokens - 1 else "stop")
+    gc.collect()

src/model/model_yuan2.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import gc
+from threading import Thread
+from typing import Iterable
+import torch
+import transformers
+from transformers import TextIteratorStreamer, GenerationConfig
+from fastchat.utils import is_partial_stop
+@torch.inference_mode()
+def generate_stream_yuan2(
+    model,
+    tokenizer,
+    params,
+    device,
+    context_len=2048,
+    stream_interval=2,
+    judge_sent_end=False,
+):
+    """Stream generated text from ``model.generate`` running in a worker thread.
+
+    Reads ``prompt`` (required), ``temperature``, ``repetition_penalty``,
+    ``top_p``, ``top_k``, ``max_new_tokens``, ``stop``, ``echo`` and
+    ``stop_token_ids`` from ``params``. ``judge_sent_end`` is accepted but not
+    used by this function.
+
+    Yields:
+        Every ``stream_interval`` chunks, a dict with the accumulated ``text``
+        (cut at a stop string when one is found), a ``usage`` dict and
+        ``finish_reason`` set to ``None``; nothing is yielded while the text
+        ends in a partial stop string. A final dict carries the stripped text
+        and the finish reason.
+
+    Raises:
+        ValueError: If ``params["stop"]`` is neither a string nor an iterable.
+    """
+    prompt = params["prompt"]
+    len_prompt = len(prompt)
+    temperature = float(params.get("temperature", 1))
+    repetition_penalty = float(params.get("repetition_penalty", 1.0))
+    top_p = float(params.get("top_p", 0))
+    top_k = int(params.get("top_k", 1))  # -1 means disable
+    max_new_tokens = int(params.get("max_new_tokens", 512))
+    stop_str = params.get("stop", "<eod>")
+    echo = bool(params.get("echo", True))
+    # Copy before appending so the caller's `params["stop_token_ids"]` list is not mutated.
+    # NOTE(review): these ids are collected but never passed to `model.generate` below.
+    stop_token_ids = list(params.get("stop_token_ids", None) or [])
+    stop_token_ids.append(tokenizer("<eod>")["input_ids"][0])
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    input_ids = inputs["input_ids"]
+    attention_mask = inputs["attention_mask"]
+    max_src_len = context_len - max_new_tokens - 8
+    # With return_tensors="pt" the tokenizer returns 2-D (batch, seq_len) tensors, so
+    # the left truncation must slice the last axis; slicing axis 0 would cut the batch.
+    # A non-positive budget is skipped because `-max_src_len:` would then slice from
+    # the front instead of keeping the tail.
+    if max_src_len > 0:
+        input_ids = input_ids[:, -max_src_len:]  # truncate from the left
+        attention_mask = attention_mask[:, -max_src_len:]  # truncate from the left
+    # Number of prompt tokens (sequence axis), not the batch size.
+    input_echo_len = input_ids.shape[1]
+    decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, **decode_config)
+    generation_config = GenerationConfig(
+        max_new_tokens=max_new_tokens,
+        # Sampling is only switched on for temperatures of 1.2 and above.
+        do_sample=temperature >= 1.2,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        no_repeat_ngram_size=10,
+        top_p=top_p,
+        top_k=top_k,
+    )
+    generation_kwargs = dict(
+        inputs=input_ids,
+        attention_mask=attention_mask,
+        streamer=streamer,
+        generation_config=generation_config,
+    )
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    if echo:
+        # means keep the prompt
+        output = prompt
+    else:
+        output = ""
+    # Defined up front so the final event below still works if the streamer yields nothing.
+    i = 0
+    partially_stopped = False
+    for i, new_text in enumerate(streamer):
+        output += new_text
+        if i % stream_interval == 0:
+            # Only search for stop strings in the generated part, never in the echoed prompt.
+            if echo:
+                rfind_start = len_prompt
+            else:
+                rfind_start = 0
+            partially_stopped = False
+            if stop_str:
+                if isinstance(stop_str, str):
+                    pos = output.rfind(stop_str, rfind_start)
+                    if pos != -1:
+                        output = output[:pos]
+                    else:
+                        partially_stopped = is_partial_stop(output, stop_str)
+                elif isinstance(stop_str, Iterable):
+                    for each_stop in stop_str:
+                        pos = output.rfind(each_stop, rfind_start)
+                        if pos != -1:
+                            output = output[:pos]
+                            break
+                        else:
+                            partially_stopped = is_partial_stop(output, each_stop)
+                            if partially_stopped:
+                                break
+                else:
+                    raise ValueError("Invalid stop field type.")
+            # prevent yielding partial stop sequence
+            if not partially_stopped:
+                yield {
+                    "text": output,
+                    "usage": {
+                        "prompt_tokens": input_echo_len,
+                        "completion_tokens": i,
+                        "total_tokens": input_echo_len + i,
+                    },
+                    "finish_reason": None,
+                }
+    output = output.strip()
+    # finish stream event, which contains finish reason
+    if i == max_new_tokens - 1:
+        finish_reason = "length"
+    elif partially_stopped:
+        finish_reason = None
+    else:
+        finish_reason = "stop"
+    yield {
+        "text": output,
+        "usage": {
+            "prompt_tokens": input_echo_len,
+            "completion_tokens": i,
+            "total_tokens": input_echo_len + i,
+        },
+        "finish_reason": finish_reason,
+    }
+    # clean
+    gc.collect()
+    torch.cuda.empty_cache()
+    if device == "xpu":
+        torch.xpu.empty_cache()
+    if device == "npu":
+        torch.npu.empty_cache()

src/model/monkey_patch_non_inplace.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""
+Monkey patch the llama implementation in the huggingface/transformers library.
+Avoid bugs in mps backend by not using in-place operations.
+"""
+import math
+from typing import List, Optional, Tuple
+import torch
+from torch import nn
+import transformers
+def rotate_half(x):
+    """Swap the two halves of the last dimension of ``x``, negating the half moved to the front.
+
+    Both halves are cloned before being concatenated, so the result is built
+    from copies rather than views of ``x``.
+    """
+    split_at = x.shape[-1] // 2
+    front = x[..., :split_at].clone()
+    back = x[..., split_at:].clone()
+    return torch.cat((-back, front), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    """Apply rotary position embedding to ``q`` and ``k`` without in-place operations.
+
+    ``position_ids`` is expanded to four dimensions and used with
+    ``torch.gather`` to pick, along dim 2, the ``cos``/``sin`` rows for each
+    position. Returns the pair ``(q_embed, k_embed)``.
+    """
+    # [bs, seq_len] -> [bs, 1, seq_len, 1], then widened to match dims 1 and 3 of `cos`.
+    index = position_ids[:, None, :, None]
+    index = index.repeat(1, cos.shape[1], 1, cos.shape[3])
+    batch = index.shape[0]
+    cos_at_pos = torch.gather(cos.repeat(batch, 1, 1, 1), 2, index)
+    sin_at_pos = torch.gather(sin.repeat(batch, 1, 1, 1), 2, index)
+
+    def _rotate(states):
+        # Same formula for queries and keys.
+        return (states * cos_at_pos) + (rotate_half(states) * sin_at_pos)
+
+    return _rotate(q), _rotate(k)
+def forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    padding_mask: Optional[torch.LongTensor] = None,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Attention forward pass installed in place of ``LlamaAttention.forward``.
+
+    Args:
+        hidden_states: Input whose ``size()`` unpacks to ``(bsz, q_len, _)``.
+        attention_mask: Optional additive mask of size ``(bsz, 1, q_len, kv_seq_len)``.
+        position_ids: Positions used to select the rotary cos/sin values.
+        past_key_value: Optional cached ``(key, value)`` pair; new keys and
+            values are concatenated after it along dim 2.
+        output_attentions: When False, the returned attention weights are ``None``.
+        use_cache: When True, the concatenated ``(key, value)`` pair is returned.
+        padding_mask: Accepted but not used by this function.
+
+    Returns:
+        ``(attn_output, attn_weights or None, past_key_value or None)``.
+
+    Raises:
+        ValueError: If the attention weights, the attention mask or the
+            attention output do not have the expected size.
+    """
+    bsz, q_len, _ = hidden_states.size()
+    # Project, then reshape to [bsz, num_heads, q_len, head_dim].
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    # Total key/value length includes any cached positions.
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+    # [bsz, nh, t, hd]
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+    past_key_value = (key_states, value_states) if use_cache else None
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(
+        self.head_dim
+    )
+    if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+        # The message reports the same 4-D shape that the check above compares against.
+        raise ValueError(
+            f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+            f" {attn_weights.size()}"
+        )
+    if attention_mask is not None:
+        if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+            )
+        attn_weights = attn_weights + attention_mask
+        # Clamp from below at the dtype's minimum finite value after adding the mask.
+        attn_weights = torch.max(
+            attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
+        )
+    # upcast attention to fp32
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
+        query_states.dtype
+    )
+    attn_output = torch.matmul(attn_weights, value_states)
+    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+        raise ValueError(
+            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+            f" {attn_output.size()}"
+        )
+    # Back to [bsz, q_len, hidden_size] before the output projection.
+    attn_output = attn_output.transpose(1, 2)
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+    attn_output = self.o_proj(attn_output)
+    if not output_attentions:
+        attn_weights = None
+    return attn_output, attn_weights, past_key_value
+def replace_llama_attn_with_non_inplace_operations():
+    """Avoid bugs in mps backend by not using in-place operations."""
+    transformers.models.llama.modeling_llama.LlamaAttention.forward = forward

src/model/rwkv_model.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import os
+from types import SimpleNamespace
+import warnings
+import torch
+# NOTE(review): these flags are set before the `rwkv` imports below, presumably
+# because the package reads them at import time — keep this ordering.
+os.environ["RWKV_JIT_ON"] = "1"
+os.environ["RWKV_CUDA_ON"] = "1"
+from rwkv.model import RWKV
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+class RwkvModel:
+    def __init__(self, model_path):
+        warnings.warn(
+            "Experimental support. Please use ChatRWKV if you want to chat with RWKV"
+        )
+        self.config = SimpleNamespace(is_encoder_decoder=False)
+        self.model = RWKV(model=model_path, strategy="cuda fp16")
+        # two GPUs
+        # self.model = RWKV(model=model_path, strategy="cuda:0 fp16 *20 -> cuda:1 fp16")
+        self.tokenizer = None
+        self.model_path = model_path
+    def to(self, target):
+        assert target == "cuda"
+    def __call__(self, input_ids, use_cache, past_key_values=None):
+        assert use_cache == True
+        input_ids = input_ids[0].detach().cpu().numpy()
+        # print(input_ids)
+        logits, state = self.model.forward(input_ids, past_key_values)
+        # print(logits)
+        logits = logits.unsqueeze(0).unsqueeze(0)
+        out = SimpleNamespace(logits=logits, past_key_values=state)
+        return out
+    def generate(
+        self, input_ids, do_sample, temperature, max_new_tokens, repetition_penalty=1.0
+    ):
+        # This function is used by fastchat.llm_judge.
+        # Because RWKV does not support huggingface generation API,
+        # we reuse fastchat.serve.inference.generate_stream as a workaround.
+        from transformers import AutoTokenizer
+        from fastchat.serve.inference import generate_stream
+        from fastchat.conversation import get_conv_template
+        if self.tokenizer is None:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                "EleutherAI/pythia-160m", use_fast=True
+            )
+        prompt = self.tokenizer.decode(input_ids[0].tolist())
+        conv = get_conv_template("rwkv")
+        gen_params = {
+            "model": self.model_path,
+            "prompt": prompt,
+            "temperature": temperature,
+            "repetition_penalty": repetition_penalty,
+            "max_new_tokens": max_new_tokens,
+            "stop": conv.stop_str,
+            "stop_token_ids": conv.stop_token_ids,
+            "echo": False,
+        }
+        res_iter = generate_stream(self, self.tokenizer, gen_params, "cuda")
+        for res in res_iter:
+            pass
+        output = res["text"]
+        output_ids = self.tokenizer.encode(output)
+        return [input_ids[0].tolist() + output_ids]

src/model/upload_hub.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""
+Upload weights to huggingface.
+Usage:
+python3 -m fastchat.model.upload_hub --model-path ~/model_weights/vicuna-13b --hub-repo-id lmsys/vicuna-13b-v1.3
+"""
+import argparse
+import tempfile
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+def upload_hub(model_path, hub_repo_id, component, private):
+    if component == "all":
+        components = ["model", "tokenizer"]
+    else:
+        components = [component]
+    kwargs = {"push_to_hub": True, "repo_id": hub_repo_id, "private": args.private}
+    if "model" in components:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+        )
+        with tempfile.TemporaryDirectory() as tmp_path:
+            model.save_pretrained(tmp_path, **kwargs)
+    if "tokenizer" in components:
+        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+        with tempfile.TemporaryDirectory() as tmp_path:
+            tokenizer.save_pretrained(tmp_path, **kwargs)
+if __name__ == "__main__":
+    # Command-line entry point: parse the options and run the upload.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, required=True)
+    parser.add_argument("--hub-repo-id", type=str, required=True)
+    parser.add_argument(
+        "--component", type=str, choices=["all", "model", "tokenizer"], default="all"
+    )
+    parser.add_argument("--private", action="store_true")
+    # NOTE: this block is not a function, so `args` is bound at module level
+    # when the file is run as a script.
+    args = parser.parse_args()
+    upload_hub(args.model_path, args.hub_repo_id, args.component, args.private)

src/modules/__init__.py ADDED Viewed

File without changes

src/modules/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (190 Bytes). View file

src/modules/__pycache__/awq.cpython-310.pyc ADDED Viewed

Binary file (2.87 kB). View file

src/modules/__pycache__/exllama.cpython-310.pyc ADDED Viewed

Binary file (1.81 kB). View file

src/modules/__pycache__/gptq.cpython-310.pyc ADDED Viewed

Binary file (2.25 kB). View file

src/modules/__pycache__/xfastertransformer.cpython-310.pyc ADDED Viewed

Binary file (1.78 kB). View file

src/modules/awq.py ADDED Viewed

	@@ -0,0 +1,85 @@

+from dataclasses import dataclass, field
+from pathlib import Path
+import sys
+import torch
+from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, modeling_utils
+@dataclass
+class AWQConfig:
+    """Options describing an AWQ-quantized checkpoint to load."""
+    # Path to the checkpoint file, or to a directory that find_awq_ckpt() searches.
+    # NOTE(review): annotated `str` but defaults to None.
+    ckpt: str = field(
+        default=None,
+        metadata={
+            "help": "Load quantized model. The path to the local AWQ checkpoint."
+        },
+    )
+    # Quantization bit width; defaults to 16.
+    wbits: int = field(default=16, metadata={"help": "#bits to use for quantization"})
+    # Quantization group size; -1 (the default) means the full row.
+    groupsize: int = field(
+        default=-1,
+        metadata={"help": "Groupsize to use for quantization; default uses full row."},
+    )
+def load_awq_quantized(model_name, awq_config: AWQConfig, device):
+    print("Loading AWQ quantized model...")
+    try:
+        from tinychat.utils import load_quant
+        from tinychat.modules import make_quant_norm, make_quant_attn, make_fused_mlp
+    except ImportError as e:
+        print(f"Error: Failed to import tinychat. {e}")
+        print("Please double check if you have successfully installed AWQ")
+        print("See https://github.com/lm-sys/FastChat/blob/main/docs/awq.md")
+        sys.exit(-1)
+    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name, use_fast=False, trust_remote_code=True
+    )
+    def skip(*args, **kwargs):
+        pass
+    torch.nn.init.kaiming_uniform_ = skip
+    torch.nn.init.kaiming_normal_ = skip
+    torch.nn.init.uniform_ = skip
+    torch.nn.init.normal_ = skip
+    modeling_utils._init_weights = False
+    torch.set_default_dtype(torch.half)
+    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
+    if any(name in find_awq_ckpt(awq_config) for name in ["llama", "vicuna"]):
+        model = load_quant.load_awq_llama_fast(
+            model,
+            find_awq_ckpt(awq_config),
+            awq_config.wbits,
+            awq_config.groupsize,
+            device,
+        )
+        make_quant_attn(model, device)
+        make_quant_norm(model)
+        make_fused_mlp(model)
+    else:
+        model = load_quant.load_awq_model(
+            model,
+            find_awq_ckpt(awq_config),
+            awq_config.wbits,
+            awq_config.groupsize,
+            device,
+        )
+    return model, tokenizer
+def find_awq_ckpt(awq_config: AWQConfig):
+    if Path(awq_config.ckpt).is_file():
+        return awq_config.ckpt
+    for ext in ["*.pt", "*.safetensors"]:
+        matched_result = sorted(Path(awq_config.ckpt).glob(ext))
+        if len(matched_result) > 0:
+            return str(matched_result[-1])
+    print("Error: AWQ checkpoint not found")
+    sys.exit(1)