import json
import textwrap
from typing import Any, Iterator, overload

import tiktoken

from neollm.types import Function
from neollm.utils.utils import cprint  # , Functions, Messages

DEFAULT_MODEL_NAME = "gpt-3.5-turbo"


def get_tokenizer(model_name: str) -> tiktoken.Encoding:
    # Reference: https://platform.openai.com/docs/models/gpt-3-5
    MODEL_NAME_MAP = [
        ("gpt-4o", "gpt-4o-2024-05-13"),  # must precede the broader "gpt-4" prefix match
        ("gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613"),
        ("gpt-3.5-turbo", "gpt-3.5-turbo-0613"),
        ("gpt-4-32k", "gpt-4-32k-0613"),
        ("gpt-4", "gpt-4-0613"),
    ]
    ALL_VERSION_MODELS = [
        # gpt-3.5-turbo
        "gpt-3.5-turbo-0125",
        "gpt-3.5-turbo-1106",
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-3.5-turbo-0301",  # Legacy
        # gpt-4
        "gpt-4o-2024-05-13",
        "gpt-4-0125-preview",
        "gpt-4-1106-preview",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        "gpt-4-0314",  # Legacy
        "gpt-4-32k-0314",  # Legacy
    ]
    # Normalize Azure-style names ("gpt-35") to OpenAI-style ("gpt-3.5")
    model_name = model_name.replace("gpt-35", "gpt-3.5")
    # Resolve alias names to their pinned versions; this also catches new and fine-tuned models
    if model_name not in ALL_VERSION_MODELS:
        for key, model_name_version in MODEL_NAME_MAP:
            if key in model_name:
                model_name = model_name_version
                break
    try:
        return tiktoken.encoding_for_model(model_name)
    except Exception as e:
        cprint(f"WARNING: failed to get a tokenizer for {model_name}: {e}", color="yellow", background=True)
        return tiktoken.encoding_for_model("gpt-3.5-turbo")
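

# A minimal usage sketch (hedged; assumes tiktoken is installed): Azure-style and
# OpenAI-style names resolve to the same encoding, and unknown names fall back to
# the gpt-3.5-turbo encoding with a warning.
#
#   get_tokenizer("gpt-35-turbo").name  # same encoding as get_tokenizer("gpt-3.5-turbo")
#   get_tokenizer("unknown-model")      # warns, then returns the gpt-3.5-turbo encoding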


@overload
def count_tokens(messages: str, model_name: str | None = None) -> int: ...


@overload
def count_tokens(
    messages: Iterator[dict[str, str]], model_name: str | None = None, functions: Any | None = None
) -> int: ...


def count_tokens(
    messages: Iterator[dict[str, str]] | str,
    model_name: str | None = None,
    functions: Any | None = None,
) -> int:
    if isinstance(messages, str):
        tokenizer = get_tokenizer(model_name or DEFAULT_MODEL_NAME)
        encoded = tokenizer.encode(messages)
        return len(encoded)
    return _count_messages_and_function_tokens(messages, model_name, functions)
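

# Example (a hedged sketch): the str overload counts raw text tokens only, while the
# messages overload also adds the per-message ChatML overhead, so it is always larger.
# With the cl100k_base encoding, "Hello world" is 2 tokens; wrapping it in a single
# user message adds 3 reply-priming tokens, 3 per-message tokens, and 1 role token.
#
#   count_tokens("Hello world")                                 # -> 2
#   count_tokens([{"role": "user", "content": "Hello world"}])  # -> 9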


def _count_messages_and_function_tokens(
    messages: Iterator[dict[str, str]], model_name: str | None = None, functions: Any | None = None
) -> int:
    """Count the tokens for a chat-completion request.

    Args:
        messages (Messages): messages to be sent to the GPT API
        model_name (str | None, optional): model name. Defaults to None.
        functions (Functions | None, optional): functions to be sent to the GPT API. Defaults to None.

    Returns:
        int: number of tokens
    """
    num_tokens = _count_messages_tokens(messages, model_name or DEFAULT_MODEL_NAME)
    if functions is not None:
        num_tokens += _count_functions_tokens(functions, model_name)
    return num_tokens


# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def _count_messages_tokens(messages: Iterator[dict[str, str]] | None, model_name: str) -> int:
    """Count the tokens in a list of chat messages.

    Args:
        messages (Messages): messages to be sent to a chat API such as ChatGPT
        model_name (str, optional): name of the model to use, e.g.
            "gpt-3.5-turbo-0613", "gpt-3.5-turbo-16k-0613", "gpt-4-0314", "gpt-4-32k-0314",
            "gpt-4-0613", "gpt-4-32k-0613", "gpt-3.5-turbo", "gpt-4"

    Returns:
        int: total number of tokens
    """
    if messages is None:
        return 0
    # setting model
    encoding_model = get_tokenizer(model_name)
    # config
    if "gpt-3.5-turbo-0301" in model_name:
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    else:
        tokens_per_message = 3
        tokens_per_name = 1
    # count tokens
    num_tokens = 3  # every reply is primed with <|start|>assistant<|message|>
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            if isinstance(value, str):
                num_tokens += len(encoding_model.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    return num_tokens
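

# Worked example (a hedged sketch, cl100k_base): for
#   [{"role": "system", "name": "bot", "content": "Hi"}]
# the count is 3 (reply priming) + 3 (per message) + 1 ("system") + 1 ("bot")
# + 1 (name bonus) + 1 ("Hi") = 10.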


# https://gist.github.com/CGamesPlay/dd4f108f27e2eec145eedf5c717318f5
def _count_functions_tokens(functions: Any, model_name: str | None = None) -> int:
    """Count the tokens consumed by function definitions.

    Args:
        functions (Functions): functions to be sent to the GPT API
        model_name (str | None, optional): model name. Defaults to None.

    Returns:
        int: number of tokens
    """
    encoding_model = get_tokenizer(model_name or DEFAULT_MODEL_NAME)
    num_tokens = 3 + len(encoding_model.encode(__functions2string(functions)))
    return num_tokens
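

# Example (a hedged sketch): the function token cost is estimated by rendering all
# functions to the TypeScript-like namespace string below and encoding that string.
#
#   demo = [{"name": "ping", "description": "Ping", "parameters": {"type": "object", "properties": {}}}]
#   _count_functions_tokens(demo)  # 3 + len(encode(rendered string))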


# Helpers for rendering functions as a string --------------------------------------------------------------
def __functions2string(functions: Any) -> str:
    """Render functions as a single string.

    Args:
        functions (Functions): functions to be sent to the GPT API

    Returns:
        str: string representation of the functions
    """
    prefix = "# Tools\n\n## functions\n\nnamespace functions {\n\n"
    suffix = "} // namespace functions\n"
    functions_string = prefix + "".join(__function2string(function) for function in functions) + suffix
    return functions_string


def __function2string(function: Function) -> str:
    """Render a single function as a string.

    Args:
        function (Function): one element of the GPT API functions

    Returns:
        str: string representation of the function
    """
    object_string = __format_object(function["parameters"])
    if object_string is not None:
        object_string = "_: " + object_string
    else:
        object_string = ""
    functions_string: str = (
        f"// {function['description']}\ntype {function['name']} = (" + object_string + ") => any;\n\n"
    )
    return functions_string
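

# Example (a sketch): a function such as
#   {"name": "get_weather", "description": "Get the weather",
#    "parameters": {"type": "object", "properties": {"city": {"type": "string"}}}}
# renders as:
#   // Get the weather
#   type get_weather = (_: {
#   city?: string,
#   }) => any;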


def __format_object(schema: dict[str, Any], indent: int = 0) -> str | None:
    if "properties" not in schema or len(schema["properties"]) == 0:
        if schema.get("additionalProperties", False):
            return "object"
        return None
    result = "{\n"
    for key, value in dict(schema["properties"]).items():
        # value <- resolve_ref(value)
        value_rendered = __format_schema(value, indent + 1)
        if value_rendered is None:
            continue
        # description
        description = ""  # no comment lines when the property has no description
        if "description" in value:
            description = "".join(
                " " * indent + f"// {description_i}\n"
                for description_i in textwrap.dedent(value["description"]).strip().split("\n")
            )
        # optional
        optional = "" if key in schema.get("required", []) else "?"
        # default
        default_comment = "" if "default" not in value else f" // default: {__format_default(value)}"
        # add string
        result += description + " " * indent + f"{key}{optional}: {value_rendered},{default_comment}\n"
    result += (" " * (indent - 1)) + "}"
    return result


# Purpose unclear; kept for reference:
# def resolve_ref(schema):
#     if schema.get("$ref") is not None:
#         ref = schema["$ref"][14:]
#         schema = json_schema["definitions"][ref]
#     return schema


def __format_schema(schema: dict[str, Any], indent: int) -> str | None:
    # schema <- resolve_ref(schema)
    if "enum" in schema:
        return __format_enum(schema)
    elif schema["type"] == "object":
        return __format_object(schema, indent)
    elif schema["type"] in {"integer", "number"}:
        return "number"
    elif schema["type"] in {"string"}:
        return "string"
    elif schema["type"] == "array":
        return str(__format_schema(schema["items"], indent)) + "[]"
    else:
        raise ValueError("unknown schema type " + schema["type"])


def __format_enum(schema: dict[str, Any]) -> str:
    # "A" | "B" | "C"
    return " | ".join(json.dumps(element, ensure_ascii=False) for element in schema["enum"])


def __format_default(schema: dict[str, Any]) -> str:
    default = schema["default"]
    if schema["type"] == "number" and float(default).is_integer():
        # for number types, render integral defaults as floats: 0 -> 0.0
        return f"{default:.1f}"
    else:
        return str(default)
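

if __name__ == "__main__":
    # Minimal smoke test (a hedged sketch; the message/function shapes are
    # illustrative and assume tiktoken is installed).
    demo_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]
    demo_functions = [
        {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {
                    "city": {"type": "string", "description": "City name"},
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"], "default": "celsius"},
                },
                "required": ["city"],
            },
        }
    ]
    print(count_tokens("Hello!"))  # plain text: raw token count only
    print(count_tokens(demo_messages))  # messages: includes per-message overhead
    print(count_tokens(demo_messages, functions=demo_functions))  # plus function definitions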