"""Catalogue of chat models together with their pricing metadata."""
import os
from dataclasses import dataclass
from typing import Optional


def _format_dollars(amount):
    """Render a dollar amount with two significant digits.

    ``.2g`` switches to scientific notation once the rounded value reaches
    100 (e.g. ``128`` -> ``1.3e+02``), which is unreadable as a price, so
    amounts of one dollar or more that would get an exponent are rendered as
    whole dollars instead. Smaller values keep the plain ``.2g`` output.
    """
    text = f"{amount:.2g}"
    if "e" in text and abs(amount) >= 1:
        return f"{amount:.0f}"
    return text


@dataclass(unsafe_hash=True)
class Model:
    """A model entry: how it is displayed, how it is addressed, and its cost.

    Pricing can be given as a free-form ``cost_description``, as an
    ``hourly_cost``, as one ``cost_per_million_tokens`` figure, or as separate
    input/output figures. When only the combined per-token figure is given,
    ``__post_init__`` copies it into the input and output fields.
    """

    model_display_name: str
    model_name: str
    # None for every entry except those whose URL is read from the environment.
    api_url: Optional[str]
    provider: str
    hourly_cost: Optional[float] = None
    cost_description: Optional[str] = None
    # presumably refers to function/tool calling support -- confirm with callers
    supports_functions: bool = False
    size_billion_parameters: Optional[float] = None  # in billion parameters
    cost_per_million_tokens: Optional[float] = None
    cost_per_million_input_tokens: Optional[float] = None
    cost_per_million_output_tokens: Optional[float] = None
    input_size: Optional[int] = None

    def __post_init__(self):
        """Fill missing input/output prices from the combined per-token price."""
        # NOTE: ``or`` treats an explicit 0 the same as "not set".
        self.cost_per_million_input_tokens = (
            self.cost_per_million_input_tokens or self.cost_per_million_tokens
        )
        self.cost_per_million_output_tokens = (
            self.cost_per_million_output_tokens or self.cost_per_million_tokens
        )

    @property
    def cost(self) -> Optional[str]:
        """Return a human-readable price string, or None if no price is set.

        Precedence: ``cost_description``, then ``hourly_cost``, then
        ``cost_per_million_tokens``, then the input/output pair.
        """
        if self.cost_description:
            return self.cost_description
        if self.hourly_cost:
            return f"${_format_dollars(self.hourly_cost)} / hour"
        if self.cost_per_million_tokens:
            return f"${_format_dollars(self.cost_per_million_tokens)} / 1M tokens"
        if self.cost_per_million_input_tokens and self.cost_per_million_output_tokens:
            input_price = _format_dollars(self.cost_per_million_input_tokens)
            output_price = _format_dollars(self.cost_per_million_output_tokens)
            return (
                f"${input_price} / 1M input tokens, "
                f"${output_price} / 1M output tokens"
            )
        return None


# Shorthand for reading optional settings; yields None when a variable is unset.
env = os.environ.get

MODELS = [
    # source: https://openai.com/pricing
    # converted costs from dollar/1K tokens to dollar/1M for readability and together_ai comparability
    Model(
        "gpt-3.5-turbo",
        "gpt-3.5-turbo",
        None,
        "OpenAI",
        supports_functions=True,
        cost_per_million_input_tokens=1,
        cost_per_million_output_tokens=2,
        # https://learn.microsoft.com/en-us/answers/questions/1356487/what-is-the-exact-maximum-input-tokens-of-azure-gp
        input_size=4096,
    ),
    Model(
        "gpt-4-turbo",
        "gpt-4-1106-preview",
        None,
        "OpenAI",
        supports_functions=True,
        cost_per_million_input_tokens=10,
        cost_per_million_output_tokens=30,
        # https://writesonic.com/blog/gpt-4-turbo-vs-gpt-4
        input_size=128_000,
    ),
    Model(
        "gpt-4",
        "gpt-4",
        None,
        "OpenAI",
        supports_functions=True,
        cost_per_million_input_tokens=30,
        cost_per_million_output_tokens=60,
        input_size=32_000,
    ),
    # source: https://www.together.ai/pricing
    Model(
        "llama-2-70b-chat",
        "together_ai/togethercomputer/llama-2-70b-chat",
        None,
        "Together AI",
        cost_per_million_tokens=0.9,
        size_billion_parameters=70,
        # https://github.com/facebookresearch/llama/issues/148
        input_size=2048,
    ),
    Model(
        "Mixtral-8x7B-Instruct-v0.1",
        "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1",
        None,
        "Together AI",
        size_billion_parameters=8 * 7,
        cost_per_million_tokens=0.9,
    ),
    # taken from endpoint pages
    Model(
        "zephyr-7b-beta",
        "huggingface/HuggingFaceH4/zephyr-7b-beta",
        env("ZEPHYR_7B_BETA_URL"),
        "Hugging Face Inference Endpoint",
        hourly_cost=1.30,
        size_billion_parameters=7,
    ),
    Model(
        "Mistral-7B-Instruct-v0.2",
        "huggingface/mistralai/Mistral-7B-Instruct-v0.2",
        env("MISTRAL_7B_BETA_URL"),
        "Hugging Face Inference Endpoint",
        hourly_cost=1.30,
        size_billion_parameters=7,
    ),
    Model(
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "huggingface/TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        env("TINY_LLAMA_URL"),
        "Hugging Face Inference Endpoint",
        hourly_cost=0.60,
        size_billion_parameters=1.1,
    ),
    Model(
        "gemini-pro",
        "gemini-pro",
        None,
        "Google VertexAI",
        # https://ai.google.dev/pricing
        cost_description="$0.25 / 1M input characters, $0.5 / 1M output characters (60 queries per minute are free)",
        cost_per_million_input_tokens=0.25,
        cost_per_million_output_tokens=0.5,
    ),
    Model(
        "chat-bison (PaLM 2)",
        "chat-bison",
        None,
        "Google VertexAI",
        # https://cloud.google.com/vertex-ai/docs/generative-ai/pricing
        cost_per_million_input_tokens=0.25,
        cost_per_million_output_tokens=0.5,
        # https://ai.google.dev/models/palm
        input_size=8196,
    ),
    Model(
        "chat-bison-32k (PaLM 2 32K)",
        "chat-bison-32k",
        None,
        "Google VertexAI",
        # https://cloud.google.com/vertex-ai/docs/generative-ai/pricing
        cost_per_million_input_tokens=0.25,
        cost_per_million_output_tokens=0.5,
    ),
]

# Only the entries whose model_name is listed here remain in the exported MODELS.
_ENABLED_MODEL_NAMES = frozenset(
    {
        "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1",
        "huggingface/HuggingFaceH4/zephyr-7b-beta",
    }
)
MODELS = [model for model in MODELS if model.model_name in _ENABLED_MODEL_NAMES]