# What this tests?
## Tests /chat/completions by generating a key and then making a chat completions request

import pytest
import asyncio
import aiohttp
import openai
from openai import OpenAI, AsyncOpenAI, AzureOpenAI, AsyncAzureOpenAI
from typing import Optional, List, Union
import uuid

LITELLM_MASTER_KEY = "sk-1234"
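
# These tests assume a LiteLLM proxy running locally on http://0.0.0.0:4000,
# started with LITELLM_MASTER_KEY as its master key (see the
# proxy_server_config.yaml references in the test docstrings below).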


def response_header_check(response):
    """
    - assert if response headers < 4kb (nginx limit).
    """
    headers_size = sum(len(k) + len(v) for k, v in response.raw_headers)
    assert headers_size < 4096, "Response headers exceed the 4kb limit"
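
# NOTE: 4096 bytes is a conservative ceiling assumed here; the actual nginx
# header limit depends on its buffer configuration, not a fixed protocol value.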


async def generate_key(
    session,
    models=[
        "gpt-4",
        "text-embedding-ada-002",
        "dall-e-2",
        "fake-openai-endpoint-2",
        "mistral-embed",
    ],
):
    url = "http://0.0.0.0:4000/key/generate"
    headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}
    data = {
        "models": models,
        "duration": None,
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            raise Exception(f"Request did not return a 200 status code: {status}")

        response_header_check(response)  # calling the function to check response headers

        return await response.json()
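
# Illustrative /key/generate response shape; only the "key" field is relied on
# by these tests, the rest is an assumption:
# {"key": "sk-...", "expires": null, ...}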


async def new_user(session):
    url = "http://0.0.0.0:4000/user/new"
    headers = {"Authorization": "Bearer sk-1234", "Content-Type": "application/json"}
    data = {
        "models": ["gpt-4", "text-embedding-ada-002", "dall-e-2"],
        "duration": None,
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            raise Exception(f"Request did not return a 200 status code: {status}")

        response_header_check(response)  # calling the function to check response headers

        return await response.json()


async def moderation(session, key):
    url = "http://0.0.0.0:4000/moderations"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {"input": "I want to kill the cat."}

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            raise Exception(f"Request did not return a 200 status code: {status}")

        return await response.json()


async def chat_completion(session, key, model: Union[str, List] = "gpt-4"):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Hello! {uuid.uuid4()}"},
        ],
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            raise Exception(
                f"Request did not return a 200 status code: {status}, response text={response_text}"
            )

        response_header_check(response)  # calling the function to check response headers

        return await response.json()


async def queue_chat_completion(
    session, key, priority: int, model: Union[str, List] = "gpt-4"
):
    url = "http://0.0.0.0:4000/queue/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello!"},
        ],
        "priority": priority,
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            raise Exception(f"Request did not return a 200 status code: {status}")

        return response.raw_headers


async def chat_completion_with_headers(session, key, model="gpt-4"):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello!"},
        ],
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            raise Exception(f"Request did not return a 200 status code: {status}")

        response_header_check(response)  # calling the function to check response headers

        # raw_headers is a tuple of (bytes, bytes) pairs,
        # e.g. ((b'date', b'Fri, 19 Apr 2024 21:17:29 GMT'), ...)
        raw_headers_json = {}
        for item in response.raw_headers:
            raw_headers_json[item[0].decode("utf-8")] = item[1].decode("utf-8")

        return raw_headers_json
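
# The returned header dict includes litellm-specific entries such as
# "x-litellm-model-id", which test_chat_completion_different_deployments uses
# below to verify that requests are routed across deployments.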


async def chat_completion_with_model_from_route(session, key, route):
    url = "http://0.0.0.0:4000/chat/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    # NOTE: the request/response handling of this helper was truncated in the
    # source; the lines below are a minimal reconstruction that mirrors
    # chat_completion(), using `route` as the model name.
    data = {"model": route, "messages": [{"role": "user", "content": "Hello!"}]}

    async with session.post(url, headers=headers, json=data) as response:
        if response.status != 200:
            raise Exception(
                f"Request did not return a 200 status code: {response.status}"
            )
        return await response.json()


async def completion(session, key):
    url = "http://0.0.0.0:4000/completions"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {"model": "gpt-4", "prompt": "Hello!"}

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status

        if status != 200:
            raise Exception(f"Request did not return a 200 status code: {status}")

        response_header_check(response)  # calling the function to check response headers

        return await response.json()


async def embeddings(session, key, model="text-embedding-ada-002"):
    url = "http://0.0.0.0:4000/embeddings"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": model,
        "input": ["hello world"],
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)

        if status != 200:
            raise Exception(f"Request did not return a 200 status code: {status}")

        response_header_check(response)  # calling the function to check response headers
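
# NOTE: embeddings() intentionally returns nothing; callers only rely on the
# status-code and response-header checks above.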


async def image_generation(session, key):
    url = "http://0.0.0.0:4000/images/generations"
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json",
    }
    data = {
        "model": "dall-e-2",
        "prompt": "A cute baby sea otter",
    }

    async with session.post(url, headers=headers, json=data) as response:
        status = response.status
        response_text = await response.text()

        print(response_text)
        print()

        if status != 200:
            if "Connection error" in response_text:
                # OpenAI endpoint returns a connection error
                return
            raise Exception(f"Request did not return a 200 status code: {status}")

        response_header_check(response)  # calling the function to check response headers


@pytest.mark.asyncio
async def test_chat_completion():
    """
    - Create key with access to gpt-3.5-turbo only
    - Call gpt-4 through the Azure client -> expect an AuthenticationError
    """
    async with aiohttp.ClientSession() as session:
        key_gen = await generate_key(session=session, models=["gpt-3.5-turbo"])
        azure_client = AsyncAzureOpenAI(
            azure_endpoint="http://0.0.0.0:4000",
            azure_deployment="random-model",
            api_key=key_gen["key"],
            api_version="2024-02-15-preview",
        )
        with pytest.raises(openai.AuthenticationError) as e:
            response = await azure_client.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": "Hello!"}],
            )
        assert "key not allowed to access model." in str(e)


@pytest.mark.asyncio
async def test_chat_completion_ratelimit():
    """
    - call model with rpm 1
    - make 2 parallel calls
    - make sure 1 fails
    """
    async with aiohttp.ClientSession() as session:
        # key_gen = await generate_key(session=session)
        key = "sk-1234"
        tasks = []
        tasks.append(
            chat_completion(session=session, key=key, model="fake-openai-endpoint-2")
        )
        tasks.append(
            chat_completion(session=session, key=key, model="fake-openai-endpoint-2")
        )
        try:
            await asyncio.gather(*tasks)
            pytest.fail("Expected at least 1 call to fail")
        except Exception as e:
            if "Request did not return a 200 status code: 429" in str(e):
                pass
            else:
                pytest.fail(f"Wrong error received - {str(e)}")


@pytest.mark.asyncio
async def test_chat_completion_different_deployments():
    """
    - call model group with 2 deployments
    - make 20 calls
    - expect at least 2 unique deployments to be hit
    """
    async with aiohttp.ClientSession() as session:
        # key_gen = await generate_key(session=session)
        key = "sk-1234"
        results = []
        for _ in range(20):
            results.append(
                await chat_completion_with_headers(
                    session=session, key=key, model="fake-openai-endpoint-3"
                )
            )
        try:
            print(f"results: {results}")
            init_model_id = results[0]["x-litellm-model-id"]
            deployments_shuffled = False
            for result in results[1:]:
                if init_model_id != result["x-litellm-model-id"]:
                    deployments_shuffled = True
            if not deployments_shuffled:
                pytest.fail("Expected at least 1 shuffled call")
        except Exception as e:
            pass


@pytest.mark.asyncio
async def test_chat_completion_streaming():
    """
    [PROD Test] Streams a chat completion with logprobs enabled and ensures the
    stream completes without error
    """
    client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

    response = await client.chat.completions.create(
        model="gpt-3.5-turbo-large",
        messages=[{"role": "user", "content": "Hello!"}],
        logprobs=True,
        top_logprobs=2,
        stream=True,
    )

    response_str = ""
    async for chunk in response:
        response_str += chunk.choices[0].delta.content or ""
    print(f"response_str: {response_str}")


@pytest.mark.asyncio
async def test_completion_streaming_usage_metrics():
    """
    [PROD Test] Ensures usage metrics are returned correctly when `include_usage` is set to `True`
    """
    client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")

    response = await client.completions.create(
        model="gpt-instruct",
        prompt="hey",
        stream=True,
        stream_options={"include_usage": True},
        max_tokens=4,
        temperature=0.00000001,
    )

    last_chunk = None
    async for chunk in response:
        print("chunk", chunk)
        last_chunk = chunk

    assert last_chunk is not None, "No chunks were received"
    assert last_chunk.usage is not None, "Usage information was not received"
    assert last_chunk.usage.prompt_tokens > 0, "Prompt tokens should be greater than 0"
    assert (
        last_chunk.usage.completion_tokens > 0
    ), "Completion tokens should be greater than 0"
    assert last_chunk.usage.total_tokens > 0, "Total tokens should be greater than 0"


@pytest.mark.asyncio
async def test_chat_completion_anthropic_structured_output():
    """
    Ensure nested pydantic output is returned correctly
    """
    from pydantic import BaseModel

    class CalendarEvent(BaseModel):
        name: str
        date: str
        participants: list[str]

    class EventsList(BaseModel):
        events: list[CalendarEvent]

    messages = [
        {"role": "user", "content": "List 5 important events in the XIX century"}
    ]

    client = AsyncOpenAI(api_key="sk-1234", base_url="http://0.0.0.0:4000")
    res = await client.beta.chat.completions.parse(
        model="bedrock/us.anthropic.claude-3-sonnet-20240229-v1:0",
        messages=messages,
        response_format=EventsList,
        timeout=60,
    )
    message = res.choices[0].message
    if message.parsed:
        print(message.parsed.events)
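
# message.parsed is the EventsList instance validated by the OpenAI SDK's
# beta parse() helper, so .events is a list of CalendarEvent objects.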


@pytest.mark.asyncio
async def test_chat_completion_old_key():
    """
    Production test for backwards compatibility. Tests the db against a
    pre-generated (old) key.
    - Use old key
    - Make chat completion call
    """
    async with aiohttp.ClientSession() as session:
        try:
            key = "sk--W0Ph0uDZLVD7V7LQVrslg"
            await chat_completion(session=session, key=key)
        except Exception as e:
            pytest.fail(f"Invalid api key: {e}")


@pytest.mark.asyncio
async def test_completion():
    """
    - Create key
    - Make completions call
    - Create user
    - Make completions call
    """
    async with aiohttp.ClientSession() as session:
        key_gen = await generate_key(session=session)
        key = key_gen["key"]
        await completion(session=session, key=key)
        key_gen = await new_user(session=session)
        key_2 = key_gen["key"]
        # response = await completion(session=session, key=key_2)

        ## validate openai format ##
        client = OpenAI(api_key=key_2, base_url="http://0.0.0.0:4000")

        client.completions.create(
            model="gpt-4",
            prompt="Say this is a test",
            max_tokens=7,
            temperature=0,
        )


@pytest.mark.asyncio
async def test_embeddings():
    """
    - Create key
    - Make embeddings call
    - Create user
    - Make embeddings call
    """
    async with aiohttp.ClientSession() as session:
        key_gen = await generate_key(session=session)
        key = key_gen["key"]
        await embeddings(session=session, key=key)

        key_gen = await new_user(session=session)
        key_2 = key_gen["key"]
        await embeddings(session=session, key=key_2)

        # embedding request with a non-OpenAI model
        await embeddings(session=session, key=key, model="mistral-embed")


@pytest.mark.asyncio
async def test_image_generation():
    """
    - Create key
    - Make image generation call
    - Create user
    - Make image generation call
    """
    async with aiohttp.ClientSession() as session:
        key_gen = await generate_key(session=session)
        key = key_gen["key"]
        await image_generation(session=session, key=key)

        key_gen = await new_user(session=session)
        key_2 = key_gen["key"]
        await image_generation(session=session, key=key_2)


@pytest.mark.asyncio
async def test_openai_wildcard_chat_completion():
    """
    - Create key for model = "*" -> this has access to all models
    - proxy_server_config.yaml has model = *
    - Make chat completion call
    """
    async with aiohttp.ClientSession() as session:
        key_gen = await generate_key(session=session, models=["*"])
        key = key_gen["key"]

        # call chat/completions with a model that the key was not created for
        # and that is not on the config.yaml
        await chat_completion(session=session, key=key, model="gpt-3.5-turbo-0125")


@pytest.mark.asyncio
async def test_proxy_all_models():
    """
    - proxy_server_config.yaml has model = */*
    - Make chat completion call
    - groq is NOT defined on /models
    """
    async with aiohttp.ClientSession() as session:
        # call chat/completions with models that are not on the config.yaml
        await chat_completion(
            session=session, key=LITELLM_MASTER_KEY, model="groq/llama3-8b-8192"
        )

        await chat_completion(
            session=session,
            key=LITELLM_MASTER_KEY,
            model="anthropic/claude-3-sonnet-20240229",
        )


@pytest.mark.asyncio
async def test_batch_chat_completions():
    """
    - Make a chat completion call with a comma-separated list of models
    - Expect one response per model
    """
    async with aiohttp.ClientSession() as session:
        response = await chat_completion(
            session=session,
            key="sk-1234",
            model="gpt-3.5-turbo,fake-openai-endpoint",
        )

        print(f"response: {response}")

        assert isinstance(response, list)
        assert len(response) == 2
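
# Passing a comma-separated `model` string fans the request out to each model,
# which is why the proxy returns a list with one response per model here.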


@pytest.mark.asyncio
async def test_moderations_endpoint():
    """
    - Make a /moderations call
    """
    async with aiohttp.ClientSession() as session:
        response = await moderation(
            session=session,
            key="sk-1234",
        )

        print(f"response: {response}")

        assert "results" in response