""" Test HuggingFace LLM """ from base_llm_unit_tests import BaseLLMChatTest import json import os import sys from unittest.mock import patch, MagicMock, AsyncMock sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path import litellm import pytest from litellm.types.utils import ModelResponseStream, ModelResponse MOCK_COMPLETION_RESPONSE = { "id": "9115d3daeab10608", "object": "chat.completion", "created": 11111, "model": "meta-llama/Meta-Llama-3-8B-Instruct", "prompt": [], "choices": [ { "finish_reason": "stop", "seed": 3629048360264764400, "logprobs": None, "index": 0, "message": { "role": "assistant", "content": "This is a test response from the mocked HuggingFace API.", "tool_calls": [], }, } ], "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30}, } MOCK_STREAMING_CHUNKS = [ { "id": "id1", "object": "chat.completion.chunk", "created": 1111, "choices": [ { "index": 0, "text": "Deep", "logprobs": None, "finish_reason": None, "seed": None, "delta": { "token_id": 34564, "role": "assistant", "content": "Deep", "tool_calls": None, }, } ], "model": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "usage": None, }, { "id": "id2", "object": "chat.completion.chunk", "created": 1111, "choices": [ { "index": 0, "text": " learning", "logprobs": None, "finish_reason": None, "seed": None, "delta": { "token_id": 6975, "role": "assistant", "content": " learning", "tool_calls": None, }, } ], "model": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "usage": None, }, { "id": "id3", "object": "chat.completion.chunk", "created": 1111, "choices": [ { "index": 0, "text": " is", "logprobs": None, "finish_reason": None, "seed": None, "delta": { "token_id": 374, "role": "assistant", "content": " is", "tool_calls": None, }, } ], "model": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "usage": None, }, { "id": "sid4", "object": "chat.completion.chunk", "created": 1111, "choices": [ { "index": 0, "text": " response", "logprobs": None, "finish_reason": "length", "seed": 2853637492034609700, "delta": { "token_id": 323, "role": "assistant", "content": " response", "tool_calls": None, }, } ], "model": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "usage": {"prompt_tokens": 26, "completion_tokens": 20, "total_tokens": 46}, }, ] PROVIDER_MAPPING_RESPONSE = { "fireworks-ai": { "status": "live", "providerId": "accounts/fireworks/models/llama-v3-8b-instruct", "task": "conversational", }, "together": { "status": "live", "providerId": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "task": "conversational", }, "hf-inference": { "status": "live", "providerId": "meta-llama/Meta-Llama-3-8B-Instruct", "task": "conversational", }, } @pytest.fixture def mock_provider_mapping(): with patch( "litellm.llms.huggingface.chat.transformation._fetch_inference_provider_mapping" ) as mock: mock.return_value = PROVIDER_MAPPING_RESPONSE yield mock @pytest.fixture(autouse=True) def clear_lru_cache(): from litellm.llms.huggingface.common_utils import _fetch_inference_provider_mapping _fetch_inference_provider_mapping.cache_clear() yield _fetch_inference_provider_mapping.cache_clear() @pytest.fixture def mock_http_handler(): """Fixture to mock the HTTP handler""" with patch("litellm.llms.custom_httpx.http_handler.HTTPHandler.post") as mock: print(f"Creating mock HTTP handler: {mock}") # noqa: T201 mock_response = MagicMock() mock_response.raise_for_status.return_value = None mock_response.status_code = 200 def mock_side_effect(*args, **kwargs): if kwargs.get("stream", True): 
                mock_response.iter_lines.return_value = iter(
                    [
                        f"data: {json.dumps(chunk)}".encode("utf-8")
                        for chunk in MOCK_STREAMING_CHUNKS
                    ]
                    + [b"data: [DONE]"]
                )
            else:
                mock_response.json.return_value = MOCK_COMPLETION_RESPONSE
            return mock_response

        mock.side_effect = mock_side_effect
        yield mock


@pytest.fixture
def mock_http_async_handler():
    """Fixture to mock the async HTTP handler"""
    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        new_callable=AsyncMock,
    ) as mock:
        print(f"Creating mock async HTTP handler: {mock}")  # noqa: T201
        mock_response = MagicMock()
        mock_response.raise_for_status.return_value = None
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "application/json"}
        mock_response.json.return_value = MOCK_COMPLETION_RESPONSE
        mock_response.text = json.dumps(MOCK_COMPLETION_RESPONSE)

        async def mock_side_effect(*args, **kwargs):
            if kwargs.get("stream", True):

                async def mock_aiter():
                    for chunk in MOCK_STREAMING_CHUNKS:
                        yield f"data: {json.dumps(chunk)}".encode("utf-8")
                    yield b"data: [DONE]"

                mock_response.aiter_lines = mock_aiter
            return mock_response

        mock.side_effect = mock_side_effect
        yield mock


class TestHuggingFace(BaseLLMChatTest):
    @pytest.fixture(autouse=True)
    def setup(self, mock_provider_mapping, mock_http_handler, mock_http_async_handler):
        self.mock_provider_mapping = mock_provider_mapping
        self.mock_http = mock_http_handler
        self.mock_http_async = mock_http_async_handler
        self.model = "huggingface/together/meta-llama/Meta-Llama-3-8B-Instruct"
        litellm.set_verbose = False

    def get_base_completion_call_args(self) -> dict:
        """Implementation of abstract method from BaseLLMChatTest"""
        return {"model": self.model}

    def test_completion_non_streaming(self):
        messages = [{"role": "user", "content": "This is a dummy message"}]

        response = litellm.completion(
            model=self.model, messages=messages, stream=False
        )
        assert isinstance(response, ModelResponse)
        assert (
            response.choices[0].message.content
            == "This is a test response from the mocked HuggingFace API."
        )
        assert response.usage is not None
        assert response.model == self.model.split("/", 2)[2]

    def test_completion_streaming(self):
        messages = [{"role": "user", "content": "This is a dummy message"}]

        response = litellm.completion(
            model=self.model, messages=messages, stream=True
        )

        chunks = list(response)
        assert len(chunks) > 0

        assert self.mock_http.called
        call_args = self.mock_http.call_args
        assert call_args is not None

        kwargs = call_args[1]
        data = json.loads(kwargs["data"])
        assert data["stream"] is True
        assert data["messages"] == messages

        assert isinstance(chunks, list)
        assert isinstance(chunks[0], ModelResponseStream)
        assert isinstance(chunks[0].id, str)
        assert chunks[0].model == self.model.split("/", 1)[1]

    @pytest.mark.asyncio
    async def test_async_completion_streaming(self):
        """Test async streaming completion"""
        messages = [{"role": "user", "content": "This is a dummy message"}]

        response = await litellm.acompletion(
            model=self.model, messages=messages, stream=True
        )

        chunks = []
        async for chunk in response:
            chunks.append(chunk)

        assert self.mock_http_async.called
        assert len(chunks) > 0
        assert isinstance(chunks[0], ModelResponseStream)
        assert isinstance(chunks[0].id, str)
        assert chunks[0].model == self.model.split("/", 1)[1]

    @pytest.mark.asyncio
    async def test_async_completion_non_streaming(self):
        """Test async non-streaming completion"""
        messages = [{"role": "user", "content": "This is a dummy message"}]

        response = await litellm.acompletion(
            model=self.model, messages=messages, stream=False
        )

        assert self.mock_http_async.called
        assert isinstance(response, ModelResponse)
        assert (
            response.choices[0].message.content
            == "This is a test response from the mocked HuggingFace API."
        )
        assert response.usage is not None
        assert response.model == self.model.split("/", 2)[2]

    def test_tool_call_no_arguments(self, tool_call_no_arguments):
        mock_tool_response = {
            **MOCK_COMPLETION_RESPONSE,
            "choices": [
                {
                    "finish_reason": "tool_calls",
                    "index": 0,
                    "message": tool_call_no_arguments,
                }
            ],
        }

        with patch.object(
            self.mock_http,
            "side_effect",
            lambda *args, **kwargs: MagicMock(
                status_code=200,
                json=lambda: mock_tool_response,
                raise_for_status=lambda: None,
            ),
        ):
            messages = [{"role": "user", "content": "Get the FAQ"}]
            tools = [
                {
                    "type": "function",
                    "function": {
                        "name": "Get-FAQ",
                        "description": "Get FAQ information",
                        "parameters": {
                            "type": "object",
                            "properties": {},
                            "required": [],
                        },
                    },
                }
            ]

            response = litellm.completion(
                model=self.model, messages=messages, tools=tools, tool_choice="auto"
            )

            assert response.choices[0].message.tool_calls is not None
            assert len(response.choices[0].message.tool_calls) == 1
            assert (
                response.choices[0].message.tool_calls[0].function.name
                == tool_call_no_arguments["tool_calls"][0]["function"]["name"]
            )
            assert (
                response.choices[0].message.tool_calls[0].function.arguments
                == tool_call_no_arguments["tool_calls"][0]["function"]["arguments"]
            )

    @pytest.mark.parametrize(
        "model, provider, expected_url",
        [
            (
                "meta-llama/Llama-3-8B-Instruct",
                None,
                "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3-8B-Instruct/v1/chat/completions",
            ),
            (
                "together/meta-llama/Llama-3-8B-Instruct",
                None,
                "https://router.huggingface.co/together/v1/chat/completions",
            ),
            (
                "novita/meta-llama/Llama-3-8B-Instruct",
                None,
                "https://router.huggingface.co/novita/v3/openai/chat/completions",
            ),
            (
                "http://custom-endpoint.com/v1/chat/completions",
                None,
                "http://custom-endpoint.com/v1/chat/completions",
            ),
        ],
    )
    def test_get_complete_url(self, model, provider, expected_url):
        """Test that the complete URL is constructed correctly for different providers"""
        from litellm.llms.huggingface.chat.transformation import HuggingFaceChatConfig

        config = HuggingFaceChatConfig()
        url = config.get_complete_url(
            api_base=None,
            model=model,
            optional_params={},
            stream=False,
            api_key="test_api_key",
            litellm_params={},
        )
        assert url == expected_url

    @pytest.mark.parametrize(
        "api_base, model, expected_url",
        [
            (
                "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud",
                "huggingface/tgi",
                "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions",
            ),
            (
                "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud/",
                "huggingface/tgi",
                "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions",
            ),
            (
                "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions",
                "huggingface/tgi",
                "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud/v1/chat/completions",
            ),
            (
                "https://example.com/custom/path",
                "huggingface/tgi",
                "https://example.com/custom/path/v1/chat/completions",
            ),
            (
                "https://example.com/custom/path/v1/chat/completions",
                "huggingface/tgi",
                "https://example.com/custom/path/v1/chat/completions",
            ),
            (
                "https://example.com/v1",
                "huggingface/tgi",
                "https://example.com/v1/chat/completions",
            ),
        ],
    )
    def test_get_complete_url_inference_endpoints(self, api_base, model, expected_url):
        from litellm.llms.huggingface.chat.transformation import HuggingFaceChatConfig

        config = HuggingFaceChatConfig()
        url = config.get_complete_url(
            api_base=api_base,
            model=model,
            optional_params={},
            stream=False,
            api_key="test_api_key",
            litellm_params={},
        )
        assert url == expected_url

    def test_completion_with_api_base(self):
        messages = [{"role": "user", "content": "This is a test message"}]
        api_base = "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud"

        response = litellm.completion(
            model="huggingface/tgi", messages=messages, api_base=api_base, stream=False
        )

        assert isinstance(response, ModelResponse)
        assert (
            response.choices[0].message.content
            == "This is a test response from the mocked HuggingFace API."
        )

        assert self.mock_http.called
        call_args = self.mock_http.call_args
        assert call_args is not None
        called_url = call_args[1]["url"]
        assert called_url == f"{api_base}/v1/chat/completions"

    @pytest.mark.asyncio
    async def test_async_completion_with_api_base(self):
        messages = [{"role": "user", "content": "This is a test message"}]
        api_base = "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud"

        response = await litellm.acompletion(
            model="huggingface/tgi", messages=messages, api_base=api_base, stream=False
        )

        assert isinstance(response, ModelResponse)
        assert (
            response.choices[0].message.content
            == "This is a test response from the mocked HuggingFace API."
        )

        assert self.mock_http_async.called
        call_args = self.mock_http_async.call_args
        assert call_args is not None
        called_url = call_args[1]["url"]
        assert called_url == f"{api_base}/v1/chat/completions"

    def test_completion_streaming_with_api_base(self):
        """Test streaming completion with api_base parameter"""
        messages = [{"role": "user", "content": "This is a test message"}]
        api_base = "https://abcd123.us-east-1.aws.endpoints.huggingface.cloud"

        response = litellm.completion(
            model="huggingface/tgi", messages=messages, api_base=api_base, stream=True
        )

        chunks = list(response)
        assert len(chunks) > 0
        assert isinstance(chunks[0], ModelResponseStream)

        # Check that the correct URL was called
        assert self.mock_http.called
        call_args = self.mock_http.call_args
        assert call_args is not None
        called_url = call_args[1]["url"]
        assert called_url == f"{api_base}/v1/chat/completions"

    def test_build_chat_completion_url_function(self):
        """Test the _build_chat_completion_url helper function"""
        from litellm.llms.huggingface.chat.transformation import (
            _build_chat_completion_url,
        )

        test_cases = [
            ("https://example.com", "https://example.com/v1/chat/completions"),
            ("https://example.com/", "https://example.com/v1/chat/completions"),
            ("https://example.com/v1", "https://example.com/v1/chat/completions"),
            ("https://example.com/v1/", "https://example.com/v1/chat/completions"),
            (
                "https://example.com/v1/chat/completions",
                "https://example.com/v1/chat/completions",
            ),
            (
                "https://example.com/custom/path",
                "https://example.com/custom/path/v1/chat/completions",
            ),
            (
                "https://example.com/custom/path/",
                "https://example.com/custom/path/v1/chat/completions",
            ),
        ]

        for input_url, expected_url in test_cases:
            result = _build_chat_completion_url(input_url)
            assert (
                result == expected_url
            ), f"Failed for input: {input_url}, expected: {expected_url}, got: {result}"

    def test_validate_environment(self):
        """Test that the environment is validated correctly"""
        from litellm.llms.huggingface.chat.transformation import HuggingFaceChatConfig

        config = HuggingFaceChatConfig()
        headers = config.validate_environment(
            headers={},
            model="huggingface/fireworks-ai/meta-llama/Meta-Llama-3-8B-Instruct",
            messages=[{"role": "user", "content": "Hello"}],
            optional_params={},
            api_key="test_api_key",
            litellm_params={},
        )
        assert headers["Authorization"] == "Bearer test_api_key"
        assert headers["content-type"] == "application/json"

    @pytest.mark.parametrize(
        "model, expected_model",
        [
            (
                "together/meta-llama/Llama-3-8B-Instruct",
                "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
            ),
            (
                "meta-llama/Meta-Llama-3-8B-Instruct",
                "meta-llama/Meta-Llama-3-8B-Instruct",
            ),
        ],
    )
    def test_transform_request(self, model, expected_model):
        from litellm.llms.huggingface.chat.transformation import HuggingFaceChatConfig

        config = HuggingFaceChatConfig()
        messages = [{"role": "user", "content": "Hello"}]

        transformed_request = config.transform_request(
            model=model,
            messages=messages,
            optional_params={},
            litellm_params={},
            headers={},
        )

        assert transformed_request["model"] == expected_model
        assert transformed_request["messages"] == messages

    @pytest.mark.asyncio
    async def test_completion_cost(self):
        pass