"""Manifest class."""
import asyncio
import copy
import logging
from typing import (
    Any,
    Dict,
    Generator,
    Iterator,
    List,
    Optional,
    Tuple,
    Type,
    Union,
    cast,
)

import numpy as np

from manifest.caches.noop import NoopCache
from manifest.caches.postgres import PostgresCache
from manifest.caches.redis import RedisCache
from manifest.caches.sqlite import SQLiteCache
from manifest.clients.client import Client
from manifest.clients.huggingface import HuggingFaceClient
from manifest.connections.client_pool import (
    CLIENT_CONSTRUCTORS,
    ClientConnection,
    ClientConnectionPool,
)
from manifest.request import LMChatRequest, LMScoreRequest, Request
from manifest.response import ModelChoices, Response, Usage, Usages

logging.getLogger("openai").setLevel(logging.WARNING)
logger = logging.getLogger(__name__)


CACHE_CONSTRUCTORS = {
    "redis": RedisCache,
    "sqlite": SQLiteCache,
    "noop": NoopCache,
    "postgres": PostgresCache,
}


class Manifest:
    """Manifest session object."""

    def __init__(
        self,
        client_name: Optional[str] = None,
        client_connection: Optional[str] = None,
        client_pool: Optional[List[ClientConnection]] = None,
        client_pool_schedule: str = "round_robin",
        cache_name: str = "noop",
        cache_connection: Optional[str] = None,
        stop_token: str = "",
        **kwargs: Any,
    ):
        """
        Initialize manifest.

        Args:
            client_name: name of client.
            client_connection: connection string for client.
            client_pool: list of client connections for multi-client.
            client_pool_schedule: schedule for client pool.
            cache_name: name of cache.
            cache_connection: connection string for cache.
            stop_token: stop token for prompt generation.
                Can be overridden in run.

        Remaining kwargs are sent to the client and cache.
        """
        if not client_name and not client_pool:
            raise ValueError(
                "Must specify client_name or client_pool. "
                f"Choices are {list(CLIENT_CONSTRUCTORS.keys())}"
            )
        if client_name and client_pool:
            raise ValueError("Cannot specify both client_name and client_pool")
        if client_name:
            client_pool = [
                ClientConnection(
                    client_name=client_name,
                    client_connection=client_connection,
                    # Pop engine so it is attached to this connection rather
                    # than passed through the shared client kwargs.
                    engine=kwargs.pop("engine", None),
                )
            ]
        self.client_pool = ClientConnectionPool(
            client_pool, client_pool_schedule, client_args=kwargs
        )
        if cache_name not in CACHE_CONSTRUCTORS:
            raise ValueError(
                f"Unknown cache name: {cache_name}. "
                f"Choices are {list(CACHE_CONSTRUCTORS.keys())}"
            )

        self.cache = CACHE_CONSTRUCTORS[cache_name](
            cache_connection, self.client_pool.request_type, cache_args=kwargs
        )
        if len(kwargs) > 0:
            raise ValueError(f"{list(kwargs.items())} arguments are not recognized.")

        self.stop_token = stop_token
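
    # Illustrative usage (a sketch, not part of the class API; assumes the
    # "openai" client and an API key available in the environment):
    #
    #   manifest = Manifest(client_name="openai", cache_name="sqlite",
    #                       cache_connection="manifest.sqlite")
    #   text = manifest.run("What is the capital of France?")
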
    def close(self) -> None:
        """Close the client and cache."""
        self.client_pool.close()
        self.cache.close()

    def _validate_kwargs(self, kwargs: Dict, request_params: Request) -> None:
        """Validate kwargs.

        Args:
            kwargs: kwargs to validate.
            request_params: request object to validate against.
        """
        # Reject kwargs that are not valid request fields.
        non_request_kwargs = [
            (k, v) for k, v in kwargs.items() if k not in request_params.__dict__
        ]
        if len(non_request_kwargs) > 0:
            raise ValueError(
                f"{list(non_request_kwargs)} arguments are not recognized."
            )

        # Warn about kwargs that match request fields but went unused.
        request_unused_kwargs = [
            (k, v) for k, v in kwargs.items() if k not in non_request_kwargs
        ]
        if len(request_unused_kwargs) > 0:
            logger.warning(f"{list(request_unused_kwargs)} arguments are unused.")
        return

    def _split_cached_requests(
        self,
        request: Request,
        client: Client,
        overwrite_cache: bool,
    ) -> Tuple[Dict[int, Response], Request]:
        """Split a request into cached responses and a request to run.

        Args:
            request: request object.
            client: client used to compute cache keys.
            overwrite_cache: whether to overwrite cache.

        Returns:
            cached_idx_to_response: dict of cached responses keyed by prompt index.
            new_request: request object with only the prompts left to run.
        """
        cached_idx_to_response: Dict[int, Response] = {}
        new_request = copy.deepcopy(request)
        if not overwrite_cache:
            if isinstance(new_request.prompt, list) and not isinstance(
                request, LMChatRequest
            ):
                new_request.prompt = []
                for idx, prompt_str in enumerate(request.prompt):
                    single_request = copy.deepcopy(request)
                    single_request.prompt = prompt_str
                    possible_response = self.cache.get(
                        client.get_cache_key(single_request)
                    )
                    if possible_response:
                        cached_idx_to_response[idx] = possible_response
                    else:
                        new_request.prompt.append(prompt_str)
            # Chat requests and single-string prompts are cached as one entry
            # rather than per-prompt.
            elif (isinstance(new_request.prompt, str)) or (
                isinstance(new_request.prompt, list)
                and isinstance(request, LMChatRequest)
            ):
                possible_response = self.cache.get(client.get_cache_key(new_request))
                if possible_response:
                    cached_idx_to_response[0] = possible_response
                    new_request.prompt = None
            else:
                raise ValueError(
                    f"Invalid prompt type: {type(new_request.prompt)}"
                    f" with request type: {type(request)}"
                )
        return cached_idx_to_response, new_request
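
    # Worked example (hypothetical values): for a completion request with
    # prompt == ["a", "b", "c"] where only "b" has a cache hit, this returns
    # cached_idx_to_response == {1: <cached Response>} and a new request whose
    # prompt is ["a", "c"].
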
    def _stitch_responses_and_cache(
        self,
        request: Request,
        client: Client,
        response: Union[Response, None],
        cached_idx_to_response: Dict[int, Response],
    ) -> Response:
        """Stitch together the cached and uncached responses."""
        # Reassemble the choices from the new model response and the cached
        # entries so they line up with the original prompt order.
        all_model_choices = []
        all_usages = []
        all_input_prompts: List[Union[str, List[str], List[Dict]]] = []
        response_idx = 0
        number_prompts = len(cached_idx_to_response)
        single_completion_output = False
        if response:
            if isinstance(response.get_request_obj().prompt, str):
                single_completion_output = True
                number_prompts += 1
            elif isinstance(response.get_request_obj().prompt, list) and not isinstance(
                request, LMChatRequest
            ):
                number_prompts += len(response.get_request_obj().prompt)
            elif isinstance(response.get_request_obj().prompt, list) and isinstance(
                request, LMChatRequest
            ):
                assert len(cached_idx_to_response) <= 1
                number_prompts += 1
            else:
                raise ValueError(
                    f"Invalid prompt type: {type(response.get_request_obj().prompt)}"
                    f" with request type: {type(request)}"
                )
        response_type = None
        request_type: Type[Request] = None
        for idx in range(number_prompts):
            if idx in cached_idx_to_response:
                cached_res = cached_idx_to_response[idx]
                response_type = cached_res._response_type
                request_type = cached_res._request_type
                all_input_prompts.append(cached_res.get_request_obj().prompt)
                if request.n == 1:
                    assert (
                        len(cached_res.get_response_obj().choices) == 1
                    ), "cached response should have only one choice"
                all_model_choices.extend(cached_res.get_response_obj().choices)
                if cached_res.get_usage_obj().usages:
                    all_usages.extend(cached_res.get_usage_obj().usages)
            else:
                assert response is not None, "response should not be None"
                response = cast(Response, response)
                response_type = response._response_type
                request_type = response._request_type
                # The response choices are flat across prompts: request.n
                # choices per uncached prompt, in prompt order.
                current_choices = response.get_response_obj().choices[
                    response_idx * request.n : (response_idx + 1) * request.n
                ]
                all_model_choices.extend(current_choices)

                if isinstance(
                    response.get_request_obj().prompt, list
                ) and not isinstance(request, LMChatRequest):
                    prompt: Union[
                        str, List[str], List[Dict]
                    ] = response.get_request_obj().prompt[response_idx]
                elif isinstance(response.get_request_obj().prompt, list) and isinstance(
                    request, LMChatRequest
                ):
                    # Chat requests carry a single list-of-dict prompt, so
                    # there is at most one prompt (and one choice) to stitch.
                    assert request.n == 1
                    assert number_prompts <= 1
                    prompt = response.get_request_obj().prompt
                else:
                    prompt = str(response.get_request_obj().prompt)

                usages: Optional[List[Usage]] = None
                if response.get_usage_obj().usages:
                    usages = response.get_usage_obj().usages[
                        response_idx * request.n : (response_idx + 1) * request.n
                    ]
                    all_usages.extend(usages)
                all_input_prompts.append(prompt)
                # Cache the freshly computed response for this prompt.
                new_request = copy.deepcopy(request)
                new_request.prompt = prompt
                cache_key = client.get_cache_key(new_request)
                new_response = copy.deepcopy(response)
                new_response._response.choices = current_choices
                new_response._usages = Usages(usages=(usages or []))
                self.cache.set(cache_key, new_response.to_dict(drop_request=True))
                response_idx += 1

        new_request = copy.deepcopy(request)
        new_request.prompt = (
            all_input_prompts
            if len(all_input_prompts) > 1 or not single_completion_output
            else all_input_prompts[0]
        )
        response_obj = Response(
            response=ModelChoices(choices=all_model_choices),
            cached=len(cached_idx_to_response) > 0,
            request=new_request,
            usages=Usages(usages=all_usages),
            response_type=response_type,
            request_type=request_type,
        )
        return response_obj
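
    # Continuing the hypothetical example above: with "b" served from the cache
    # and the model called on ["a", "c"], the stitched Response holds choices in
    # the original order a, b, c, and the fresh results for "a" and "c" are
    # written back to the cache.
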
    def run(
        self,
        prompt: Union[str, List[str], List[Dict[str, str]]],
        overwrite_cache: bool = False,
        stop_token: Optional[str] = None,
        return_response: bool = False,
        stream: bool = False,
        **kwargs: Any,
    ) -> Union[
        str,
        List[str],
        np.ndarray,
        List[np.ndarray],
        Response,
        Iterator[str],
        Iterator[Response],
    ]:
        """
        Run the prompt.

        Orchestrates between the standard (single or batch) run, the chat run,
        and the streaming run.

        Args:
            prompt: prompt(s) to run.
            overwrite_cache: whether to overwrite cache.
            stop_token: stop token for prompt generation.
                Default is self.stop_token.
                "" for no stop token.
            return_response: whether to return Response object.
            stream: whether to stream the prompt. Only supported
                for single string prompts and LMs.

        Returns:
            response from prompt.
        """
        if not isinstance(prompt, list) and not isinstance(prompt, str):
            raise ValueError(
                f"Invalid prompt type: {type(prompt)}. "
                "Prompt must be a string or list of strings "
                "or list of dicts."
            )
        if isinstance(prompt, list) and not prompt:
            raise ValueError("Prompt cannot be empty list")

        client = self.client_pool.get_next_client()
        if stream:
            if not client.supports_streaming_inference():
                raise ValueError(
                    f"Client {client} does not support streaming inference."
                )
            if not isinstance(prompt, str):
                raise ValueError(
                    "Stream is only supported for single string prompts. "
                    "It will soon be supported for chat dictionary prompts, too."
                )
            return self._run_stream(
                prompt=cast(str, prompt),
                client=client,
                overwrite_cache=overwrite_cache,
                stop_token=stop_token,
                return_response=return_response,
                **kwargs,
            )
        if isinstance(prompt, list) and isinstance(prompt[0], dict):
            if not client.IS_CHAT:
                raise ValueError(
                    f"Client {client} does not support dict chat prompt. "
                    "Please use a chat model."
                )
            if stop_token:
                logger.warning(
                    "stop_token is not supported for chat prompt. "
                    "Ignoring stop_token."
                )
            return self._run_chat(
                prompt=cast(List[Dict[str, str]], prompt),
                client=client,
                overwrite_cache=overwrite_cache,
                return_response=return_response,
                **kwargs,
            )
        return self._run(
            prompt=cast(Union[str, List[str]], prompt),
            client=client,
            overwrite_cache=overwrite_cache,
            stop_token=stop_token,
            return_response=return_response,
            **kwargs,
        )
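
    # Illustrative calls (a sketch; assumes `manifest` was built with a client
    # that supports each form - chat prompts need a chat model and streaming
    # needs a streaming-capable client):
    #
    #   manifest.run("Hello")                               # single prompt -> str
    #   manifest.run(["Hello", "Goodbye"])                  # batch -> List[str]
    #   manifest.run([{"role": "user", "content": "Hi"}])   # chat -> str
    #   for chunk in manifest.run("Hello", stream=True):    # stream -> Iterator[str]
    #       print(chunk, end="")
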
    def _run(
        self,
        prompt: Union[str, List[str]],
        client: Client,
        overwrite_cache: bool = False,
        stop_token: Optional[str] = None,
        return_response: bool = False,
        **kwargs: Any,
    ) -> Union[str, List[str], np.ndarray, List[np.ndarray], Response]:
        """
        Run the prompt.

        Args:
            prompt: prompt(s) to run.
            client: client to run.
            overwrite_cache: whether to overwrite cache.
            stop_token: stop token for prompt generation.
                Default is self.stop_token.
                "" for no stop token.
            return_response: whether to return Response object.

        Returns:
            response from prompt.
        """
        is_batch = isinstance(prompt, list)
        stop_token = stop_token if stop_token is not None else self.stop_token

        request_params = client.get_request(prompt, kwargs)

        if is_batch and request_params.n > 1:
            raise ValueError("Batch mode does not support n > 1.")
        self._validate_kwargs(kwargs, request_params)

        cached_idx_to_response, request_params = self._split_cached_requests(
            request_params, client, overwrite_cache
        )

        if request_params.prompt:
            self.client_pool.start_timer()
            response = client.run_request(request_params)
            self.client_pool.end_timer()
        else:
            response = None

        final_response = self._stitch_responses_and_cache(
            request=request_params,
            client=client,
            response=response,
            cached_idx_to_response=cached_idx_to_response,
        )

        if return_response:
            return final_response
        else:
            return final_response.get_response(stop_token, is_batch)

    def _run_chat(
        self,
        prompt: List[Dict[str, str]],
        client: Client,
        overwrite_cache: bool = False,
        return_response: bool = False,
        **kwargs: Any,
    ) -> Union[str, Response]:
        """
        Run the chat prompt.

        Args:
            prompt: chat prompt (list of message dicts) to run.
            client: client to run.
            overwrite_cache: whether to overwrite cache.
            return_response: whether to return Response object.

        Returns:
            response from prompt.
        """
        is_batch = False

        # Build request params from an empty prompt, then swap in the chat
        # messages to form an LMChatRequest.
        request_params = client.get_request("", kwargs)

        request_params_dict = request_params.to_dict()
        request_params_dict["prompt"] = prompt
        request_params_as_chat = LMChatRequest(**request_params_dict)

        if request_params_as_chat.n > 1:
            raise ValueError("Chat mode does not support n > 1.")
        self._validate_kwargs(kwargs, request_params_as_chat)

        cached_idx_to_response, request_params_as_chat = self._split_cached_requests(
            request_params_as_chat, client, overwrite_cache
        )

        if request_params_as_chat.prompt:
            self.client_pool.start_timer()
            response = client.run_chat_request(request_params_as_chat)
            self.client_pool.end_timer()
        else:
            response = None

        final_response = self._stitch_responses_and_cache(
            request=request_params_as_chat,
            client=client,
            response=response,
            cached_idx_to_response=cached_idx_to_response,
        )

        if return_response:
            return final_response
        else:
            return cast(str, final_response.get_response("", is_batch))

    def _run_stream(
        self,
        prompt: str,
        client: Client,
        overwrite_cache: bool = False,
        stop_token: Optional[str] = None,
        return_response: bool = False,
        **kwargs: Any,
    ) -> Union[Generator[str, None, None], Generator[Response, None, None]]:
        """
        Run the prompt in a stream.

        Args:
            prompt: prompt to run.
            client: client to run.
            overwrite_cache: whether to overwrite cache.
            stop_token: stop token for prompt generation.
                Default is self.stop_token.
                "" for no stop token.
            return_response: whether to return Response object.

        Returns:
            response from prompt.
        """
        is_batch = False
        stop_token = stop_token if stop_token is not None else self.stop_token

        request_params = client.get_request(prompt, kwargs)

        if request_params.n > 1:
            raise ValueError("Stream mode does not support n > 1.")
        self._validate_kwargs(kwargs, request_params)

        cached_idx_to_response, request_params = self._split_cached_requests(
            request_params, client, overwrite_cache
        )
        if request_params.prompt:
            # Streaming handles a single prompt, so either nothing was cached
            # and we stream from the model, or the one prompt was cached.
            assert len(cached_idx_to_response) == 0
            response_iter = client.run_streaming_request(request_params)
            is_cached = False
        else:
            assert len(cached_idx_to_response) == 1
            response_iter = cached_idx_to_response[0].as_iter()
            is_cached = True

        saved_responses = []

        self.client_pool.start_timer()
        for response_token in response_iter:
            saved_responses.append(response_token)
            if return_response:
                yield response_token
            else:
                yield cast(
                    Union[str, Response], response_token.get_response("", is_batch)
                )
        self.client_pool.end_timer()

        if not is_cached:
            final_response = Response.union_all(
                saved_responses, as_single_lmchoice=True
            )
            self._stitch_responses_and_cache(
                request=request_params,
                client=client,
                response=final_response,
                cached_idx_to_response=cached_idx_to_response,
            )

    async def arun_batch(
        self,
        prompts: List[str],
        overwrite_cache: bool = False,
        stop_token: Optional[str] = None,
        return_response: bool = False,
        chunk_size: int = -1,
        verbose: bool = False,
        **kwargs: Any,
    ) -> Union[List[str], List[np.ndarray], Response]:
        """
        Run a batch of prompts with async.

        If the client pool is a single client, all prompts will be sent
        to one client and batch_size (which is passed in as kwargs) will
        determine how the prompts are split.

        If the client pool is a pool of clients, the prompts will be split
        into chunks and sent to the clients. Each client will split its
        chunk into batches of batch_size prompts to send to the model.

        Args:
            prompts: prompts to run.
            overwrite_cache: whether to overwrite cache.
            stop_token: stop token for prompt generation.
                Default is self.stop_token.
                "" for no stop token.
            return_response: whether to return Response object.
            chunk_size: number of prompts to send to a client in chunks.
                For each chunk, the client will split the chunk into
                batches of batch_size prompts to send to the model.
                For a single manifest client, there is no impact to
                setting chunk_size. For a client pool, chunk_size
                can be used to distribute the load across the clients.
            verbose: whether to print progress of async tasks.

        Returns:
            response from prompt.
        """
        if not isinstance(prompts, list):
            raise ValueError("Prompts must be a list of strings.")
        if not prompts:
            raise ValueError("Prompts must not be empty.")
        if not isinstance(prompts[0], str):
            raise ValueError("Prompts must be a list of strings.")

        # Split the prompts into chunks, assigning each chunk to a client
        # from the pool.
        prompt_chunks: List[Tuple[Client, List[str]]] = []
        if chunk_size > 0:
            for i in range(0, len(prompts), chunk_size):
                prompt_chunks.append(
                    (self.client_pool.get_next_client(), prompts[i : i + chunk_size])
                )
        else:
            prompt_chunks = [(self.client_pool.get_next_client(), prompts)]

        # Run all chunks concurrently and merge the results.
        tasks = []
        for client, chunk in prompt_chunks:
            tasks.append(
                asyncio.create_task(
                    self._arun_batch_client(
                        prompts=chunk,
                        client=client,
                        overwrite_cache=overwrite_cache,
                        verbose=verbose,
                        **kwargs,
                    )
                )
            )
        logger.info(f"Running {len(tasks)} tasks across all clients.")
        responses = await asyncio.gather(*tasks)
        final_response = Response.union_all(responses)
        stop_token = stop_token if stop_token is not None else self.stop_token

        if return_response:
            return final_response
        else:
            return cast(
                Union[List[str], List[np.ndarray]],
                final_response.get_response(stop_token, True),
            )
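
    # Illustrative usage (a sketch; the client name, batch_size, and chunk_size
    # values are assumptions, not defaults):
    #
    #   async def main() -> None:
    #       manifest = Manifest(client_name="openai", cache_name="noop")
    #       results = await manifest.arun_batch(
    #           ["Prompt 1", "Prompt 2", "Prompt 3"], batch_size=2, chunk_size=2
    #       )
    #       print(results)
    #
    #   asyncio.run(main())
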
    async def _arun_batch_client(
        self,
        prompts: List[str],
        client: Client,
        overwrite_cache: bool = False,
        verbose: bool = False,
        **kwargs: Any,
    ) -> Response:
        """
        Run a batch of prompts with async for a single client.

        Args:
            prompts: prompts to run.
            client: client to run.
            overwrite_cache: whether to overwrite cache.
            verbose: whether to print progress of async tasks.

        Returns:
            response from prompt.
        """
        request_params = client.get_request(prompts, kwargs)

        if request_params.n > 1:
            raise ValueError("Batch mode does not support n > 1.")
        self._validate_kwargs(kwargs, request_params)

        cached_idx_to_response, request_params = self._split_cached_requests(
            request_params, client, overwrite_cache
        )

        if request_params.prompt:
            self.client_pool.start_timer()
            response = await client.arun_batch_request(request_params, verbose=verbose)
            self.client_pool.end_timer()
        else:
            response = None

        final_response = self._stitch_responses_and_cache(
            request=request_params,
            client=client,
            response=response,
            cached_idx_to_response=cached_idx_to_response,
        )
        return final_response

    def score_prompt(
        self,
        prompt: Union[str, List[str]],
        overwrite_cache: bool = False,
        **kwargs: Any,
    ) -> Dict:
        """
        Score the prompt via forward pass of the model - no sampling or generation.

        Returns the response object with logits of the prompt.

        Args:
            prompt: prompt(s) to run.
            overwrite_cache: whether to overwrite cache.

        Returns:
            response from prompt.
        """
        client = self.client_pool.get_next_client()

        request_params = client.get_request(prompt, kwargs)
        request_params_as_score = LMScoreRequest(**request_params.to_dict())

        if request_params_as_score.n > 1:
            raise ValueError("Sequence scoring does not support n > 1.")
        self._validate_kwargs(kwargs, request_params_as_score)

        cached_idx_to_response, request_params_as_score = self._split_cached_requests(
            request_params_as_score, client, overwrite_cache
        )

        if request_params_as_score.prompt:
            try:
                response = cast(HuggingFaceClient, client).run_score_prompt_request(
                    request_params_as_score
                )
            except AttributeError:
                raise ValueError("`score_prompt` only supported for HF models.")
        else:
            response = None

        final_response = self._stitch_responses_and_cache(
            request=request_params_as_score,
            client=client,
            response=response,
            cached_idx_to_response=cached_idx_to_response,
        )
        return final_response.to_dict()
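
    # Illustrative usage (a sketch; assumes a HuggingFace-backed client serving
    # a model at the given, hypothetical URL):
    #
    #   manifest = Manifest(client_name="huggingface",
    #                       client_connection="http://127.0.0.1:5000")
    #   scored = manifest.score_prompt("The sky is blue.")
    #   # `scored` is the stitched Response serialized to a dict.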