|
import evaluate |
|
import lm_eval |
|
from typing import Union, List, Optional |
|
from dmx.compressor.dmx import config_rules, DmxModel |
|
import datasets |
|
import torch |
|
|
|
# Human-readable summary of the metric; attached to the class docstring and
# to the MetricInfo returned by `DmxMetric._info`.
_DESCRIPTION = """
Evaluation function using lm-eval with d-Matrix integration.
This function allows for the evaluation of language models across various tasks,
with the option to use d-Matrix compressed models.
"""


# Argument/return documentation for `DmxMetric._compute`. Keep this in sync
# with the `_compute` signature (names and defaults).
_KWARGS_DESCRIPTION = """
Args:
    model (str): The name or path of the model to evaluate.
    tasks (Union[str, List[str]]): The task or list of tasks to evaluate on.
    dmx_config (Optional[str]): Name of the `config_rules` entry (from `dmx.compressor.dmx`) whose rules are applied to the model; no d-Matrix transformation is applied when None. Defaults to None.
    num_fewshot (Optional[int]): Number of examples in few-shot context, defaults to None.
    batch_size (Optional[Union[int, str]]): Batch size for model, defaults to None.
    max_batch_size (Optional[int]): Maximum batch size to try with automatic batch size detection, defaults to None.
    limit (Optional[Union[int, float]]): Limit the number of examples per task, defaults to None.
    device (Optional[str]): Device to load the model on; when None, 'cuda' is used if available, otherwise 'cpu'. Defaults to None.
    revision (str): Model revision to use, defaults to 'main'.
    trust_remote_code (bool): Whether to trust remote code, defaults to False.
    log_samples (bool): If True, logs all model outputs and documents, defaults to True.
    verbosity (str): Logging verbosity level, defaults to 'INFO'.
    **kwargs: Additional keyword arguments to pass to `lm_eval.evaluate`.

Returns:
    dict: The `results` entry of the lm-eval output (empty if that entry is missing).
"""
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class DmxMetric(evaluate.Metric):
    """`evaluate` metric that runs lm-eval tasks on a Hugging Face model,
    optionally transformed with d-Matrix (dmx) configuration rules."""

    def _info(self):
        """Return the metric metadata: description, input docs, features and reference URL."""
        return evaluate.MetricInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation="",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "references": datasets.Value("string"),
                }
            ),
            reference_urls=["https://github.com/EleutherAI/lm-evaluation-harness"],
        )

    @staticmethod
    def _resolve_config_rules(dmx_config: str):
        """Look up the rule collection named by `dmx_config` on `config_rules`.

        Args:
            dmx_config: Attribute name (optionally dotted) to resolve on
                `config_rules`.

        Returns:
            The object found at `config_rules.<dmx_config>`.

        Raises:
            ValueError: If any path component is not a plain public identifier.
            AttributeError: If the named attribute does not exist.
        """
        # SECURITY NOTE: this used to be `eval(f"config_rules.{dmx_config}")`,
        # which executes arbitrary caller-supplied code. Only plain attribute
        # paths are accepted now; anything else is rejected explicitly.
        resolved = config_rules
        for part in dmx_config.split("."):
            if not part.isidentifier() or part.startswith("_"):
                raise ValueError(
                    f"Invalid dmx_config {dmx_config!r}: expected the name of a "
                    "config_rules attribute"
                )
            resolved = getattr(resolved, part)
        return resolved

    def _compute(
        self,
        model: str,
        tasks: Union[str, List[str]],
        dmx_config: Optional[str] = None,
        num_fewshot: Optional[int] = None,
        batch_size: Optional[Union[int, str]] = None,
        max_batch_size: Optional[int] = None,
        limit: Optional[Union[int, float]] = None,
        device: Optional[str] = None,
        revision: str = "main",
        trust_remote_code: bool = False,
        log_samples: bool = True,
        verbosity: str = "INFO",
        **kwargs
    ) -> dict:
        """
        Evaluate a model on multiple tasks and metrics using lm-eval with optional d-Matrix integration.

        Args:
            model: Name or path passed to the lm-eval "hf" model as `pretrained`.
            tasks: A single task name or a collection of task names.
            dmx_config: Name of the `config_rules` attribute whose rules are
                applied to the model; falsy values skip the transformation.
            num_fewshot: If not None, set as `num_fewshot` on every task.
            batch_size: Forwarded to the lm-eval model constructor.
            max_batch_size: Forwarded to the lm-eval model constructor.
            limit: Forwarded to `lm_eval.evaluate`.
            device: Target device; when None, "cuda" if available else "cpu".
            revision: Model revision.
            trust_remote_code: Forwarded to the model constructor.
            log_samples: Forwarded to `lm_eval.evaluate`.
            verbosity: Forwarded to `lm_eval.evaluate`.
            **kwargs: Extra keyword arguments for `lm_eval.evaluate`; these
                override the values above on key collision.

        Returns:
            The "results" entry of the lm-eval output, or an empty dict when
            lm-eval returns nothing or the entry is absent.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        model_args = (
            f"pretrained={model},revision={revision},"
            f"trust_remote_code={trust_remote_code},device={device}"
        )

        lm = lm_eval.api.registry.get_model("hf").create_from_arg_string(
            model_args,
            {
                "batch_size": batch_size,
                "max_batch_size": max_batch_size,
            },
        )

        if dmx_config:
            # Wrap the underlying torch model, then apply the selected rules.
            lm._model = DmxModel.from_torch(lm._model)
            lm._model.transform(
                lm._model.dmx_config, *self._resolve_config_rules(dmx_config)
            )

        # Only a bare string is a single task; any other iterable (list, tuple,
        # ...) is taken as a collection of task names.
        task_names = [tasks] if isinstance(tasks, str) else list(tasks)
        task_dict = lm_eval.tasks.get_task_dict(task_names)

        if num_fewshot is not None:
            for task in task_dict.values():
                task.set_config(key="num_fewshot", value=num_fewshot)

        eval_params = {
            'lm': lm,
            'task_dict': task_dict,
            'limit': limit,
            'log_samples': log_samples,
            'verbosity': verbosity,
            **kwargs
        }

        results = lm_eval.evaluate(**eval_params)
        # lm_eval.evaluate can return None (e.g. on non-zero ranks in a
        # distributed run); return an empty dict rather than failing on `.get`.
        if results is None:
            return {}
        return results.get('results', {})