Elron committed on
Commit e5808d4 · verified · 1 Parent(s): 5b41acf

Upload folder using huggingface_hub

Files changed (14)
  1. base_metric.py +229 -0
  2. benchmark.py +15 -0
  3. dataset.py +1 -0
  4. evaluate_cli.py +6 -8
  5. fusion.py +14 -2
  6. image_operators.py +5 -0
  7. inference.py +83 -6
  8. llm_as_judge.py +1 -1
  9. llm_as_judge_constants.py +93 -14
  10. loaders.py +127 -54
  11. metric.py +1 -0
  12. metrics.py +67 -215
  13. operators.py +76 -70
  14. version.py +1 -1
base_metric.py ADDED
@@ -0,0 +1,229 @@
1
+ from abc import abstractmethod
2
+ from typing import (
3
+ Any,
4
+ Dict,
5
+ List,
6
+ Union,
7
+ )
8
+
9
+ from .artifact import Artifact
10
+ from .dataclass import (
11
+ AbstractField,
12
+ )
13
+ from .deprecation_utils import deprecation
14
+ from .error_utils import Documentation, UnitxtWarning
15
+ from .stream import Stream
16
+ from .type_utils import Type, isoftype, parse_type_string, to_type_string
17
+
18
+
19
+ @deprecation(
20
+ version="2.0.0",
21
+ msg="use regular type instead of strings (e.g Dict[str] instead of 'Dict[str]')",
22
+ )
23
+ def parse_string_types_instead_of_actual_objects(obj):
24
+ return parse_type_string(obj)
25
+
26
+ class Metric(Artifact):
27
+ main_score: str = AbstractField()
28
+ # Override 'prediction_type' with the expected type of predictions
29
+ # and references. Example: "List[str]", "List[Dict]"", "string".
30
+ # If left with default None, a warning will be displayed.
31
+ # In future versions of unitxt, this will be an error.
32
+ prediction_type: Union[Type, str] = Any
33
+
34
+ # Standard metrics can receive multiple references per predictions (in a list)
35
+ # Some metrics support only a single reference per prediction (one element in the list)
36
+ single_reference_per_prediction: bool = False
37
+
38
+ #
39
+ # Used to add a prefix to all score, except the "score_name" and "score" fields.
40
+ # This is used to distinguish two scores of the same metrics, operating on different fields of the task
41
+ #
42
+ score_prefix: str = ""
43
+
44
+ def prepare_args(self):
45
+ super().prepare_args()
46
+ if isinstance(self.prediction_type, str):
47
+ self.prediction_type = parse_string_types_instead_of_actual_objects(
48
+ self.prediction_type
49
+ )
50
+
51
+ @classmethod
52
+ def process_data_after_load(cls, data):
53
+ if "prediction_type" in data:
54
+ data["prediction_type"] = parse_type_string(data["prediction_type"])
55
+ return data
56
+
57
+ def process_data_before_dump(self, data):
58
+ if "prediction_type" in data:
59
+ if not isinstance(data["prediction_type"], str):
60
+ data["prediction_type"] = to_type_string(data["prediction_type"])
61
+ return data
62
+
63
+ def _add_score_prefix(self, score_name):
64
+ return (
65
+ self.score_prefix + score_name
66
+ if score_name not in ["score", "score_name", "num_of_instances"]
67
+ else score_name
68
+ )
69
+
70
+ def _add_score_prefixes_to_score_dict_and_check_against_existing_scores(
71
+ self, scores: Dict[str, Any], existing_scores: Dict[str, Any]
72
+ ) -> Dict[str, Any]:
73
+ new_scores = {}
74
+ for score_name, score in scores.items():
75
+ score_with_prefix = self._add_score_prefix(score_name)
76
+ new_scores[score_with_prefix] = (
77
+ score if score_name not in ["score_name"] else self.score_prefix + score
78
+ )
79
+ for new_score_name in new_scores:
80
+ if new_score_name in ["score", "score_name", "num_of_instances"]:
81
+ continue
82
+ if new_score_name in existing_scores:
83
+ UnitxtWarning(
84
+ message=f"Metric '{new_score_name}' that has just been evaluated to {new_scores[new_score_name]}, is already recorded "
85
+ f"to have value {existing_scores[new_score_name]} by a previous metric evaluation on this instance or stream. "
86
+ f"To avoid overwriting the existing value, add a score_prefix to the metric name (e.g. score_prefix='my_second_' , "
87
+ f"which will yield, in this case, a score named: 'my_second_{new_score_name}')",
88
+ additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
89
+ )
90
+ return new_scores
91
+
92
+ def _validate_references_and_prediction(self, references, predictions):
93
+ if not isoftype(predictions, List[Any]):
94
+ raise ValueError(
95
+ f"Metric {self.get_metric_name()} should receive a list of predictions {self.get_metric_name()}. Received predictions of type {type(predictions)}: {predictions}"
96
+ )
97
+
98
+ if not isoftype(references, List[Any]):
99
+ raise ValueError(
100
+ f"Metric {self.get_metric_name()} should receive a list of predictions. Received references of type {type(references)}: {references}"
101
+ )
102
+
103
+ if len(references) != len(predictions):
104
+ raise ValueError(
105
+ f"references size ({len(references)})"
106
+ f" doesn't mach predictions size ({len(references)})."
107
+ )
108
+
109
+ for reference in references:
110
+ self._validate_reference(reference)
111
+
112
+ for prediction in predictions:
113
+ self._validate_prediction(prediction)
114
+
115
+ def _validate_prediction(self, prediction):
116
+ if not isoftype(prediction, self.prediction_type):
117
+ raise ValueError(
118
+ f"Each prediction is expected to be of type '{to_type_string(self.prediction_type)}' in {self.get_metric_name()} metric. Received prediction of type {type(prediction)}: {prediction}"
119
+ )
120
+
121
+ def _validate_reference(self, reference):
122
+ if not isoftype(reference, List[Any]):
123
+ raise ValueError(
124
+ f"Expecting a list of references for each prediction in {self.get_metric_name()} metric. Received reference of type {type(reference)}: {reference}"
125
+ )
126
+ if self.single_reference_per_prediction and not len(reference) == 1:
127
+ raise ValueError(
128
+ f"Expecting a list with a single reference per prediction in {self.get_metric_name()} metric. Received a list with multiple references: {reference}"
129
+ )
130
+ for ref in reference:
131
+ if not isoftype(ref, self.prediction_type):
132
+ raise ValueError(
133
+ f"Each reference is expected to be of type '{to_type_string(self.prediction_type)}' in {self.get_metric_name()} metric. Received reference of type {type(ref)}: {ref}"
134
+ )
135
+
136
+ def get_metric_name(self):
137
+ if self.__id__ is not None:
138
+ return self.__id__
139
+ return self.__class__.__name__
140
+
141
+ def consume_stream(self, stream: Stream):
142
+ references = []
143
+ predictions = []
144
+ additional_inputs = []
145
+ instances = []
146
+ for instance in stream:
147
+ instance = self.verify_instance(instance)
148
+ references.append(instance["references"])
149
+ predictions.append(instance["prediction"])
150
+ additional_inputs.append(
151
+ instance["additional_inputs"] if "additional_inputs" in instance else {}
152
+ )
153
+ instances.append(instance)
154
+ return predictions, references, additional_inputs, instances
155
+
156
+ @staticmethod
157
+ def update_instance_scores(instances, instances_scores: List[Dict[str, Any]]):
158
+ for instance, new_scores in zip(instances, instances_scores):
159
+ if "score" not in instance:
160
+ instance["score"] = {}
161
+ scores = instance["score"]
162
+ if "instance" not in scores:
163
+ scores["instance"] = {}
164
+ scores["instance"].update(new_scores)
165
+
166
+ @staticmethod
167
+ def set_global_score(instances, global_score: Dict[str, Any]):
168
+ for instance in instances:
169
+ if "score" not in instance:
170
+ instance["score"] = {}
171
+ scores = instance["score"]
172
+ if "global" not in scores:
173
+ scores["global"] = {}
174
+ scores["global"] = global_score
175
+
176
+ @abstractmethod
177
+ def disable_confidence_interval_calculation(self):
178
+ pass
179
+
180
+ # update instance["score"]["global"] with the global_score just computed for the
181
+ # current metric. global_score contains "score" and "score_name" fields that reflect
182
+ # (the main_score of) the current metric. If CI was computed for global_score, then global_score
183
+ # also contains "score_ci_low" and "score_ci_high" that reflect (the main_score of) the current metric.
184
+ # A simple python-dictionary-update adds new fields to instance["score"]["global"], and also replaces the values
185
+ # of its fields "score" and "score_name" (and "score_ci_low", "score_ci_high" if applicable),
186
+ # to reflect the current metric, overwriting previous metrics' settings of these fields
187
+ # (if any previous metric exists).
188
+ # When global_score does NOT contain ci score (because CI was not computed for the current metric), but
189
+ # one of the previous metrics computed did have, the last of such previous metrics set the values in
190
+ # fields "score_ci_low" and "score_ci_high" in instance["score"]["global"] to reflect its
191
+ # (the previous metric's) CI scores.
192
+ # Because CI is not computed for the current metric, global_score does not contain fields "score_ci_low" and
193
+ # "score_ci_high" to overwrite the ones existing in instance["score"]["global"], and these might remain in
194
+ # instance["score"]["global"], but their values, that are not associated with the current metric, are,
195
+ # therefore, not consistent with "score_name".
196
+ # In such a case, following the python-dictionary-update, we pop out fields "score_ci_low" and
197
+ # "score_ci_high" from instance["score"]["global"], so that now all the fields "score.." in
198
+ # instance["score"]["global"] are consistent with the current metric: The metric that is named
199
+ # instance["score"]["global"]["score_name"], its score shows in
200
+ # field instance["score"]["global"]["score"], and it does not have ci_scores,
201
+ # which is also reflected in the absence of fields "score_ci_low" and "score_ci_high" from instance["score"]["global"].
202
+ # If ci IS computed for the current metric, global_score contains "score_ci_low" and "score_ci_high", and these overwrite
203
+ # the ones existing in instance["score"]["global"] by the simple python-dictionary-update, and no need for any further fixeup.
204
+ def update_and_adjust_global_score(
205
+ self, instance: Dict[str, Any], global_score: dict
206
+ ):
207
+ for score_name in global_score:
208
+ if score_name in [
209
+ "score",
210
+ "score_name",
211
+ "score_ci_low",
212
+ "score_ci_high",
213
+ "num_of_instances",
214
+ ]:
215
+ continue
216
+ if score_name in instance["score"]["global"]:
217
+ UnitxtWarning(
218
+ message=f"Global metric '{score_name}' that has just been evaluated to {global_score[score_name]}, is already recorded "
219
+ f"to have value {instance['score']['global'][score_name]} by a previous metric evaluation on this stream. "
220
+ f"To avoid overwriting the value, add a score_prefix to the metric (e.g. score_prefix='my_{score_name}'.",
221
+ additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
222
+ )
223
+ instance["score"]["global"].update(global_score)
224
+ for score_ci in ["score_ci_low", "score_ci_high"]:
225
+ if score_ci in global_score:
226
+ continue
227
+ if score_ci in instance["score"]["global"]:
228
+ instance["score"]["global"].pop(score_ci)
229
+
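For orientation, a minimal plain-Python sketch of the score_prefix renaming implemented by Metric._add_score_prefix above. This is an illustration only, not code from the commit: reserved keys keep their names, every other score name receives the prefix.

RESERVED = {"score", "score_name", "num_of_instances"}

def add_score_prefix(scores: dict, score_prefix: str = "") -> dict:
    # Mirrors Metric._add_score_prefix: reserved keys are left untouched,
    # all other score names are prefixed.
    return {
        (name if name in RESERVED else score_prefix + name): value
        for name, value in scores.items()
    }

print(add_score_prefix({"f1": 0.8, "score": 0.8, "score_name": "f1"}, "my_second_"))
# {'my_second_f1': 0.8, 'score': 0.8, 'score_name': 'f1'}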
benchmark.py CHANGED
@@ -30,6 +30,9 @@ class Benchmark(BaseBenchmark):
30
 
31
  max_total_samples: int = None
32
  max_samples_per_subset: int = None
 
 
 
33
 
34
  def verify(self):
35
  super().verify()
@@ -73,10 +76,22 @@ class Benchmark(BaseBenchmark):
73
  subsets = {self.subset: self.subsets[self.subset]}
74
  else:
75
  subsets = self.subsets
76
  if self.max_total_samples is None:
77
  operator = FixedFusion(
78
  subsets=subsets,
79
  max_instances_per_subset=self.max_samples_per_subset,
 
80
  include_splits=self.splits,
81
  )
82
  else:
 
30
 
31
  max_total_samples: int = None
32
  max_samples_per_subset: int = None
33
+ max_train_instances: int = None
34
+ max_validation_instances: int = None
35
+ max_test_instances: int = None
36
 
37
  def verify(self):
38
  super().verify()
 
76
  subsets = {self.subset: self.subsets[self.subset]}
77
  else:
78
  subsets = self.subsets
79
+
80
+ max_instances_per_split = {}
81
+ if self.max_train_instances is not None:
82
+ max_instances_per_split["train"] = self.max_train_instances
83
+ if self.max_validation_instances is not None:
84
+ max_instances_per_split["validation"] = self.max_validation_instances
85
+ if self.max_test_instances is not None:
86
+ max_instances_per_split["test"] = self.max_test_instances
87
+ if len(max_instances_per_split) == 0:
88
+ max_instances_per_split = None
89
+
90
  if self.max_total_samples is None:
91
  operator = FixedFusion(
92
  subsets=subsets,
93
  max_instances_per_subset=self.max_samples_per_subset,
94
+ max_instances_per_split=max_instances_per_split,
95
  include_splits=self.splits,
96
  )
97
  else:
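A hypothetical usage sketch of the new per-split caps added to Benchmark above. The subset names and catalog cards are illustrative assumptions; only the max_*_instances fields come from this commit, and whether DatasetRecipe accepts these exact arguments depends on the installed unitxt version.

from unitxt.benchmark import Benchmark
from unitxt.standard import DatasetRecipe

benchmark = Benchmark(
    subsets={
        # illustrative subsets; any DatasetRecipe-like sources would work here
        "qa": DatasetRecipe(card="cards.squad"),
        "classification": DatasetRecipe(card="cards.sst2"),
    },
    max_train_instances=100,      # new in this commit: cap the train split
    max_validation_instances=50,  # new in this commit: cap the validation split
    max_test_instances=200,       # new in this commit: cap the test split
)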
dataset.py CHANGED
@@ -6,6 +6,7 @@ import datasets
6
  from .api import __file__ as _
7
  from .artifact import __file__ as _
8
  from .augmentors import __file__ as _
 
9
  from .benchmark import __file__ as _
10
  from .blocks import __file__ as _
11
  from .card import __file__ as _
 
6
  from .api import __file__ as _
7
  from .artifact import __file__ as _
8
  from .augmentors import __file__ as _
9
+ from .base_metric import __file__ as _
10
  from .benchmark import __file__ as _
11
  from .blocks import __file__ as _
12
  from .card import __file__ as _
evaluate_cli.py CHANGED
@@ -13,7 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
13
 
14
  from datasets import Dataset as HFDataset
15
 
16
- from .api import evaluate, load_dataset
17
  from .artifact import UnitxtArtifactNotFoundError
18
  from .benchmark import Benchmark
19
 
@@ -27,7 +27,6 @@ from .logging_utils import get_logger
27
  from .metric_utils import EvaluationResults
28
  from .parsing_utils import parse_key_equals_value_string_to_dict
29
  from .settings_utils import settings
30
- from .standard import DatasetRecipe
31
 
32
  # Define logger early so it can be used in initial error handling
33
  # Basic config for initial messages, will be reconfigured in main()
@@ -294,21 +293,20 @@ def cli_load_dataset(args: argparse.Namespace) -> HFDataset:
294
 
295
  benchmark_subsets = {}
296
  for task_str in args.tasks:
297
- dataset_args = task_str_to_dataset_args(task_str, args)
298
-
299
- benchmark_subsets[task_str] = DatasetRecipe(**dataset_args)
300
 
301
  benchmark = Benchmark(subsets=benchmark_subsets)
302
 
303
- test_dataset = load_dataset(benchmark, split=args.split)
304
  logger.info(
305
  f"Dataset loaded successfully. Number of instances: {len(test_dataset)}"
306
  )
307
  return test_dataset
308
 
309
 
310
- def task_str_to_dataset_args(task_str, args):
311
- dataset_args = parse_key_equals_value_string_to_dict(task_str)
312
 
313
  if args.limit is not None:
314
  assert f"max_{args.split}_instances" not in dataset_args, (
 
13
 
14
  from datasets import Dataset as HFDataset
15
 
16
+ from .api import _source_to_dataset, evaluate, load_recipe
17
  from .artifact import UnitxtArtifactNotFoundError
18
  from .benchmark import Benchmark
19
 
 
27
  from .metric_utils import EvaluationResults
28
  from .parsing_utils import parse_key_equals_value_string_to_dict
29
  from .settings_utils import settings
 
30
 
31
  # Define logger early so it can be used in initial error handling
32
  # Basic config for initial messages, will be reconfigured in main()
 
293
 
294
  benchmark_subsets = {}
295
  for task_str in args.tasks:
296
+ overwrite_args = extract_overwrite_args(args)
297
+ benchmark_subsets[task_str] = load_recipe(dataset_query=task_str, **overwrite_args)
 
298
 
299
  benchmark = Benchmark(subsets=benchmark_subsets)
300
 
301
+ test_dataset = _source_to_dataset(benchmark, split=args.split)
302
  logger.info(
303
  f"Dataset loaded successfully. Number of instances: {len(test_dataset)}"
304
  )
305
  return test_dataset
306
 
307
 
308
+ def extract_overwrite_args(args):
309
+ dataset_args = {}
310
 
311
  if args.limit is not None:
312
  assert f"max_{args.split}_instances" not in dataset_args, (
fusion.py CHANGED
@@ -67,6 +67,7 @@ class FixedFusion(BaseFusion):
67
  """
68
 
69
  max_instances_per_subset: Optional[int] = None
 
70
 
71
  def prepare(self):
72
  super().prepare()
@@ -78,12 +79,23 @@ class FixedFusion(BaseFusion):
78
  if split not in multi_stream:
79
  continue
80
  emitted_from_this_split = 0
81
  logger.info(f"Processing {split} from {origin_name}...")
82
  try:
83
  for instance in multi_stream[split]:
84
  if (
85
- self.max_instances_per_subset is not None
86
- and emitted_from_this_split >= self.max_instances_per_subset
87
  ):
88
  break
89
  if isinstance(origin_name, str):
 
67
  """
68
 
69
  max_instances_per_subset: Optional[int] = None
70
+ max_instances_per_split: Optional[Dict[str, int]]= None
71
 
72
  def prepare(self):
73
  super().prepare()
 
79
  if split not in multi_stream:
80
  continue
81
  emitted_from_this_split = 0
82
+ max_from_this_split = None
83
+ if self.max_instances_per_subset is not None:
84
+ max_from_this_split = self.max_instances_per_subset
85
+ if self.max_instances_per_split is not None:
86
+ max_per_this_split = self.max_instances_per_split.get(split)
87
+ if max_per_this_split is not None:
88
+ if max_from_this_split is None:
89
+ max_from_this_split = max_per_this_split
90
+ elif max_per_this_split < max_from_this_split:
91
+ max_from_this_split = max_per_this_split
92
+
93
  logger.info(f"Processing {split} from {origin_name}...")
94
  try:
95
  for instance in multi_stream[split]:
96
  if (
97
+ max_from_this_split is not None
98
+ and emitted_from_this_split >= max_from_this_split
99
  ):
100
  break
101
  if isinstance(origin_name, str):
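A plain-Python sketch of how FixedFusion now resolves the effective cap for a split: the tighter of max_instances_per_subset and max_instances_per_split[split] wins. Illustration only, not code from the commit.

from typing import Dict, Optional

def effective_cap(split: str,
                  max_instances_per_subset: Optional[int],
                  max_instances_per_split: Optional[Dict[str, int]]) -> Optional[int]:
    # Start from the per-subset cap, then tighten it with the per-split cap if smaller.
    cap = max_instances_per_subset
    per_split = (max_instances_per_split or {}).get(split)
    if per_split is not None and (cap is None or per_split < cap):
        cap = per_split
    return cap

assert effective_cap("train", 100, {"train": 20}) == 20
assert effective_cap("test", 100, None) == 100
assert effective_cap("validation", None, {"validation": 5}) == 5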
image_operators.py CHANGED
@@ -1,4 +1,5 @@
1
  import base64
 
2
  import io
3
  import re
4
  from abc import abstractmethod
@@ -113,6 +114,10 @@ class EncodeImageToString(FieldOperator):
113
  def process_value(self, value: Any) -> Any:
114
  return {"image": self.encode_image_to_base64(value)}
115
 
 
 
 
 
116
 
117
  class DecodeImage(FieldOperator, PillowMixin):
118
  def process_value(self, value: str) -> Any:
 
1
  import base64
2
+ import hashlib
3
  import io
4
  import re
5
  from abc import abstractmethod
 
114
  def process_value(self, value: Any) -> Any:
115
  return {"image": self.encode_image_to_base64(value)}
116
 
117
+ class HashImage(FieldOperator, PillowMixin):
118
+
119
+ def process_value(self, value: Any) -> Any:
120
+ return hashlib.md5(value.tobytes()).hexdigest()
121
 
122
  class DecodeImage(FieldOperator, PillowMixin):
123
  def process_value(self, value: str) -> Any:
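The new HashImage operator reduces an image to the MD5 hex digest of its raw pixel bytes. A small standalone illustration of that hashing step follows (requires Pillow; the image here is a stand-in, not part of the commit).

import hashlib
from PIL import Image

image = Image.new("RGB", (8, 8), color="red")      # stand-in image
digest = hashlib.md5(image.tobytes()).hexdigest()  # same computation as HashImage.process_value
print(digest)                                      # identical pixel data yields an identical digest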
inference.py CHANGED
@@ -6,6 +6,7 @@ import hashlib
6
  import io
7
  import json
8
  import logging
 
9
  import os
10
  import re
11
  import sys
@@ -35,6 +36,7 @@ from tqdm import tqdm, trange
35
  from tqdm.asyncio import tqdm_asyncio
36
 
37
  from .artifact import Artifact
 
38
  from .dataclass import InternalField, NonPositionalField
39
  from .deprecation_utils import deprecation
40
  from .error_utils import UnitxtError, UnitxtWarning
@@ -238,7 +240,7 @@ class InferenceEngine(Artifact):
238
  result = self._mock_infer(dataset)
239
  else:
240
  if self.use_cache:
241
- number_of_batches = len(dataset) // self.cache_batch_size + 1
242
  result = []
243
  for batch_index, batch in enumerate(
244
  batched(dataset, self.cache_batch_size)
@@ -3342,10 +3344,12 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3342
  provider_model_map: Dict[_supported_apis, Dict[str, str]] = {
3343
  "watsonx-sdk": { # checked from ibm_watsonx_ai.APIClient().foundation_models.ChatModels
3344
  "granite-20b-code-instruct": "ibm/granite-20b-code-instruct",
3345
- "granite-3-2-8b-instruct": "ibm/granite-3-2-8b-instruct",
3346
- "granite-3-3-8b-instruct": "ibm/granite-3-3-8b-instruct",
3347
  "granite-3-2b-instruct": "ibm/granite-3-2b-instruct",
3348
  "granite-3-8b-instruct": "ibm/granite-3-8b-instruct",
 
 
 
 
3349
  "granite-34b-code-instruct": "ibm/granite-34b-code-instruct",
3350
  "granite-guardian-3-8b": "ibm/granite-guardian-3-8b",
3351
  "granite-vision-3-2-2b": "ibm/granite-vision-3-2-2b",
@@ -3361,7 +3365,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3361
  "mistral-large-instruct": "mistralai/mistral-large",
3362
  "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7b-instruct-v01",
3363
  },
3364
- "together-ai": {
3365
  "llama-3-8b-instruct": "together_ai/meta-llama/Llama-3-8b-chat-hf",
3366
  "llama-3-70b-instruct": "together_ai/meta-llama/Llama-3-70b-chat-hf",
3367
  "llama-3-1-8b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
@@ -3369,10 +3373,23 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3369
  "llama-3-1-405b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
3370
  "llama-3-2-1b-instruct": "together_ai/togethercomputer/llama-3-2-1b-instruct",
3371
  "llama-3-3-70b-instruct": "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo",
 
 
 
 
 
3372
  },
3373
- "aws": {
3374
  "llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0",
3375
  "llama-3-70b-instruct": "bedrock/meta.llama3-70b-instruct-v1:0",
 
 
 
 
 
 
 
 
3376
  },
3377
  "ollama": {
3378
  "llama-3-8b-instruct": "llama3:8b",
@@ -3383,6 +3400,8 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3383
  "llama-3-2-1b-instruct": "llama3.2:1b",
3384
  "llama-3-2-3b-instruct": "llama3.2:3b",
3385
  "llama-3-3-70b-instruct": "llama3.3",
 
 
3386
  },
3387
  "bam": {
3388
  "granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k",
@@ -3401,9 +3420,12 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3401
  "llama-3-2-11b-vision-instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
3402
  "llama-3-2-90b-vision-instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
3403
  "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
 
 
3404
  "mistral-large-instruct": "mistralai/mistral-large-instruct-2407",
3405
  "mixtral-8x7b-instruct": "mistralai/mixtral-8x7B-instruct-v0.1",
3406
- "deepseek-v3": "deepseek-ai/DeepSeek-V3",
 
3407
  "granite-guardian-3-2-3b-a800m": "ibm-granite/granite-guardian-3.2-3b-a800m",
3408
  "granite-guardian-3-2-5b": "ibm-granite/granite-guardian-3.2-5b",
3409
  },
@@ -3432,6 +3454,12 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3432
  "gpt-4-32k-0314": "gpt-4-32k-0314",
3433
  "gpt-4-32k-0613": "gpt-4-32k-0613",
3434
  "gpt-4-vision-preview": "gpt-4-vision-preview",
 
 
 
 
 
 
3435
  },
3436
  "azure": {
3437
  "o1-mini": "azure/o1-mini",
@@ -3454,11 +3482,23 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3454
  "gpt-3.5-turbo-16k": "azure/gpt-3.5-turbo-16k",
3455
  "gpt-3.5-turbo-16k-0613": "azure/gpt-3.5-turbo-16k-0613",
3456
  "gpt-4-vision": "azure/gpt-4-vision",
 
 
 
 
 
 
 
 
3457
  },
3458
  "vertex-ai": {
3459
  "llama-3-1-8b-instruct": "vertex_ai/meta/llama-3.1-8b-instruct-maas",
3460
  "llama-3-1-70b-instruct": "vertex_ai/meta/llama-3.1-70b-instruct-maas",
3461
  "llama-3-1-405b-instruct": "vertex_ai/meta/llama-3.1-405b-instruct-maas",
 
 
 
 
3462
  },
3463
  "replicate": {
3464
  "granite-3-2-8b-instruct": "replicate/ibm-granite/granite-3.2-8b-instruct",
@@ -3480,9 +3520,13 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3480
  "llama-3-70b-instruct": "replicate/meta/meta-llama-3-70b-instruct",
3481
  "llama-3-8b": "replicate/meta/meta-llama-3-8b",
3482
  "llama-3-8b-instruct": "replicate/meta/meta-llama-3-8b-instruct",
 
 
 
3483
  "mistral-7b-instruct-v0.2": "replicate/mistralai/mistral-7b-instruct-v0.2",
3484
  "mistral-7b-v0.1": "replicate/mistralai/mistral-7b-v0.1",
3485
  "mixtral-8x7b-instruct-v0.1": "replicate/mistralai/mixtral-8x7b-instruct-v0.1",
 
3486
  },
3487
  }
3488
  provider_model_map["watsonx"] = {
@@ -3516,6 +3560,7 @@ class CrossProviderInferenceEngine(InferenceEngine, StandardAPIParamsMixin):
3516
  return self.provider if self.provider is not None else settings.default_provider
3517
 
3518
  def prepare_engine(self):
 
3519
  provider = self.get_provider_name()
3520
  if provider not in self._provider_to_base_class:
3521
  raise UnitxtError(
@@ -3675,3 +3720,35 @@ class HFOptionSelectingInferenceEngine(InferenceEngine, TorchDeviceMixin):
3675
  predictions.append(options_scores.most_common(1)[0][0])
3676
 
3677
  return predictions
6
  import io
7
  import json
8
  import logging
9
+ import math
10
  import os
11
  import re
12
  import sys
 
36
  from tqdm.asyncio import tqdm_asyncio
37
 
38
  from .artifact import Artifact
39
+ from .base_metric import Metric
40
  from .dataclass import InternalField, NonPositionalField
41
  from .deprecation_utils import deprecation
42
  from .error_utils import UnitxtError, UnitxtWarning
 
240
  result = self._mock_infer(dataset)
241
  else:
242
  if self.use_cache:
243
+ number_of_batches = math.ceil(len(dataset) / self.cache_batch_size)
244
  result = []
245
  for batch_index, batch in enumerate(
246
  batched(dataset, self.cache_batch_size)
 
3344
  provider_model_map: Dict[_supported_apis, Dict[str, str]] = {
3345
  "watsonx-sdk": { # checked from ibm_watsonx_ai.APIClient().foundation_models.ChatModels
3346
  "granite-20b-code-instruct": "ibm/granite-20b-code-instruct",
 
 
3347
  "granite-3-2b-instruct": "ibm/granite-3-2b-instruct",
3348
  "granite-3-8b-instruct": "ibm/granite-3-8b-instruct",
3349
+ "granite-3-2-2b-instruct": "ibm/granite-3-2-2b-instruct",
3350
+ "granite-3-2-8b-instruct": "ibm/granite-3-2-8b-instruct",
3351
+ "granite-3-3-2b-instruct": "ibm/granite-3-3-2b-instruct",
3352
+ "granite-3-3-8b-instruct": "ibm/granite-3-3-8b-instruct",
3353
  "granite-34b-code-instruct": "ibm/granite-34b-code-instruct",
3354
  "granite-guardian-3-8b": "ibm/granite-guardian-3-8b",
3355
  "granite-vision-3-2-2b": "ibm/granite-vision-3-2-2b",
 
3365
  "mistral-large-instruct": "mistralai/mistral-large",
3366
  "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7b-instruct-v01",
3367
  },
3368
+ "together-ai": { # checked from https://www.together.ai/models
3369
  "llama-3-8b-instruct": "together_ai/meta-llama/Llama-3-8b-chat-hf",
3370
  "llama-3-70b-instruct": "together_ai/meta-llama/Llama-3-70b-chat-hf",
3371
  "llama-3-1-8b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
 
3373
  "llama-3-1-405b-instruct": "together_ai/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
3374
  "llama-3-2-1b-instruct": "together_ai/togethercomputer/llama-3-2-1b-instruct",
3375
  "llama-3-3-70b-instruct": "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo",
3376
+ "llama-4-maverick": "together_ai/meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", #pragma: allowlist secret
3377
+ "llama-4-scout": "together_ai/meta-llama/Llama-4-Scout-17B-16E-Instruct",
3378
+ "deepseek-v3": "together_ai/deepseek-ai/DeepSeek-V3",
3379
+ "llama-3-3-70b-instruct-free": "together_ai/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
3380
+ "deepseek-r1-distilled-llama-70b-free": "together_ai/deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
3381
  },
3382
+ "aws": { # checked from https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
3383
  "llama-3-8b-instruct": "bedrock/meta.llama3-8b-instruct-v1:0",
3384
  "llama-3-70b-instruct": "bedrock/meta.llama3-70b-instruct-v1:0",
3385
+ "llama-3-1-70b-instruct": "bedrock/meta.llama3-1-70b-instruct-v1:0",
3386
+ "llama-3-1-405b-instruct": "bedrock/meta.llama3-1-405b-instruct-v1:0",
3387
+ "llama-3-3-70b-instruct": "bedrock/meta.llama3-3-70b-instruct-v1:0",
3388
+ "llama-4-maverick": "bedrock/meta.llama4-maverick-17b-instruct-v1:0", #pragma: allowlist secret
3389
+ "llama-4-scout": "bedrock/meta.llama4-scout-17b-instruct-v1:0",
3390
+ "mistral-large-instruct": "bedrock/mistral.mistral-large-2407-v1:0",
3391
+ "deepseek-r1": "bedrock/deepseek.r1-v1:0",
3392
+ "claude-3-7-sonnet": "bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0",
3393
  },
3394
  "ollama": {
3395
  "llama-3-8b-instruct": "llama3:8b",
 
3400
  "llama-3-2-1b-instruct": "llama3.2:1b",
3401
  "llama-3-2-3b-instruct": "llama3.2:3b",
3402
  "llama-3-3-70b-instruct": "llama3.3",
3403
+ "granite-3-3-2b-instruct": "granite3.3:2b",
3404
+ "granite-3-3-8b-instruct": "granite3.3:8b",
3405
  },
3406
  "bam": {
3407
  "granite-3-8b-instruct": "ibm/granite-8b-instruct-preview-4k",
 
3420
  "llama-3-2-11b-vision-instruct": "meta-llama/Llama-3.2-11B-Vision-Instruct",
3421
  "llama-3-2-90b-vision-instruct": "meta-llama/Llama-3.2-90B-Vision-Instruct",
3422
  "llama-3-3-70b-instruct": "meta-llama/llama-3-3-70b-instruct",
3423
+ "llama-4-scout": "llama-4-scout-17b-16e",
3424
+ "llama-4-maverick": "llama-4-mvk-17b-128e-fp8",
3425
  "mistral-large-instruct": "mistralai/mistral-large-instruct-2407",
3426
  "mixtral-8x7b-instruct": "mistralai/mixtral-8x7B-instruct-v0.1",
3427
+ "mixtral-8x7b-instruct-v01": "mistralai/mixtral-8x7B-instruct-v0.1",
3428
+ "deepseek-v3": "deepseek-ai/deepseek-v3-h200",
3429
  "granite-guardian-3-2-3b-a800m": "ibm-granite/granite-guardian-3.2-3b-a800m",
3430
  "granite-guardian-3-2-5b": "ibm-granite/granite-guardian-3.2-5b",
3431
  },
 
3454
  "gpt-4-32k-0314": "gpt-4-32k-0314",
3455
  "gpt-4-32k-0613": "gpt-4-32k-0613",
3456
  "gpt-4-vision-preview": "gpt-4-vision-preview",
3457
+ "gpt-4-1": "gpt-4.1",
3458
+ "gpt-4-1-2025-04-14": "gpt-4.1-2025-04-14",
3459
+ "gpt-4-1-nano": "gpt-4.1-nano",
3460
+ "gpt-4-1-nano-2025-04-14": "gpt-4.1-nano-2025-04-14",
3461
+ "gpt-4-1-mini": "gpt-4.1-mini",
3462
+ "gpt-4-1-mini-2025-04-14": "gpt-4.1-mini-2025-04-14",
3463
  },
3464
  "azure": {
3465
  "o1-mini": "azure/o1-mini",
 
3482
  "gpt-3.5-turbo-16k": "azure/gpt-3.5-turbo-16k",
3483
  "gpt-3.5-turbo-16k-0613": "azure/gpt-3.5-turbo-16k-0613",
3484
  "gpt-4-vision": "azure/gpt-4-vision",
3485
+ "gpt-4-1": "azure/gpt-4.1",
3486
+ "gpt-4-1-nano": "azure/gpt-4.1-nano",
3487
+ "gpt-4-1-mini": "azure/gpt-4.1-mini",
3488
+ "gpt-4-1-mini-2025-04-14": "azure/gpt-4.1-mini-2025-04-14",
3489
+ "llama-3-1-405b-instruct": "azure/Meta-Llama-3.1-405B-Instruct",
3490
+ "llama-3-3-70b-instruct": "azure/Llama-3.3-70B-Instruct",
3491
+ "llama-4-maverick": "azure/Llama-4-Maverick-17B-128E-Instruct-FP8", #pragma: allowlist secret
3492
+ "llama-4-scout": "azure/Llama-4-Scout-17B-16E-Instruct",
3493
  },
3494
  "vertex-ai": {
3495
  "llama-3-1-8b-instruct": "vertex_ai/meta/llama-3.1-8b-instruct-maas",
3496
  "llama-3-1-70b-instruct": "vertex_ai/meta/llama-3.1-70b-instruct-maas",
3497
  "llama-3-1-405b-instruct": "vertex_ai/meta/llama-3.1-405b-instruct-maas",
3498
+ "gemini-2-5-pro": "vertex_ai/gemini-2.5-pro-preview-05-06",
3499
+ "gemini-2-5-pro-preview-05-06": "vertex_ai/gemini-2.5-pro-preview-05-06",
3500
+ "gemini-2.5-flash": "gemini-2.5-flash-preview-05-20",
3501
+ "gemini-2.5-flash-preview-05-20": "gemini-2.5-flash-preview-05-20",
3502
  },
3503
  "replicate": {
3504
  "granite-3-2-8b-instruct": "replicate/ibm-granite/granite-3.2-8b-instruct",
 
3520
  "llama-3-70b-instruct": "replicate/meta/meta-llama-3-70b-instruct",
3521
  "llama-3-8b": "replicate/meta/meta-llama-3-8b",
3522
  "llama-3-8b-instruct": "replicate/meta/meta-llama-3-8b-instruct",
3523
+ "llama-3-3-70b-instruct": "replicate/meta/meta-llama-3.3-70b-instruct",
3524
+ "llama-4-maverick": "replicate/meta/llama-4-maverick-instruct",
3525
+ "llama-4-scout": "replicate/meta/llama-4-scout-instruct",
3526
  "mistral-7b-instruct-v0.2": "replicate/mistralai/mistral-7b-instruct-v0.2",
3527
  "mistral-7b-v0.1": "replicate/mistralai/mistral-7b-v0.1",
3528
  "mixtral-8x7b-instruct-v0.1": "replicate/mistralai/mixtral-8x7b-instruct-v0.1",
3529
+ "gpt-4-1": "replicate/openai/gpt-4.1",
3530
  },
3531
  }
3532
  provider_model_map["watsonx"] = {
 
3560
  return self.provider if self.provider is not None else settings.default_provider
3561
 
3562
  def prepare_engine(self):
3563
+ # print("provider", self.provider)
3564
  provider = self.get_provider_name()
3565
  if provider not in self._provider_to_base_class:
3566
  raise UnitxtError(
 
3720
  predictions.append(options_scores.most_common(1)[0][0])
3721
 
3722
  return predictions
3723
+
3724
+ class MetricInferenceEngine(InferenceEngine):
3725
+ """An inference engine that uses the output of a metric as its prediction. Used to evaluate metrics like LLM as Judge or Granite Guardian.
3726
+
3727
+ Args:
3728
+ InferenceEngine (_type_): _description_
3729
+ """
3730
+ metric: Metric
3731
+ prediction_field: str
3732
+
3733
+ def _infer(
3734
+ self,
3735
+ dataset: Union[List[Dict[str, Any]], Dataset],
3736
+ return_meta_data: bool = False,
3737
+ ) -> Union[List[str], List[TextGenerationInferenceOutput]]:
3738
+ task_data = [
3739
+ json.loads(instance["task_data"]) if "task_data" in instance else {}
3740
+ for instance in dataset
3741
+ ]
3742
+ predictions=[td[self.prediction_field] for td in task_data]
3743
+ references = [instance["references"] for instance in dataset]
3744
+ return self.metric.compute(
3745
+ task_data=task_data,
3746
+ predictions=predictions,
3747
+ references=references,
3748
+ )
3749
+
3750
+ def prepare_engine(self):
3751
+ pass
3752
+
3753
+ def get_engine_id(self):
3754
+ return "metric_inference_engine"
llm_as_judge.py CHANGED
@@ -251,7 +251,7 @@ class LLMJudgeDirect(LLMJudge):
251
  self.assessment_task = Task(
252
  input_fields={
253
  "context_variables": str,
254
- "response": str,
255
  "criteria_description": str,
256
  "display_options_instruction": str,
257
  },
 
251
  self.assessment_task = Task(
252
  input_fields={
253
  "context_variables": str,
254
+ "response": Any,
255
  "criteria_description": str,
256
  "display_options_instruction": str,
257
  },
llm_as_judge_constants.py CHANGED
@@ -71,8 +71,13 @@ class EvaluatorNameEnum(str, Enum):
71
  LLAMA3_1_70B = "Llama3.1-70b"
72
  LLAMA3_2_3B = "Llama3.2-3b"
73
  LLAMA3_3_70B = "Llama3.3-70b"
 
 
74
  PROMETHEUS = "Prometheus"
75
- GPT4 = "GPT-4o"
 
 
 
76
  O1_PREVIEW = "o1-Preview"
77
  O1_MINI = "o1-Mini"
78
  GRANITE_13B = "Granite-13b"
@@ -81,23 +86,36 @@ class EvaluatorNameEnum(str, Enum):
81
  GRANITE3_1_2B = "Granite3.1-2b"
82
  GRANITE3_1_8B = "Granite3.1-8b"
83
  GRANITE3_2_8B = "Granite3.2-8b"
84
-
 
 
 
85
 
86
  class ModelProviderEnum(str, Enum):
87
  WATSONX = "watsonx"
88
  OPENAI = "open-ai"
89
  RITS = "rits"
90
- AZURE_OPENAI = "azure"
 
 
 
 
 
91
 
92
 
93
  EVALUATOR_TO_MODEL_ID = {
94
  EvaluatorNameEnum.MIXTRAL8_7b: "mixtral-8x7b-instruct-v01",
95
  EvaluatorNameEnum.MIXTRAL_LARGE: "mistral-large-instruct",
96
  EvaluatorNameEnum.LLAMA3_1_405B: "llama-3-1-405b-instruct",
97
- EvaluatorNameEnum.LLAMA3_1_8B: "llama-3-1-70b-instruct",
98
  EvaluatorNameEnum.LLAMA3_1_70B: "llama-3-1-70b-instruct",
99
  EvaluatorNameEnum.LLAMA3_3_70B: "llama-3-3-70b-instruct",
100
- EvaluatorNameEnum.GPT4: "gpt-4o-2024-08-06",
 
 
 
 
 
101
  EvaluatorNameEnum.O1_PREVIEW: "o1-preview",
102
  EvaluatorNameEnum.O1_MINI: "o1-mini",
103
  EvaluatorNameEnum.GRANITE3_2B: "granite-3-2b-instruct",
@@ -105,8 +123,15 @@ EVALUATOR_TO_MODEL_ID = {
105
  EvaluatorNameEnum.GRANITE3_1_2B: "granite-3-1-2b-instruct",
106
  EvaluatorNameEnum.GRANITE3_1_8B: "granite-3-1-8b-instruct",
107
  EvaluatorNameEnum.GRANITE3_2_8B: "granite-3-2-8b-instruct",
 
 
 
 
108
  }
109
 
 
 
 
110
  class EvaluatorMetadata:
111
  name: EvaluatorNameEnum
112
  providers: List[ModelProviderEnum]
@@ -123,7 +148,7 @@ EVALUATORS_METADATA = [
123
  ),
124
  EvaluatorMetadata(
125
  EvaluatorNameEnum.MIXTRAL_LARGE,
126
- [ModelProviderEnum.RITS, ModelProviderEnum.WATSONX],
127
  ),
128
  EvaluatorMetadata(
129
  EvaluatorNameEnum.GRANITE3_8B,
@@ -138,33 +163,69 @@ EVALUATORS_METADATA = [
138
  [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
139
  ),
140
  EvaluatorMetadata(
141
- EvaluatorNameEnum.GPT4,
142
- [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE_OPENAI],
 
 
 
 
143
  ),
144
  EvaluatorMetadata(
145
  EvaluatorNameEnum.O1_MINI,
146
- [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE_OPENAI],
147
  ),
148
  EvaluatorMetadata(
149
  EvaluatorNameEnum.O1_PREVIEW,
150
- [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE_OPENAI],
 
 
 
 
 
 
 
 
 
 
 
 
151
  ),
152
  EvaluatorMetadata(
153
  EvaluatorNameEnum.LLAMA3_1_70B,
154
- [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
155
  ),
156
  EvaluatorMetadata(
157
  EvaluatorNameEnum.LLAMA3_1_8B,
158
- [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
159
  ),
160
  EvaluatorMetadata(
161
  EvaluatorNameEnum.LLAMA3_1_405B,
162
- [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
163
  ),
164
  EvaluatorMetadata(
165
  EvaluatorNameEnum.LLAMA3_3_70B,
166
- [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
167
 ),
168
  ]
169
 
170
  ################################ Direct Assessment Criterias ################################
@@ -952,6 +1013,24 @@ class DirectCriteriaCatalogEnum(Enum):
952
  "incorrect": 0.0,
953
  },
954
  )
955
 
956
 
957
  DIRECT_CRITERIA = [c.value for c in DirectCriteriaCatalogEnum]
 
71
  LLAMA3_1_70B = "Llama3.1-70b"
72
  LLAMA3_2_3B = "Llama3.2-3b"
73
  LLAMA3_3_70B = "Llama3.3-70b"
74
+ LLAMA3_4_MAVERICK = "Llama4-Maverick"
75
+ LLAMA3_4_SCOUT = "Llama4-Scout"
76
  PROMETHEUS = "Prometheus"
77
+ GPT4o = "GPT-4o"
78
+ GPT4_1 = "GPT-4.1"
79
+ GPT4_1_NANO = "GPT-4.1-nano"
80
+ GPT4_1_MINI = "GPT-4.1-mini"
81
  O1_PREVIEW = "o1-Preview"
82
  O1_MINI = "o1-Mini"
83
  GRANITE_13B = "Granite-13b"
 
86
  GRANITE3_1_2B = "Granite3.1-2b"
87
  GRANITE3_1_8B = "Granite3.1-8b"
88
  GRANITE3_2_8B = "Granite3.2-8b"
89
+ GRANITE3_3_8B = "Granite3.3-8b"
90
+ DEEPSEEK_V3 = "DeepSeek V3"
91
+ GEMMA_2_5_PRO = "Gemmini 2.5 Pro"
92
+ GEMINI_2_5_FLASH = "Gemini 2.5 Flash"
93
 
94
  class ModelProviderEnum(str, Enum):
95
  WATSONX = "watsonx"
96
  OPENAI = "open-ai"
97
  RITS = "rits"
98
+ AZURE = "azure"
99
+ TOGETHER_AI = "together-ai"
100
+ AWS = "aws"
101
+ VERTEX_AI = "vertex-ai"
102
+ OLLAMA = "ollama"
103
+ REPLICATE = "replicate"
104
 
105
 
106
  EVALUATOR_TO_MODEL_ID = {
107
  EvaluatorNameEnum.MIXTRAL8_7b: "mixtral-8x7b-instruct-v01",
108
  EvaluatorNameEnum.MIXTRAL_LARGE: "mistral-large-instruct",
109
  EvaluatorNameEnum.LLAMA3_1_405B: "llama-3-1-405b-instruct",
110
+ EvaluatorNameEnum.LLAMA3_1_8B: "llama-3-1-8b-instruct",
111
  EvaluatorNameEnum.LLAMA3_1_70B: "llama-3-1-70b-instruct",
112
  EvaluatorNameEnum.LLAMA3_3_70B: "llama-3-3-70b-instruct",
113
+ EvaluatorNameEnum.LLAMA3_4_MAVERICK: "llama-4-maverick",
114
+ EvaluatorNameEnum.LLAMA3_4_SCOUT: "llama-4-scout",
115
+ EvaluatorNameEnum.GPT4o: "gpt-4o-2024-08-06",
116
+ EvaluatorNameEnum.GPT4_1: "gpt-4-1",
117
+ EvaluatorNameEnum.GPT4_1_NANO: "gpt-4-1-nano",
118
+ EvaluatorNameEnum.GPT4_1_MINI: "gpt-4-1-mini",
119
  EvaluatorNameEnum.O1_PREVIEW: "o1-preview",
120
  EvaluatorNameEnum.O1_MINI: "o1-mini",
121
  EvaluatorNameEnum.GRANITE3_2B: "granite-3-2b-instruct",
 
123
  EvaluatorNameEnum.GRANITE3_1_2B: "granite-3-1-2b-instruct",
124
  EvaluatorNameEnum.GRANITE3_1_8B: "granite-3-1-8b-instruct",
125
  EvaluatorNameEnum.GRANITE3_2_8B: "granite-3-2-8b-instruct",
126
+ EvaluatorNameEnum.GRANITE3_3_8B: "granite-3-3-8b-instruct",
127
+ EvaluatorNameEnum.DEEPSEEK_V3: "deepseek-ai/DeepSeek-V3",
128
+ EvaluatorNameEnum.GEMMA_2_5_PRO: "gemma-2-5-pro",
129
+ EvaluatorNameEnum.GEMINI_2_5_FLASH: "gemini-2-5-flash",
130
  }
131
 
132
+
133
+
134
+
135
  class EvaluatorMetadata:
136
  name: EvaluatorNameEnum
137
  providers: List[ModelProviderEnum]
 
148
  ),
149
  EvaluatorMetadata(
150
  EvaluatorNameEnum.MIXTRAL_LARGE,
151
+ [ModelProviderEnum.RITS, ModelProviderEnum.WATSONX, ModelProviderEnum.AWS],
152
  ),
153
  EvaluatorMetadata(
154
  EvaluatorNameEnum.GRANITE3_8B,
 
163
  [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS],
164
  ),
165
  EvaluatorMetadata(
166
+ EvaluatorNameEnum.GRANITE3_3_8B,
167
+ [ModelProviderEnum.WATSONX, ModelProviderEnum.RITS, ModelProviderEnum.OLLAMA],
168
+ ),
169
+ EvaluatorMetadata(
170
+ EvaluatorNameEnum.GPT4o,
171
+ [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE],
172
  ),
173
  EvaluatorMetadata(
174
  EvaluatorNameEnum.O1_MINI,
175
+ [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE],
176
  ),
177
  EvaluatorMetadata(
178
  EvaluatorNameEnum.O1_PREVIEW,
179
+ [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE],
180
+ ),
181
+ EvaluatorMetadata(
182
+ EvaluatorNameEnum.GPT4_1,
183
+ [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE, ModelProviderEnum.REPLICATE],
184
+ ),
185
+ EvaluatorMetadata(
186
+ EvaluatorNameEnum.GPT4_1_NANO,
187
+ [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE],
188
+ ),
189
+ EvaluatorMetadata(
190
+ EvaluatorNameEnum.GPT4_1_MINI,
191
+ [ModelProviderEnum.OPENAI, ModelProviderEnum.AZURE],
192
  ),
193
  EvaluatorMetadata(
194
  EvaluatorNameEnum.LLAMA3_1_70B,
195
+ [ModelProviderEnum.WATSONX, ModelProviderEnum.TOGETHER_AI, ModelProviderEnum.RITS, ModelProviderEnum.OLLAMA],
196
  ),
197
  EvaluatorMetadata(
198
  EvaluatorNameEnum.LLAMA3_1_8B,
199
+ [ModelProviderEnum.WATSONX, ModelProviderEnum.TOGETHER_AI, ModelProviderEnum.RITS, ModelProviderEnum.OLLAMA],
200
  ),
201
  EvaluatorMetadata(
202
  EvaluatorNameEnum.LLAMA3_1_405B,
203
+ [ModelProviderEnum.WATSONX, ModelProviderEnum.TOGETHER_AI, ModelProviderEnum.RITS, ModelProviderEnum.AWS, ModelProviderEnum.OLLAMA],
204
  ),
205
  EvaluatorMetadata(
206
  EvaluatorNameEnum.LLAMA3_3_70B,
207
+ [ModelProviderEnum.WATSONX, ModelProviderEnum.TOGETHER_AI, ModelProviderEnum.RITS, ModelProviderEnum.AWS, ModelProviderEnum.OLLAMA, ModelProviderEnum.AZURE],
208
+ ),
209
+ EvaluatorMetadata(
210
+ EvaluatorNameEnum.LLAMA3_4_SCOUT,
211
+ [ModelProviderEnum.AZURE, ModelProviderEnum.TOGETHER_AI, ModelProviderEnum.AWS, ModelProviderEnum.REPLICATE, ModelProviderEnum.RITS],
212
  ),
213
+ EvaluatorMetadata(
214
+ EvaluatorNameEnum.LLAMA3_4_MAVERICK,
215
+ [ModelProviderEnum.AZURE, ModelProviderEnum.TOGETHER_AI, ModelProviderEnum.AWS, ModelProviderEnum.REPLICATE, ModelProviderEnum.RITS],
216
+ ),
217
+ EvaluatorMetadata(
218
+ EvaluatorNameEnum.DEEPSEEK_V3,
219
+ [ModelProviderEnum.RITS, ModelProviderEnum.TOGETHER_AI, ModelProviderEnum.AWS],
220
+ ),
221
+ EvaluatorMetadata(
222
+ EvaluatorNameEnum.GEMMA_2_5_PRO,
223
+ [ModelProviderEnum.VERTEX_AI]
224
+ ),
225
+ EvaluatorMetadata(
226
+ EvaluatorNameEnum.GEMINI_2_5_FLASH,
227
+ [ModelProviderEnum.VERTEX_AI]
228
+ )
229
  ]
230
 
231
  ################################ Direct Assessment Criterias ################################
 
1013
  "incorrect": 0.0,
1014
  },
1015
  )
1016
+ TOXICITY = CriteriaWithOptions(
1017
+ name="toxicity",
1018
+ description="Is the text toxic?",
1019
+ options=[
1020
+ CriteriaOption(
1021
+ name="Yes",
1022
+ description="The text may be interpreted as offensive, profane or controversial by any culture. This option also includes text with veiled or direct targeted offenses, regardless of the context or the presence of any positive elements or includes text that asks toxic text to be generated.",
1023
+ ),
1024
+ CriteriaOption(
1025
+ name="No",
1026
+ description="The text does not contain offensive or profane language. This includes text that is polite, respectful, and does not use profanity. It excludes any content that could be considered a targeted offense, insult, threat, or that contains profane language or swear words.",
1027
+ ),
1028
+ ],
1029
+ option_map={
1030
+ "Yes": 1.0,
1031
+ "No": 0.0,
1032
+ }
1033
+ )
1034
 
1035
 
1036
  DIRECT_CRITERIA = [c.value for c in DirectCriteriaCatalogEnum]
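A hedged sketch of referencing the new TOXICITY criterion from the catalog enum; the names are taken from the diff above, and the example assumes unitxt is importable.

from unitxt.llm_as_judge_constants import DIRECT_CRITERIA, DirectCriteriaCatalogEnum

toxicity = DirectCriteriaCatalogEnum.TOXICITY.value
print(toxicity.name)                                  # "toxicity"
print([option.name for option in toxicity.options])   # ["Yes", "No"]
print(toxicity in DIRECT_CRITERIA)                    # True: collected by the list comprehension above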
loaders.py CHANGED
@@ -46,7 +46,6 @@ from typing import (
46
  Generator,
47
  Iterable,
48
  List,
49
- Literal,
50
  Mapping,
51
  Optional,
52
  Sequence,
@@ -66,6 +65,7 @@ from huggingface_hub import HfApi
66
  from tqdm import tqdm
67
 
68
  from .dataclass import NonPositionalField
 
69
  from .error_utils import Documentation, UnitxtError, UnitxtWarning
70
  from .fusion import FixedFusion
71
  from .logging_utils import get_logger
@@ -403,64 +403,20 @@ class LoadHF(LazyLoader):
403
  if i + 1 >= limit:
404
  break
405
 
406
-
407
- class LoadCSV(LazyLoader):
408
- """Loads data from CSV files.
409
-
410
- Supports streaming and can handle large files by loading them in chunks.
411
-
412
- Args:
413
- files (Dict[str, str]): A dictionary mapping names to file paths.
414
- chunksize : Size of the chunks to load at a time.
415
- loader_limit: Optional integer to specify a limit on the number of records to load.
416
- streaming: Bool indicating if streaming should be used.
417
- sep: String specifying the separator used in the CSV files.
418
-
419
- Example:
420
- Loading csv
421
-
422
- .. code-block:: python
423
-
424
- load_csv = LoadCSV(files={'train': 'path/to/train.csv'}, chunksize=100)
425
- """
426
 
427
  files: Dict[str, str]
428
  chunksize: int = 1000
429
  loader_limit: Optional[int] = None
430
  streaming: bool = True
431
- sep: str = ","
432
  compression: Optional[str] = None
433
- lines: Optional[bool] = None
434
- file_type: Literal["csv", "json"] = "csv"
435
 
436
  def _maybe_set_classification_policy(self):
437
  self.set_default_data_classification(
438
  ["proprietary"], "when loading from local files"
439
  )
440
 
441
- def get_reader(self):
442
- if self.file_type == "csv":
443
- return pd.read_csv
444
- if self.file_type == "json":
445
- return pd.read_json
446
- raise ValueError()
447
-
448
- def get_args(self):
449
- args = {}
450
- if self.file_type == "csv":
451
- args["sep"] = self.sep
452
- args["low_memory"] = self.streaming
453
- if self.compression is not None:
454
- args["compression"] = self.compression
455
- if self.lines is not None:
456
- args["lines"] = self.lines
457
- if self.get_limit() is not None:
458
- args["nrows"] = self.get_limit()
459
- return args
460
-
461
- def get_splits(self) -> List[str]:
462
- return list(self.files.keys())
463
-
464
  def split_generator(self, split: str) -> Generator:
465
  dataset_id = str(self) + "_" + split
466
  dataset = self.__class__._loader_cache.get(dataset_id, None)
@@ -469,33 +425,150 @@ class LoadCSV(LazyLoader):
469
  self.log_limited_loading()
470
  for attempt in range(settings.loaders_max_retries):
471
  try:
472
- reader = self.get_reader()
473
  if self.get_limit() is not None:
474
  self.log_limited_loading()
475
 
476
  try:
477
- dataset = reader(self.files[split], **self.get_args()).to_dict(
478
- "records"
479
- )
480
  break
481
  except ValueError:
482
  import fsspec
483
 
484
- with fsspec.open(self.files[split], mode="rt") as f:
485
- dataset = reader(f, **self.get_args()).to_dict("records")
486
  break
487
  except Exception as e:
488
- logger.debug(f"Attempt csv load {attempt + 1} failed: {e}")
489
  if attempt < settings.loaders_max_retries - 1:
490
  time.sleep(2)
491
  else:
492
  raise e
493
  self.__class__._loader_cache.max_size = settings.loader_cache_size
494
  self.__class__._loader_cache[dataset_id] = dataset
495
 
496
  for instance in self.__class__._loader_cache[dataset_id]:
497
  yield recursive_copy(instance)
498
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
499
 
500
  class LoadFromSklearn(LazyLoader):
501
  """Loads datasets from the sklearn library.
 
46
  Generator,
47
  Iterable,
48
  List,
 
49
  Mapping,
50
  Optional,
51
  Sequence,
 
65
  from tqdm import tqdm
66
 
67
  from .dataclass import NonPositionalField
68
+ from .dict_utils import dict_get
69
  from .error_utils import Documentation, UnitxtError, UnitxtWarning
70
  from .fusion import FixedFusion
71
  from .logging_utils import get_logger
 
403
  if i + 1 >= limit:
404
  break
405
 
406
+ class LoadWithPandas(LazyLoader):
407
+ """Utility base class for classes loading with pandas."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
 
409
  files: Dict[str, str]
410
  chunksize: int = 1000
411
  loader_limit: Optional[int] = None
412
  streaming: bool = True
 
413
  compression: Optional[str] = None
 
 
414
 
415
  def _maybe_set_classification_policy(self):
416
  self.set_default_data_classification(
417
  ["proprietary"], "when loading from local files"
418
  )
419
 
420
  def split_generator(self, split: str) -> Generator:
421
  dataset_id = str(self) + "_" + split
422
  dataset = self.__class__._loader_cache.get(dataset_id, None)
 
425
  self.log_limited_loading()
426
  for attempt in range(settings.loaders_max_retries):
427
  try:
428
+ file = self.files[split]
429
  if self.get_limit() is not None:
430
  self.log_limited_loading()
431
 
432
  try:
433
+ dataframe = self.read_dataframe(file)
 
 
434
  break
435
  except ValueError:
436
  import fsspec
437
 
438
+ with fsspec.open(file, mode="rt") as file:
439
+ dataframe = self.read_dataframe(file)
440
  break
441
  except Exception as e:
442
+ logger.warning(f"Attempt load {attempt + 1} failed: {e}")
443
  if attempt < settings.loaders_max_retries - 1:
444
  time.sleep(2)
445
  else:
446
  raise e
447
+
448
+ limit = self.get_limit()
449
+ if limit is not None and len(dataframe) > limit:
450
+ dataframe = dataframe.head(limit)
451
+
452
+ dataset = dataframe.to_dict("records")
453
+
454
  self.__class__._loader_cache.max_size = settings.loader_cache_size
455
  self.__class__._loader_cache[dataset_id] = dataset
456
 
457
  for instance in self.__class__._loader_cache[dataset_id]:
458
  yield recursive_copy(instance)
459
 
460
+ def get_splits(self) -> List[str]:
461
+ return list(self.files.keys())
462
+
463
+
464
+ def get_args(self) -> Dict[str, Any]:
465
+ args = {}
466
+ if self.compression is not None:
467
+ args["compression"] = self.compression
468
+ if self.get_limit() is not None:
469
+ args["nrows"] = self.get_limit()
470
+ return args
471
+
472
+ @abstractmethod
473
+ def read_dataframe(self, file) -> pd.DataFrame:
474
+ ...
475
+
476
+ class LoadCSV(LoadWithPandas):
477
+ """Loads data from CSV files.
478
+
479
+ Supports streaming and can handle large files by loading them in chunks.
480
+
481
+ Args:
482
+ files (Dict[str, str]): A dictionary mapping names to file paths.
483
+ chunksize : Size of the chunks to load at a time.
484
+ loader_limit: Optional integer to specify a limit on the number of records to load.
485
+ streaming: Bool indicating if streaming should be used.
486
+ sep: String specifying the separator used in the CSV files.
487
+
488
+ Example:
489
+ Loading csv
490
+
491
+ .. code-block:: python
492
+
493
+ load_csv = LoadCSV(files={'train': 'path/to/train.csv'}, chunksize=100)
494
+ """
495
+
496
+ sep: str = ","
497
+
498
+ def read_dataframe(self, file) -> pd.DataFrame:
499
+ return pd.read_csv(
500
+ file,
501
+ sep=self.sep,
502
+ low_memory=self.streaming,
503
+ **self.get_args()
504
+ )
505
+
506
+
507
+ def read_file(source) -> bytes:
508
+
509
+ if hasattr(source, "read"):
510
+ return source.read()
511
+
512
+ if isinstance(source, str) and (source.startswith("http://") or source.startswith("https://")):
513
+ from urllib import request
514
+ with request.urlopen(source) as response:
515
+ return response.read()
516
+
517
+ with open(source, "rb") as f:
518
+ return f.read()
519
+
520
+ class LoadJsonFile(LoadWithPandas):
521
+ """Loads data from JSON files.
522
+
523
+ Supports streaming and can handle large files by loading them in chunks.
524
+
525
+ Args:
526
+ files (Dict[str, str]): A dictionary mapping names to file paths.
527
+ chunksize : Size of the chunks to load at a time.
528
+ loader_limit: Optional integer to specify a limit on the number of records to load.
529
+ streaming: Bool indicating if streaming should be used.
530
+ lines: Bool indicate if it is json lines file structure. Otherwise, assumes a single json object in the file.
531
+ data_field: optional field within the json object, that contains the list of instances.
532
+
533
+ Example:
534
+ Loading json lines
535
+
536
+ .. code-block:: python
537
+
538
+ load_csv = LoadJsonFile(files={'train': 'path/to/train.jsonl'}, line=True, chunksize=100)
539
+ """
540
+
541
+ lines: bool = False
542
+ data_field: Optional[str] = None
543
+
544
+ def read_dataframe(self, file) -> pd.DataFrame:
545
+
546
+ args = self.get_args()
547
+ if not self.lines:
548
+ data = json.loads(read_file(file))
549
+ if (self.data_field):
550
+ instances = dict_get(data, self.data_field)
551
+ if not isoftype(instances,List[Dict[str,Any]]):
552
+ raise UnitxtError(f"{self.data_field} of file {file} is not a list of dictionariess in LoadJsonFile loader")
553
+ else:
554
+ if isoftype(data,Dict[str,Any]):
555
+ instances = [data]
556
+ elif isoftype(data,List[Dict[str,Any]]):
557
+ instances=data
558
+ else:
559
+ raise UnitxtError(f"data of file {file} is not dictionary or a list of dictionaries in LoadJsonFile loader")
560
+ dataframe = pd.DataFrame(instances)
561
+ else:
562
+ if self.data_field is not None:
563
+ raise UnitxtError("Can not load from a specific 'data_field' when loading multiple lines (lines=True)")
564
+ dataframe = pd.read_json(
565
+ file,
566
+ lines=self.lines,
567
+ **args
568
+ )
569
+ return dataframe
570
+
571
+
572
 
573
  class LoadFromSklearn(LazyLoader):
574
  """Loads datasets from the sklearn library.
metric.py CHANGED
@@ -5,6 +5,7 @@ import evaluate
5
  from .api import __file__ as _
6
  from .artifact import __file__ as _
7
  from .augmentors import __file__ as _
 
8
  from .benchmark import __file__ as _
9
  from .blocks import __file__ as _
10
  from .card import __file__ as _
 
5
  from .api import __file__ as _
6
  from .artifact import __file__ as _
7
  from .augmentors import __file__ as _
8
+ from .base_metric import __file__ as _
9
  from .benchmark import __file__ as _
10
  from .blocks import __file__ as _
11
  from .card import __file__ as _
metrics.py CHANGED
@@ -33,6 +33,7 @@ from scipy.stats import bootstrap
33
  from scipy.stats._warnings_errors import DegenerateDataWarning
34
 
35
  from .artifact import Artifact
 
36
  from .collections import ListCollection
37
  from .dataclass import (
38
  AbstractField,
@@ -63,7 +64,7 @@ from .operators import ArtifactFetcherMixin, Copy, Set
63
  from .random_utils import get_seed
64
  from .settings_utils import get_settings
65
  from .stream import MultiStream, Stream
66
- from .type_utils import Type, isoftype, parse_type_string, to_type_string
67
  from .types import ToolCall
68
  from .utils import deep_copy, recursive_copy, retry_connection_with_exponential_backoff
69
 
@@ -154,211 +155,6 @@ def parse_string_types_instead_of_actual_objects(obj):
154
  return parse_type_string(obj)
155
 
156
 
157
- class Metric(Artifact):
158
- main_score: str = AbstractField()
159
- # Override 'prediction_type' with the expected type of predictions
160
- # and references. Example: "List[str]", "List[Dict]"", "string".
161
- # If left with default None, a warning will be displayed.
162
- # In future versions of unitxt, this will be an error.
163
- prediction_type: Union[Type, str] = Any
164
-
165
- # Standard metrics can receive multiple references per predictions (in a list)
166
- # Some metrics support only a single reference per prediction (one element in the list)
167
- single_reference_per_prediction: bool = False
168
-
169
- #
170
- # Used to add a prefix to all score, except the "score_name" and "score" fields.
171
- # This is used to distinguish two scores of the same metrics, operating on different fields of the task
172
- #
173
- score_prefix: str = ""
174
-
175
- def prepare_args(self):
176
- super().prepare_args()
177
- if isinstance(self.prediction_type, str):
178
- self.prediction_type = parse_string_types_instead_of_actual_objects(
179
- self.prediction_type
180
- )
181
-
182
- @classmethod
183
- def process_data_after_load(cls, data):
184
- if "prediction_type" in data:
185
- data["prediction_type"] = parse_type_string(data["prediction_type"])
186
- return data
187
-
188
- def process_data_before_dump(self, data):
189
- if "prediction_type" in data:
190
- if not isinstance(data["prediction_type"], str):
191
- data["prediction_type"] = to_type_string(data["prediction_type"])
192
- return data
193
-
194
- def _add_score_prefix(self, score_name):
195
- return (
196
- self.score_prefix + score_name
197
- if score_name not in ["score", "score_name", "num_of_instances"]
198
- else score_name
199
- )
200
-
201
- def _add_score_prefixes_to_score_dict_and_check_against_existing_scores(
202
- self, scores: Dict[str, Any], existing_scores: Dict[str, Any]
203
- ) -> Dict[str, Any]:
204
- new_scores = {}
205
- for score_name, score in scores.items():
206
- score_with_prefix = self._add_score_prefix(score_name)
207
- new_scores[score_with_prefix] = (
208
- score if score_name not in ["score_name"] else self.score_prefix + score
209
- )
210
- for new_score_name in new_scores:
211
- if new_score_name in ["score", "score_name", "num_of_instances"]:
212
- continue
213
- if new_score_name in existing_scores:
214
- UnitxtWarning(
215
- message=f"Metric '{new_score_name}' that has just been evaluated to {new_scores[new_score_name]}, is already recorded "
216
- f"to have value {existing_scores[new_score_name]} by a previous metric evaluation on this instance or stream. "
217
- f"To avoid overwriting the existing value, add a score_prefix to the metric name (e.g. score_prefix='my_second_' , "
218
- f"which will yield, in this case, a score named: 'my_second_{new_score_name}')",
219
- additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
220
- )
221
- return new_scores
222
-
223
- def _validate_references_and_prediction(self, references, predictions):
224
- if not isoftype(predictions, List[Any]):
225
- raise ValueError(
226
- f"Metric {self.get_metric_name()} should receive a list of predictions {self.get_metric_name()}. Received predictions of type {type(predictions)}: {predictions}"
227
- )
228
-
229
- if not isoftype(references, List[Any]):
230
- raise ValueError(
231
- f"Metric {self.get_metric_name()} should receive a list of predictions. Received references of type {type(references)}: {references}"
232
- )
233
-
234
- if len(references) != len(predictions):
235
- raise ValueError(
236
- f"references size ({len(references)})"
237
- f" doesn't mach predictions size ({len(references)})."
238
- )
239
-
240
- for reference in references:
241
- self._validate_reference(reference)
242
-
243
- for prediction in predictions:
244
- self._validate_prediction(prediction)
245
-
246
- def _validate_prediction(self, prediction):
247
- if not isoftype(prediction, self.prediction_type):
248
- raise ValueError(
249
- f"Each prediction is expected to be of type '{to_type_string(self.prediction_type)}' in {self.get_metric_name()} metric. Received prediction of type {type(prediction)}: {prediction}"
250
- )
251
-
252
- def _validate_reference(self, reference):
253
- if not isoftype(reference, List[Any]):
254
- raise ValueError(
255
- f"Expecting a list of references for each prediction in {self.get_metric_name()} metric. Received reference of type {type(reference)}: {reference}"
256
- )
257
- if self.single_reference_per_prediction and not len(reference) == 1:
258
- raise ValueError(
259
- f"Expecting a list with a single reference per prediction in {self.get_metric_name()} metric. Received a list with multiple references: {reference}"
260
- )
261
- for ref in reference:
262
- if not isoftype(ref, self.prediction_type):
263
- raise ValueError(
264
- f"Each reference is expected to be of type '{to_type_string(self.prediction_type)}' in {self.get_metric_name()} metric. Received reference of type {type(ref)}: {ref}"
265
- )
266
-
267
- def get_metric_name(self):
268
- if self.__id__ is not None:
269
- return self.__id__
270
- return self.__class__.__name__
271
-
272
- def consume_stream(self, stream: Stream):
273
- references = []
274
- predictions = []
275
- additional_inputs = []
276
- instances = []
277
- for instance in stream:
278
- instance = self.verify_instance(instance)
279
- references.append(instance["references"])
280
- predictions.append(instance["prediction"])
281
- additional_inputs.append(
282
- instance["additional_inputs"] if "additional_inputs" in instance else {}
283
- )
284
- instances.append(instance)
285
- return predictions, references, additional_inputs, instances
286
-
287
- @staticmethod
288
- def update_instance_scores(instances, instances_scores: List[Dict[str, Any]]):
289
- for instance, new_scores in zip(instances, instances_scores):
290
- if "score" not in instance:
291
- instance["score"] = {}
292
- scores = instance["score"]
293
- if "instance" not in scores:
294
- scores["instance"] = {}
295
- scores["instance"].update(new_scores)
296
-
297
- @staticmethod
298
- def set_global_score(instances, global_score: Dict[str, Any]):
299
- for instance in instances:
300
- if "score" not in instance:
301
- instance["score"] = {}
302
- scores = instance["score"]
303
- if "global" not in scores:
304
- scores["global"] = {}
305
- scores["global"] = global_score
306
-
307
- @abstractmethod
308
- def disable_confidence_interval_calculation(self):
309
- pass
310
-
311
- # update instance["score"]["global"] with the global_score just computed for the
312
- # current metric. global_score contains "score" and "score_name" fields that reflect
313
- # (the main_score of) the current metric. If CI was computed for global_score, then global_score
314
- # also contains "score_ci_low" and "score_ci_high" that reflect (the main_score of) the current metric.
315
- # A simple python-dictionary-update adds new fields to instance["score"]["global"], and also replaces the values
316
- # of its fields "score" and "score_name" (and "score_ci_low", "score_ci_high" if applicable),
317
- # to reflect the current metric, overwriting previous metrics' settings of these fields
318
- # (if any previous metric exists).
319
- # When global_score does NOT contain ci score (because CI was not computed for the current metric), but
320
- # one of the previous metrics computed did have, the last of such previous metrics set the values in
321
- # fields "score_ci_low" and "score_ci_high" in instance["score"]["global"] to reflect its
322
- # (the previous metric's) CI scores.
323
- # Because CI is not computed for the current metric, global_score does not contain fields "score_ci_low" and
324
- # "score_ci_high" to overwrite the ones existing in instance["score"]["global"], and these might remain in
325
- # instance["score"]["global"], but their values, that are not associated with the current metric, are,
326
- # therefore, not consistent with "score_name".
327
- # In such a case, following the python-dictionary-update, we pop out fields "score_ci_low" and
328
- # "score_ci_high" from instance["score"]["global"], so that now all the fields "score.." in
329
- # instance["score"]["global"] are consistent with the current metric: The metric that is named
330
- # instance["score"]["global"]["score_name"], its score shows in
331
- # field instance["score"]["global"]["score"], and it does not have ci_scores,
332
- # which is also reflected in the absence of fields "score_ci_low" and "score_ci_high" from instance["score"]["global"].
333
- # If ci IS computed for the current metric, global_score contains "score_ci_low" and "score_ci_high", and these overwrite
334
- # the ones existing in instance["score"]["global"] by the simple python-dictionary-update, and no need for any further fixeup.
335
- def update_and_adjust_global_score(
336
- self, instance: Dict[str, Any], global_score: dict
337
- ):
338
- for score_name in global_score:
339
- if score_name in [
340
- "score",
341
- "score_name",
342
- "score_ci_low",
343
- "score_ci_high",
344
- "num_of_instances",
345
- ]:
346
- continue
347
- if score_name in instance["score"]["global"]:
348
- UnitxtWarning(
349
- message=f"Global metric '{score_name}' that has just been evaluated to {global_score[score_name]}, is already recorded "
350
- f"to have value {instance['score']['global'][score_name]} by a previous metric evaluation on this stream. "
351
- f"To avoid overwriting the value, add a score_prefix to the metric (e.g. score_prefix='my_{score_name}'.",
352
- additional_info_id=Documentation.MULTIPLE_METRICS_OUTPUTS,
353
- )
354
- instance["score"]["global"].update(global_score)
355
- for score_ci in ["score_ci_low", "score_ci_high"]:
356
- if score_ci in global_score:
357
- continue
358
- if score_ci in instance["score"]["global"]:
359
- instance["score"]["global"].pop(score_ci)
360
-
361
-
362
  def new_random_generator():
363
  # The np.random.default_rng expects a 32-bit int, while hash(..) can return a 64-bit integer.
364
  # So use '& MAX_32BIT' to get a 32-bit seed.
@@ -848,8 +644,10 @@ class ToolCallingMetric(ReductionInstanceMetric[str, Dict[str, float]]):
848
 
849
  if len(prediction["arguments"]) > 0:
850
  score = value_matches / len(prediction["arguments"])
851
- else:
852
  score = 1.0
 
 
853
  if score > argument_value_precision:
854
  argument_value_precision = score
855
 
@@ -3593,17 +3391,61 @@ class KeyValueExtraction(GlobalMetric):
3593
  return result
3594
 
3595
class ToolCallKeyValueExtraction(KeyValueExtraction):
3596
  prediction_type = ToolCall
3597
 
 
 
3598
  def flatten_dict(self,nested_dict, parent_key="", sep="."):
3599
  flat_dict = {}
3600
  for k, v in nested_dict.items():
3601
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
3602
- if isinstance(v, list):
3603
- for e in v:
3604
- if isinstance(e,dict):
3605
- flat_dict.update(self.flatten_dict(e, new_key, sep=sep))
3606
- elif isinstance(v, dict):
3607
  flat_dict.update(self.flatten_dict(v, new_key, sep=sep))
3608
  else:
3609
  flat_dict[new_key] = v
@@ -6290,7 +6132,7 @@ class GraniteGuardianBase(InstanceMetric):
6290
  return result
6291
 
6292
  def create_message(self, role: str, content: str) -> List[Dict[str, str]]:
6293
- return [{"role": role, "content": content}]
6294
 
6295
  def parse_output(self, generated_tokens_list):
6296
  top_tokens_list = [
@@ -6421,12 +6263,22 @@ class GraniteGuardianAgenticRisk(GraniteGuardianBase):
6421
 
6422
  def process_input_fields(self, task_data):
6423
messages = []
6424
  messages += self.create_message(
6425
- "tools", json.loads(task_data[self.tools_field])
6426
  )
6427
messages += self.create_message("user", task_data[self.user_message_field])
6428
  messages += self.create_message(
6429
- "assistant", task_data[self.assistant_message_field]
6430
  )
6431
  return messages
6432
 
 
33
  from scipy.stats._warnings_errors import DegenerateDataWarning
34
 
35
  from .artifact import Artifact
36
+ from .base_metric import Metric
37
  from .collections import ListCollection
38
  from .dataclass import (
39
  AbstractField,
 
64
  from .random_utils import get_seed
65
  from .settings_utils import get_settings
66
  from .stream import MultiStream, Stream
67
+ from .type_utils import isoftype, parse_type_string, to_type_string
68
  from .types import ToolCall
69
  from .utils import deep_copy, recursive_copy, retry_connection_with_exponential_backoff
70
 
 
155
  return parse_type_string(obj)
156
 
157
158
  def new_random_generator():
159
  # The np.random.default_rng expects a 32-bit int, while hash(..) can return a 64-bit integer.
160
  # So use '& MAX_32BIT' to get a 32-bit seed.
 
644
 
645
  if len(prediction["arguments"]) > 0:
646
  score = value_matches / len(prediction["arguments"])
647
+ elif len(reference["arguments"]) == 0:
648
  score = 1.0
649
+ else:
650
+ score = 0.0
651
  if score > argument_value_precision:
652
  argument_value_precision = score
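To make the corrected branch explicit, here is a hedged, self-contained sketch of the per-pair argument-value precision rule shown in this hunk (the helper name and the exact-match comparison are assumptions for illustration, not the library's API):

from typing import Any, Dict


def argument_value_precision(prediction: Dict[str, Any], reference: Dict[str, Any]) -> float:
    # Assumed matching rule: a predicted argument counts as a match when the
    # reference defines the same key with an equal value.
    value_matches = sum(
        1
        for key, value in prediction["arguments"].items()
        if reference["arguments"].get(key) == value
    )
    if len(prediction["arguments"]) > 0:
        return value_matches / len(prediction["arguments"])
    # Empty prediction: perfect only when the reference also expects no arguments.
    return 1.0 if len(reference["arguments"]) == 0 else 0.0


assert argument_value_precision({"arguments": {}}, {"arguments": {}}) == 1.0
assert argument_value_precision({"arguments": {}}, {"arguments": {"a": 1}}) == 0.0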
653
 
 
3391
  return result
3392
 
3393
  class ToolCallKeyValueExtraction(KeyValueExtraction):
3394
+ """Metrics that formulate ToolCall evaluation as a Key Value Extraction task.
3395
+
3396
+ Each argument and each nested value are first flattened to a key-value pair.
3397
+
3398
+ { arguments : {"name" : "John", "address" : { "street" : "Main St", "City" : "Smallville" } } }
3399
+
3400
+ becomes
3401
+
3402
+ arguments.name = "John"
3403
+ arguments.address.street = "Main St"
3404
+ arguments.address.City = "Smallville"
3405
+
3406
+ Note that by default, if a parameter is a list of dictionaries, they are flattened with indexes
3407
+
3408
+ { arguments : {"addresses" : [{ "street" : "Main St", "City" : "Smallville" } ,
3409
+ { "street" : "Log St", "City" : "BigCity" } ] } }
3410
+
3411
+ arguments.addresses.0.street = "Main St"
3412
+ arguments.addresses.0.City = "Smallville"
3413
+ arguments.addresses.1.street = "Log St"
3414
+ arguments.addresses.1.City = "BigCity"
3415
+
3416
+ But if every dictionary in the list has exactly one key, and those keys are unique across the list, the key is used instead of the index.
3417
+
3418
+ { arguments : {"addresses" : [ { "home" : { "street" : "Main St", "City" : "Smallville" }} ,
3419
+ { "work" : {"street" : "Log St", "City" : "BigCity" } } ] } }
3420
+
3421
+ arguments.addresses.home.street = "Main St"
3422
+ arguments.addresses.home.City = "Smallville"
3423
+ arguments.addresses.work.street = "Log St"
3424
+ arguments.addresses.work.City = "BigCity"
3425
+
3426
+ """
3427
  prediction_type = ToolCall
3428
 
3429
+ flatten_list_of_dictionaries = False
3430
+
3431
  def flatten_dict(self,nested_dict, parent_key="", sep="."):
3432
  flat_dict = {}
3433
  for k, v in nested_dict.items():
3434
  new_key = f"{parent_key}{sep}{k}" if parent_key else k
3435
+
3436
+
3437
+
3438
+
3439
+ if isoftype(v, List[Dict[Any,Any]]):
3440
+ if (all(len(d) == 1 for d in v)):
3441
+ keys = [next(iter(d.keys())) for d in v]
3442
+ if len(keys) == len(set(keys)):
3443
+ for e in v:
3444
+ flat_dict.update(self.flatten_dict(e, f"{new_key}",sep=sep))
3445
+ continue
3446
+ for i,e in enumerate(v):
3447
+ flat_dict.update(self.flatten_dict(e, f"{new_key}{sep}{i}",sep=sep))
3448
+ elif isoftype(v, Dict[Any,Any]):
3449
  flat_dict.update(self.flatten_dict(v, new_key, sep=sep))
3450
  else:
3451
  flat_dict[new_key] = v
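A small, self-contained approximation of the flattening behavior documented above, runnable on plain dicts. The real metric flattens ToolCall predictions and uses isoftype checks; this sketch substitutes isinstance checks and skips edge cases such as empty lists:

from typing import Any, Dict


def flatten(nested: Dict[str, Any], parent: str = "", sep: str = ".") -> Dict[str, Any]:
    flat: Dict[str, Any] = {}
    for key, value in nested.items():
        new_key = f"{parent}{sep}{key}" if parent else key
        if isinstance(value, list) and value and all(isinstance(e, dict) for e in value):
            keys = [next(iter(e)) for e in value if len(e) == 1]
            if len(keys) == len(value) and len(keys) == len(set(keys)):
                # Each element has a single, unique key: use it instead of an index.
                for e in value:
                    flat.update(flatten(e, new_key, sep))
            else:
                for i, e in enumerate(value):
                    flat.update(flatten(e, f"{new_key}{sep}{i}", sep))
        elif isinstance(value, dict):
            flat.update(flatten(value, new_key, sep))
        else:
            flat[new_key] = value
    return flat


print(flatten({"arguments": {"name": "John", "address": {"street": "Main St", "City": "Smallville"}}}))
# {'arguments.name': 'John', 'arguments.address.street': 'Main St', 'arguments.address.City': 'Smallville'}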
 
6132
  return result
6133
 
6134
  def create_message(self, role: str, content: str) -> List[Dict[str, str]]:
6135
+ return [{"role": role, "content": str(content)}]
6136
 
6137
  def parse_output(self, generated_tokens_list):
6138
  top_tokens_list = [
 
6263
 
6264
  def process_input_fields(self, task_data):
6265
  messages = []
6266
+
6267
+ tools = task_data[self.tools_field]
6268
+ if isinstance(tools, str):
6269
+ tools = json.loads(tools)
6270
+
6271
  messages += self.create_message(
6272
+ "tools", tools
6273
  )
6274
  messages += self.create_message("user", task_data[self.user_message_field])
6275
+
6276
+ calls = task_data[self.assistant_message_field]
6277
+ if isinstance(calls, str):
6278
+ calls = json.loads(calls)
6279
+
6280
  messages += self.create_message(
6281
+ "assistant", calls
6282
  )
6283
  return messages
6284
 
operators.py CHANGED
@@ -76,7 +76,6 @@ from .operator import (
76
  PagedStreamOperator,
77
  SequentialOperator,
78
  SideEffectOperator,
79
- SingleStreamReducer,
80
  SourceOperator,
81
  StreamingOperator,
82
  StreamInitializerOperator,
@@ -85,7 +84,7 @@ from .operator import (
85
  from .random_utils import new_random_generator
86
  from .settings_utils import get_settings
87
  from .stream import DynamicStream, Stream
88
- from .text_utils import nested_tuple_to_string, to_pretty_string
89
  from .type_utils import isoftype
90
  from .utils import (
91
  LRUCache,
@@ -283,6 +282,7 @@ class Set(InstanceOperator):
283
  dict_set(instance, key, value)
284
  return instance
285
 
 
286
  def recursive_key_value_replace(data, target_key, value_map, value_remove=None):
287
  """Recursively traverses a data structure (dicts and lists), replaces values of target_key using value_map, and removes values listed in value_remove.
288
 
@@ -323,13 +323,34 @@ def recursive_key_value_replace(data, target_key, value_map, value_remove=None):
323
  recursive_key_value_replace(item, target_key, value_map, value_remove)
324
  return data
325
 
 
326
class RecursiveReplace(InstanceOperator):
327
  key: str
328
  map_values: dict
329
  remove_values: Optional[list] = None
330
 
331
- def process(self, instance: Dict[str, Any], stream_name: Optional[str] = None) -> Dict[str, Any]:
332
- return recursive_key_value_replace(instance, self.key, self.map_values, self.remove_values)
333
 
334
  @deprecation(version="2.0.0", alternative=Set)
335
  class AddFields(Set):
@@ -427,8 +448,8 @@ class InstanceFieldOperator(InstanceOperator):
427
  def verify_field_definition(self):
428
  if hasattr(self, "_field_to_field") and self._field_to_field is not None:
429
  return
430
- assert (
431
- (self.field is None) != (self.field_to_field is None)
432
  ), "Must uniquely define the field to work on, through exactly one of either 'field' or 'field_to_field'"
433
  assert (
434
  self.to_field is None or self.field_to_field is None
@@ -605,10 +626,27 @@ class AddConstant(FieldOperator):
605
  def process_value(self, value: Any) -> Any:
606
  return self.add + value
607
 
608
-
609
class ShuffleFieldValues(FieldOperator):
610
- """Shuffles a list of values found in a field."""
611
612
  def process_value(self, value: Any) -> Any:
613
  res = list(value)
614
  random_generator = new_random_generator(sub_seed=res)
@@ -784,9 +822,8 @@ class InterleaveListsToDialogOperator(InstanceOperator):
784
  user_turns = instance[self.user_turns_field]
785
  assistant_turns = instance[self.assistant_turns_field]
786
 
787
- assert (
788
- len(user_turns) == len(assistant_turns)
789
- or (len(user_turns) - len(assistant_turns) == 1)
790
  ), "user_turns must have either the same length as assistant_turns or one more turn."
791
 
792
  interleaved_dialog = []
@@ -945,7 +982,14 @@ class CopyFields(Copy):
945
 
946
 
947
class GetItemByIndex(FieldOperator):
948
- """Get from the item list by the index in the field."""
949
 
950
  items_list: List[Any]
951
 
@@ -977,7 +1021,13 @@ class Cast(FieldOperator):
977
  failure_default: Optional[Any] = "__UNDEFINED__"
978
 
979
  def prepare(self):
980
- self.types = {"int": int, "float": float, "str": str, "bool": bool, "tuple": tuple}
981
 
982
  def process_value(self, value):
983
  try:
@@ -1658,63 +1708,6 @@ class RemoveValues(FieldOperator):
1658
  return [e for e in value if e not in self.unallowed_values]
1659
 
1660
 
1661
- class Unique(SingleStreamReducer):
1662
- """Reduces a stream to unique instances based on specified fields.
1663
-
1664
- Args:
1665
- fields (List[str]): The fields that should be unique in each instance.
1666
- """
1667
-
1668
- fields: List[str] = field(default_factory=list)
1669
-
1670
- @staticmethod
1671
- def to_tuple(instance: dict, fields: List[str]) -> tuple:
1672
- result = []
1673
- for field_name in fields:
1674
- value = instance[field_name]
1675
- if isinstance(value, list):
1676
- value = tuple(value)
1677
- result.append(value)
1678
- return tuple(result)
1679
-
1680
- def process(self, stream: Stream) -> Stream:
1681
- seen = set()
1682
- for instance in stream:
1683
- values = self.to_tuple(instance, self.fields)
1684
- if values not in seen:
1685
- seen.add(values)
1686
- return list(seen)
1687
-
1688
-
1689
- class SplitByValue(MultiStreamOperator):
1690
- """Splits a MultiStream into multiple streams based on unique values in specified fields.
1691
-
1692
- Args:
1693
- fields (List[str]): The fields to use when splitting the MultiStream.
1694
- """
1695
-
1696
- fields: List[str] = field(default_factory=list)
1697
-
1698
- def process(self, multi_stream: MultiStream) -> MultiStream:
1699
- uniques = Unique(fields=self.fields)(multi_stream)
1700
-
1701
- result = {}
1702
-
1703
- for stream_name, stream in multi_stream.items():
1704
- stream_unique_values = uniques[stream_name]
1705
- for unique_values in stream_unique_values:
1706
- filtering_values = dict(zip(self.fields, unique_values))
1707
- filtered_streams = FilterByCondition(
1708
- values=filtering_values, condition="eq"
1709
- )._process_single_stream(stream)
1710
- filtered_stream_name = (
1711
- stream_name + "_" + nested_tuple_to_string(unique_values)
1712
- )
1713
- result[filtered_stream_name] = filtered_streams
1714
-
1715
- return MultiStream(result)
1716
-
1717
-
1718
  class SplitByNestedGroup(MultiStreamOperator):
1719
  """Splits a MultiStream that is small - for metrics, hence: whole stream can sit in memory, split by the value of field 'group'.
1720
 
@@ -1761,6 +1754,16 @@ class SplitByNestedGroup(MultiStreamOperator):
1761
  return MultiStream.from_iterables(result)
1762
 
1763
1764
  class ApplyStreamOperatorsField(StreamOperator, ArtifactFetcherMixin):
1765
  """Applies stream operators to a stream based on specified fields in each instance.
1766
 
@@ -2516,10 +2519,13 @@ class WikipediaFetcher(FieldOperator):
2516
 
2517
  return {"title": page.title, "body": getattr(page, self.mode)}
2518
 
 
2519
  class Fillna(FieldOperator):
2520
  value: Any
 
2521
  def process_value(self, value: Any) -> Any:
2522
  import numpy as np
 
2523
  try:
2524
  if np.isnan(value):
2525
  return self.value
 
76
  PagedStreamOperator,
77
  SequentialOperator,
78
  SideEffectOperator,
 
79
  SourceOperator,
80
  StreamingOperator,
81
  StreamInitializerOperator,
 
84
  from .random_utils import new_random_generator
85
  from .settings_utils import get_settings
86
  from .stream import DynamicStream, Stream
87
+ from .text_utils import to_pretty_string
88
  from .type_utils import isoftype
89
  from .utils import (
90
  LRUCache,
 
282
  dict_set(instance, key, value)
283
  return instance
284
 
285
+
286
  def recursive_key_value_replace(data, target_key, value_map, value_remove=None):
287
  """Recursively traverses a data structure (dicts and lists), replaces values of target_key using value_map, and removes values listed in value_remove.
288
 
 
323
  recursive_key_value_replace(item, target_key, value_map, value_remove)
324
  return data
325
 
326
+
327
  class RecursiveReplace(InstanceOperator):
328
+ # Assisted by watsonx Code Assistant
329
+ """An operator to recursively replace values in dictionary fields of instances based on a key and a mapping of values.
330
+
331
+ Attributes:
332
+ key (str): The key in the dictionary to start the replacement process.
333
+ map_values (dict): A dictionary containing the key-value pairs to replace the original values.
334
+ remove_values (Optional[list]): An optional list of values to remove from the dictionary. Defaults to None.
335
+
336
+ Example:
337
+ RecursiveReplace(key="a", map_values={"1": "hi", "2": "bye" }, remove_values=["3"])
338
+ replaces the value of key "a" in all instances of all streams:
339
+ instance ``{"field" : [{"a": "1", "b" : "2"}, {"a" : "3", "b" : "4"}]}`` becomes ``{"field" : [{"a": "hi", "b" : "2"}, {"b" : "4"}]}``
340
+
341
+ Notice how the value of key ``"a"`` in the first dictionary is replaced with ``"hi"``, while key ``"a"`` is removed from the second dictionary because its value appears in ``remove_values``.
342
+ """
343
  key: str
344
  map_values: dict
345
  remove_values: Optional[list] = None
346
 
347
+ def process(
348
+ self, instance: Dict[str, Any], stream_name: Optional[str] = None
349
+ ) -> Dict[str, Any]:
350
+ return recursive_key_value_replace(
351
+ instance, self.key, self.map_values, self.remove_values
352
+ )
353
+
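For readers who want to see the documented behavior end to end, here is a simplified, standalone re-implementation of the replacement logic. It is illustrative only; the operator itself delegates to recursive_key_value_replace defined earlier in operators.py, which mutates the instance in place:

from typing import Any, Optional


def replace_values(data: Any, target_key: str, value_map: dict, value_remove: Optional[list] = None) -> Any:
    # Simplified, non-mutating re-implementation for illustration.
    value_remove = value_remove or []
    if isinstance(data, dict):
        result = {}
        for key, value in data.items():
            if key == target_key and not isinstance(value, (dict, list)):
                if value in value_remove:
                    continue  # drop the key entirely
                result[key] = value_map.get(value, value)
            else:
                result[key] = replace_values(value, target_key, value_map, value_remove)
        return result
    if isinstance(data, list):
        return [replace_values(item, target_key, value_map, value_remove) for item in data]
    return data


instance = {"field": [{"a": "1", "b": "2"}, {"a": "3", "b": "4"}]}
print(replace_values(instance, "a", {"1": "hi", "2": "bye"}, ["3"]))
# {'field': [{'a': 'hi', 'b': '2'}, {'b': '4'}]}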
354
 
355
  @deprecation(version="2.0.0", alternative=Set)
356
  class AddFields(Set):
 
448
  def verify_field_definition(self):
449
  if hasattr(self, "_field_to_field") and self._field_to_field is not None:
450
  return
451
+ assert (self.field is None) != (
452
+ self.field_to_field is None
453
  ), "Must uniquely define the field to work on, through exactly one of either 'field' or 'field_to_field'"
454
  assert (
455
  self.to_field is None or self.field_to_field is None
 
626
  def process_value(self, value: Any) -> Any:
627
  return self.add + value
628
 
 
629
  class ShuffleFieldValues(FieldOperator):
630
+ # Assisted by watsonx Code Assistant
631
+ """An operator that shuffles the values of a list field.
632
+
633
+ The seed for shuffling is determined by the elements of the input field,
634
+ ensuring that the shuffling operation produces different results for different input lists,
635
+ but also that it is deterministic and reproducible.
636
 
637
+ Attributes:
638
+ None
639
+
640
+ Methods:
641
+ process_value(value: Any) -> Any:
642
+ Shuffles the elements of the input list and returns the shuffled list.
643
+
644
+ Parameters:
645
+ value (Any): The input list to be shuffled.
646
+
647
+ Returns:
648
+ Any: The shuffled list.
649
+ """
650
  def process_value(self, value: Any) -> Any:
651
  res = list(value)
652
  random_generator = new_random_generator(sub_seed=res)
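The deterministic-shuffle idea from the docstring, sketched standalone. Deriving the seed from repr(values) is an assumption made for illustration; unitxt derives it through its own new_random_generator(sub_seed=...) helper:

import random
from typing import Any, List


def shuffle_deterministically(values: List[Any]) -> List[Any]:
    # Seed a private Random instance from the list contents, so the same input
    # always yields the same permutation while different inputs usually differ.
    rng = random.Random(repr(values))
    result = list(values)
    rng.shuffle(result)
    return result


assert shuffle_deterministically([1, 2, 3, 4]) == shuffle_deterministically([1, 2, 3, 4])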
 
822
  user_turns = instance[self.user_turns_field]
823
  assistant_turns = instance[self.assistant_turns_field]
824
 
825
+ assert len(user_turns) == len(assistant_turns) or (
826
+ len(user_turns) - len(assistant_turns) == 1
 
827
  ), "user_turns must have either the same length as assistant_turns or one more turn."
828
 
829
  interleaved_dialog = []
 
982
 
983
 
984
  class GetItemByIndex(FieldOperator):
985
+ """Get the element from the fixed list by the index in the given field and store in another field.
986
+
987
+ Example:
988
+ GetItemByIndex(items_list=["dog", "cat"], field="animal_index", to_field="animal")
989
+
990
+ on instance {"animal_index" : 1} will change the instance to {"animal_index" : 1, "animal" : "cat"}
991
+
992
+ """
993
 
994
  items_list: List[Any]
995
 
 
1021
  failure_default: Optional[Any] = "__UNDEFINED__"
1022
 
1023
  def prepare(self):
1024
+ self.types = {
1025
+ "int": int,
1026
+ "float": float,
1027
+ "str": str,
1028
+ "bool": bool,
1029
+ "tuple": tuple,
1030
+ }
1031
 
1032
  def process_value(self, value):
1033
  try:
 
1708
  return [e for e in value if e not in self.unallowed_values]
1709
 
1710
1711
  class SplitByNestedGroup(MultiStreamOperator):
1712
  """Splits a MultiStream that is small - for metrics, hence: whole stream can sit in memory, split by the value of field 'group'.
1713
 
 
1754
  return MultiStream.from_iterables(result)
1755
 
1756
 
1757
+ class AddIncrementalId(StreamOperator):
1758
+
1759
+ to_field: str
1760
+
1761
+ def process(self, stream: Stream, stream_name: Optional[str] = None) -> Generator:
1762
+ for i, instance in enumerate(stream):
1763
+ instance[self.to_field] = i
1764
+ yield instance
1765
+
1766
+
1767
  class ApplyStreamOperatorsField(StreamOperator, ArtifactFetcherMixin):
1768
  """Applies stream operators to a stream based on specified fields in each instance.
1769
 
 
2519
 
2520
  return {"title": page.title, "body": getattr(page, self.mode)}
2521
 
2522
+
2523
  class Fillna(FieldOperator):
2524
  value: Any
2525
+
2526
  def process_value(self, value: Any) -> Any:
2527
  import numpy as np
2528
+
2529
  try:
2530
  if np.isnan(value):
2531
  return self.value
version.py CHANGED
@@ -1 +1 @@
1
- version = "1.23.0"
 
1
+ version = "1.23.1"