# Spaces: sorgfresser / valid_efficiency_score (Sleeping)
# File size: 7,866 Bytes

# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This is a module to compute the Valid Efficiency Score (VES) of a model's predictions for text-to-SQL tasks as
# proposed in "Can LLM Already Serve as a Database Interface?
# A Big Bench for Large-Scale Database Grounded Text-to-SQLs" (Li et al., 2023)

import evaluate
import datasets
from time import time
import numpy as np
from math import sqrt

# BibTeX entry for the paper that proposed the Valid Efficiency Score.
_CITATION = """\
@article{li2023can,
  title={Can llm already serve as a database interface? a big bench for large-scale database grounded text-to-sqls},
  author={Li, Jinyang and Hui, Binyuan and Qu, Ge and Li, Binhua and Yang, Jiaxi and Li, Bowen and Wang, Bailin and Qin, Bowen and Cao, Rongyu and Geng, Ruiying and others},
  journal={arXiv preprint arXiv:2305.03111},
  year={2023}
}
"""

# Short description shown for the module.
_DESCRIPTION = """\
This module computes the Valid Efficiency Score (VES) of a model's predictions for text-to-SQL tasks.
"""

# Usage documentation for `compute`. The example passes SQL strings and the
# required `execute` callable; the exact score depends on measured wall-clock
# time, so the example only checks which keys are returned.
_KWARGS_DESCRIPTION = """
Calculates how good the predictions are given some ground truth sql queries, using the Valid Efficiency Score (VES).
Args:
    predictions: list of predictions to score. Each prediction
        should be a sql query string that can be passed to `execute`.
    references: list of references, one for each prediction. Each
        reference should be a sql query string that can be passed to `execute`.
    execute: function that takes a sql query and returns a result.
        The result should be a list of tuples, each tuple containing the values of a row.
    filter_func: optional function that takes a string and returns a boolean.
        If True, the string is kept, otherwise it is dropped.
    num_executions: number of times to execute each sql query to get the execution time.
Returns:
    ves: Valid Efficiency Score of the predictions compared to the references.
Examples:
    >>> import sqlite3
    >>> connection = sqlite3.connect(":memory:")
    >>> def execute(query):
    ...     return connection.execute(query).fetchall()
    >>> my_new_module = evaluate.load("valid_efficiency_score")
    >>> results = my_new_module.compute(
    ...     references=["SELECT 1"],
    ...     predictions=["SELECT 1"],
    ...     execute=execute,
    ...     num_executions=10,
    ... )
    >>> sorted(results)
    ['ves']
"""

# TODO: Define external resources urls if needed
# Template placeholder; the metric class in this file does not read it.
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ValidEfficiencyScore(evaluate.Metric):
    """Valid Efficiency Score (VES) metric for text-to-SQL tasks.

    The score is the execution accuracy of the predicted queries multiplied by
    ``sqrt(reference_time / prediction_time)``. Both times are the mean
    wall-clock time needed to run all evaluated queries once, averaged over
    ``num_executions`` runs after dropping runs outside three standard
    deviations.
    """

    def _info(self):
        """Return the metric metadata: descriptions, citation and input features."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=[]
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores.

        This metric does not download anything, so the hook is a no-op.
        """
        pass

    @staticmethod
    def _select_pairs(predictions, references, filter_func):
        """Return the pairs to execute and the number of pairs that count towards accuracy.

        Without a filter every pair is executed and counted. With a filter, a
        pair is executed only when both queries pass it; a pair where exactly
        one query passes is not executed but still counts in the denominator
        (i.e. as a wrong prediction); a pair where neither passes is ignored.
        """
        if filter_func is None:
            return list(predictions), list(references), len(predictions)

        kept_predictions = []
        kept_references = []
        scored_pairs = 0
        for prediction, reference in zip(predictions, references):
            # Evaluate the filter once per query and normalise to bool so the
            # mismatch test below compares truthiness, not raw return values.
            prediction_ok = bool(filter_func(prediction))
            reference_ok = bool(filter_func(reference))
            if prediction_ok and reference_ok:
                kept_predictions.append(prediction)
                kept_references.append(reference)
                scored_pairs += 1
            elif prediction_ok != reference_ok:
                scored_pairs += 1
        return kept_predictions, kept_references, scored_pairs

    @staticmethod
    def _mean_execution_time(queries, execute, num_executions):
        """Return the outlier-trimmed mean time of running all ``queries`` once."""
        # perf_counter is monotonic and high-resolution, unlike time.time().
        from time import perf_counter

        run_times = np.zeros(num_executions)
        for run in range(num_executions):
            start = perf_counter()
            for query in queries:
                execute(query)
            run_times[run] = perf_counter() - start

        # Drop runs outside the 3-sigma interval around the raw mean.
        mean = np.mean(run_times)
        spread = 3 * np.std(run_times)
        inliers = run_times[(run_times >= mean - spread) & (run_times <= mean + spread)]
        return np.mean(inliers)

    def _compute(self, predictions, references, execute, filter_func=None, num_executions=100):
        """Returns the valid efficiency score of the predictions compared to the references.

        Args:
            predictions: predicted SQL queries.
            references: ground-truth SQL queries, aligned with ``predictions``.
            execute: callable that runs one query and returns its result; two
                results are compared with ``==``. Exceptions it raises propagate.
            filter_func: optional predicate on a query string; see
                ``_select_pairs`` for how rejected queries are treated.
            num_executions: number of timing runs over each query set.

        Returns:
            ``{"ves": accuracy * sqrt(reference_time / prediction_time)}``, or
            ``{"ves": 0.0}`` when there is no pair that can be executed.

        Raises:
            ValueError: if the inputs differ in length or ``num_executions`` is
                smaller than 1.
        """
        if len(predictions) != len(references):
            raise ValueError("Predictions and references must have the same number of elements.")
        if num_executions < 1:
            raise ValueError("num_executions must be at least 1.")

        kept_predictions, kept_references, scored_pairs = self._select_pairs(
            predictions, references, filter_func)
        if not kept_predictions:
            # Nothing can be executed, so no prediction can be correct; this
            # also avoids dividing by zero and timing an empty workload.
            return {"ves": 0.0}

        # Time the ground truth first, then the predictions, on the same pairs.
        reference_time = self._mean_execution_time(kept_references, execute, num_executions)
        prediction_time = self._mean_execution_time(kept_predictions, execute, num_executions)
        r_value = sqrt(reference_time / prediction_time)

        correct = sum(
            execute(prediction) == execute(reference)
            for prediction, reference in zip(kept_predictions, kept_references))
        accuracy = correct / scored_pairs

        return {
            "ves": accuracy * r_value,
        }