# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This is a module to compute the Valid Efficiency Score (VES) of a model's predictions for text-to-SQL tasks, as
# proposed in "Can LLM Already Serve as a Database Interface?
# A Big Bench for Large-Scale Database Grounded Text-to-SQLs" (Li et al., 2023).
from math import sqrt
from time import time

import datasets
import evaluate
import numpy as np

_CITATION = """\
@article{li2023can,
  title={Can llm already serve as a database interface? a big bench for large-scale database grounded text-to-sqls},
  author={Li, Jinyang and Hui, Binyuan and Qu, Ge and Li, Binhua and Yang, Jiaxi and Li, Bowen and Wang, Bailin and Qin, Bowen and Cao, Rongyu and Geng, Ruiying and others},
  journal={arXiv preprint arXiv:2305.03111},
  year={2023}
}
"""

_DESCRIPTION = """\
This module computes the Valid Efficiency Score (VES) of a model's predictions for text-to-SQL tasks.
VES weights the execution accuracy of the predicted SQL queries by their execution efficiency relative to the
ground truth queries.
"""

_KWARGS_DESCRIPTION = """
Calculates the Valid Efficiency Score (VES) of predicted SQL queries against ground truth SQL queries.
Args:
    predictions: list of predictions to score. Each prediction should be a single SQL query as a string.
    references: list of references, one per prediction. Each reference should be a single SQL query as a string.
    execute: function that takes a SQL query and returns its result. The result should be a list of tuples,
        each tuple containing the values of a row.
    filter_func: optional function that takes a query string and returns a boolean. Only pairs whose prediction
        and reference both return True are executed and compared; pairs where exactly one of the two returns
        True still count towards the denominator of the score.
    num_executions: number of times to execute the SQL queries when measuring their execution time.
Returns:
    ves: Valid Efficiency Score of the predictions compared to the references.
Examples:
    >>> ves_metric = evaluate.load("valid_efficiency_score")
    >>> results = ves_metric.compute(
    ...     predictions=["SELECT name FROM singer WHERE age > 20"],
    ...     references=["SELECT name FROM singer WHERE age > 20"],
    ...     execute=execute_query,  # your own function that runs a query against the target database
    ... )
    >>> print(results)
    {'ves': 1.0}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ValidEfficiencyScore(evaluate.Metric):
    """Valid Efficiency Score (VES) metric for text-to-SQL tasks."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Value('string'),
                'references': datasets.Value('string'),
            }),
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=[],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores."""
        pass

    def _compute(self, predictions, references, execute, filter_func=None, num_executions=100):
        """Returns the valid efficiency score of the predictions compared to the references."""
        if len(predictions) != len(references):
            raise ValueError("Predictions and references must have the same number of elements.")

        # Run filter_func on predictions and references if needed. Only pairs where both the
        # prediction and the reference pass the filter are kept for execution; pairs where
        # exactly one of the two passes still count towards the denominator of the accuracy.
        if filter_func is not None:
            filtered_predictions = []
            filtered_references = []
            divider = 0
            for prediction, reference in zip(predictions, references):
                prediction_passes = filter_func(prediction)
                reference_passes = filter_func(reference)
                if prediction_passes and reference_passes:
                    filtered_predictions.append(prediction)
                    filtered_references.append(reference)
                    divider += 1
                elif prediction_passes != reference_passes:
                    divider += 1
        else:
            filtered_predictions = predictions
            filtered_references = references
            divider = len(predictions)

        # Execute the ground truth sql queries num_executions times to measure how long they take
        reference_times = np.zeros(num_executions)
        for i in range(num_executions):
            start_time = time()
            for query in filtered_references:
                execute(query)
            end_time = time()
            reference_times[i] = end_time - start_time

        # Execute the predicted sql queries num_executions times to measure how long they take
        prediction_times = np.zeros(num_executions)
        for i in range(num_executions):
            start_time = time()
            for query in filtered_predictions:
                execute(query)
            end_time = time()
            prediction_times[i] = end_time - start_time

        # Get mean, std and the 3 sigma interval of the reference timings, then drop outliers
        reference_mean = np.mean(reference_times)
        reference_std = np.std(reference_times)
        lower_bound = reference_mean - 3 * reference_std
        upper_bound = reference_mean + 3 * reference_std
        filtered_reference_times = reference_times[(reference_times >= lower_bound) & (reference_times <= upper_bound)]

        # Get mean, std and the 3 sigma interval of the prediction timings, then drop outliers
        prediction_mean = np.mean(prediction_times)
        prediction_std = np.std(prediction_times)
        lower_bound = prediction_mean - 3 * prediction_std
        upper_bound = prediction_mean + 3 * prediction_std
        filtered_prediction_times = prediction_times[
            (prediction_times >= lower_bound) & (prediction_times <= upper_bound)]

        # Use the outlier-free means to compute the relative efficiency score R
        reference_mean = np.mean(filtered_reference_times)
        prediction_mean = np.mean(filtered_prediction_times)
        r_value = sqrt(reference_mean / prediction_mean)
        # Execution accuracy: a prediction counts as correct if it returns the same result as its reference
        accuracy = sum(
            execute(prediction) == execute(reference)
            for prediction, reference in zip(filtered_predictions, filtered_references)
        ) / divider

        return {
            "ves": accuracy * r_value,
        }
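

# A minimal usage sketch, not part of the metric itself: it assumes an in-memory sqlite3 database,
# and the table, queries, and helper names below are illustrative placeholders. Loading the published
# module with `evaluate.load(...)` (or a local path to this script) works the same way as instantiating
# the class directly as done here.
if __name__ == "__main__":
    import sqlite3

    # Tiny throwaway database to execute queries against.
    connection = sqlite3.connect(":memory:")
    connection.execute("CREATE TABLE singer (name TEXT, age INTEGER)")
    connection.executemany(
        "INSERT INTO singer VALUES (?, ?)",
        [("Alice", 25), ("Bob", 17), ("Carol", 34)],
    )

    def execute_query(query):
        """Run a SQL query and return all rows as a list of tuples."""
        return connection.execute(query).fetchall()

    def is_select(query):
        """Simple validity filter: only time and compare plain SELECT statements."""
        return query.strip().lower().startswith("select")

    ves_metric = ValidEfficiencyScore()
    results = ves_metric.compute(
        predictions=["SELECT name FROM singer WHERE age > 20"],
        references=["SELECT name FROM singer WHERE age >= 21"],
        execute=execute_query,
        filter_func=is_select,
        num_executions=10,
    )
    # The score is close to 1.0 when the predicted queries return the same rows as the references
    # and run in comparable time; the exact value depends on the measured execution times.
    print(results)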