"""Levenshtein metric file."""

from __future__ import annotations

from typing import TYPE_CHECKING

import datasets
import evaluate
from Levenshtein import distance

if TYPE_CHECKING:
    from collections.abc import Sequence

_CITATION = """\
@InProceedings{huggingface:levenshtein,
title = {Levenshtein (edit) distance},
authors={Nathan Fradet},
year={2024}
}
"""

_DESCRIPTION = """\
This metric computes the Levenshtein (edit) distance.

It directly calls the "Levenshtein" package using the ``distance`` method:
https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance
"""

_KWARGS_DESCRIPTION = """
This metric computes the Levenshtein distance, also commonly called "edit distance".

The Levenshtein distance measures the number of combined editions, deletions and
additions to perform on a string so that it becomes identical to a second one. It is a
popular metric for text similarity.
This module directly calls the
[Levenshtein package](https://github.com/rapidfuzz/Levenshtein) for fast execution
speed.

Args:
    predictions: list of prediction strings.
    references: list of reference strings.
    **kwargs: keyword arguments to pass to the
        [Levenshtein.distance](https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance)
        method.

Returns:
    Dictionary mapping to the average Levenshtein distance (lower is better) and the
    ratio ([0, 1]) distance (higher is better).

Examples:
    >>> levenshtein = evaluate.load("Natooz/Levenshtein")
    >>> results = levenshtein.compute(
    ...     predictions=["foo", "baroo"],
    ...     references=["foo", "bar1"],
    ... )
    >>> print(results)
    {'levenshtein': 1.0, 'levenshtein_ratio': 0.8888888888888888}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Levenshtein(evaluate.Metric):
    """Module for the ``distance`` method of the "Levenshtein" package."""

    def _info(self) -> evaluate.MetricInfo:
        """
        Return the module info.

        :return: module info.
        """
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference:
            # one plain string per example on each side.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/Natooz/Levenshtein",
            # Additional links to the codebase or references
            codebase_urls=[
                "https://github.com/rapidfuzz/Levenshtein",
            ],
            reference_urls=[
                "https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance"
            ],
        )

    def _compute(
        self,
        predictions: Sequence[str] | None = None,
        references: Sequence[str] | None = None,
        **kwargs,
    ) -> dict[str, float]:
        """
        Return the average Levenshtein (edit) distance and distance ratio.

        See the "Levenshtein" PyPi package documentation for the complete usage
        information: https://rapidfuzz.github.io/Levenshtein/

        :param predictions: sequence of prediction strings.
        :param references: sequence of reference strings, one per prediction.
        :param kwargs: keyword arguments passed on to ``Levenshtein.distance``.
        :return: dictionary with the average ``levenshtein`` distance (lower is
            better) and the average ``levenshtein_ratio`` in [0, 1] (higher is
            better).
        :raises ValueError: when the inputs are empty or their lengths differ.
        """
        # Explicit errors instead of a ZeroDivisionError on empty inputs.
        if not predictions or not references:
            msg = "At least one prediction/reference pair is required."
            raise ValueError(msg)
        if len(predictions) != len(references):
            msg = "The number of predictions must be equal to the number of references."
            raise ValueError(msg)

        # Compute the distances
        results, ratios = [], []
        for prediction, reference in zip(predictions, references):
            edit_distance = distance(prediction, reference, **kwargs)
            results.append(edit_distance)
            length_sum = len(prediction) + len(reference)
            # Two empty strings are identical: distance 0, ratio contribution
            # 0.0 (guards against dividing by a zero combined length).
            ratios.append(edit_distance / length_sum if length_sum else 0.0)

        # Return average distance and ratio
        return {
            "levenshtein": sum(results) / len(results),
            "levenshtein_ratio": 1 - sum(ratios) / len(ratios),
        }