# Metric module hosted as a Hugging Face Space: https://huggingface.co/spaces/Natooz/Levenshtein
"""Levenshtein metric file."""
from __future__ import annotations
from typing import TYPE_CHECKING
import datasets
import evaluate
from Levenshtein import distance
if TYPE_CHECKING:
    from collections.abc import Sequence
_CITATION = """\ | |
@InProceedings{huggingface:levenshtein, | |
title = {Levenshtein (edit) distance}, | |
authors={Nathan Fradet}, | |
year={2024} | |
} | |
""" | |
_DESCRIPTION = """\ | |
This metrics computes the Levenshtein (edit) distance. | |
It directly calls the "Levenshtein" package using the ``distance`` method: | |
https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance | |
""" | |
_KWARGS_DESCRIPTION = """ | |
This metric computes the Levenshtein distance, also commonly called "edit distance". | |
The Levenshtein distance measures the number of combined editions, deletions and | |
additions to perform on a string so that it becomes identical to a second one. It is a | |
popular metric for text similarity. | |
This module directly calls the | |
[Levenshtein package](https://github.com/rapidfuzz/Levenshtein) for fast execution | |
speed. | |
Args: | |
predictions: list of prediction strings. | |
references: list of reference strings. | |
**kwargs: keyword arguments to pass to the [Levenshtein.distance](https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance) | |
method. | |
Returns: | |
Dictionary mapping to the average Levenshtein distance (lower is better) and the | |
ratio ([0, 1]) distance (higher is better). | |
Examples: | |
>>> levenshtein = evaluate.load("Natooz/Levenshtein") | |
>>> results = levenshtein.compute( | |
... predictions=[ | |
... "foo", "baroo" | |
... ], | |
... references=,[ | |
... "foo", "bar1" | |
... ], | |
... ) | |
>>> print(results) | |
{"levenshtein": 1, "levenshtein_ratio": 0.875} | |
""" | |
class Levenshtein(evaluate.Metric): | |
"""Module for the ``distance`` method of the "Levenshtein" package.""" | |
def _info(self) -> evaluate.MetricInfo: | |
""" | |
Return the module info. | |
:return: module info. | |
""" | |
return evaluate.MetricInfo( | |
# This is the description that will appear on the modules page. | |
module_type="metric", | |
description=_DESCRIPTION, | |
citation=_CITATION, | |
inputs_description=_KWARGS_DESCRIPTION, | |
# This defines the format of each prediction and reference | |
features=datasets.Features( | |
{ | |
"predictions": datasets.Value("string"), | |
"references": datasets.Value("string"), | |
} | |
), | |
# Homepage of the module for documentation | |
homepage="https://huggingface.co/spaces/Natooz/Levenshtein", | |
# Additional links to the codebase or references | |
codebase_urls=[ | |
"https://github.com/rapidfuzz/Levenshtein", | |
], | |
reference_urls=[ | |
"https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance" | |
], | |
) | |
def _compute( | |
self, | |
predictions: Sequence[float] | None = None, | |
references: Sequence[int] | None = None, | |
**kwargs, | |
) -> dict[str, float]: | |
""" | |
Return the average Levenshtein (edit) distance. | |
See the "Levenshtein" PyPi package documentation for the complete usage | |
information: https://rapidfuzz.github.io/Levenshtein/ | |
""" | |
if len(predictions) != len(references): | |
msg = "The number of predictions must be equal to the number of references." | |
raise ValueError(msg) | |
# Compute the distances | |
results, ratios = [], [] | |
for prediction, reference in zip(predictions, references): | |
edit_distance = distance(prediction, reference, **kwargs) | |
results.append(edit_distance) | |
ratios.append(edit_distance / (len(prediction) + len(reference))) | |
# Return average distance and ratio | |
return { | |
"levenshtein": sum(results) / len(results), | |
"levenshtein_ratio": 1 - sum(ratios) / len(ratios), | |
} | |