"""Levenshtein metric file."""
from __future__ import annotations
from typing import TYPE_CHECKING
import datasets
import evaluate
from Levenshtein import distance
if TYPE_CHECKING:
from collections.abc import Sequence
_CITATION = """\
@InProceedings{huggingface:levenshtein,
title = {Levenshtein (edit) distance},
authors={Nathan Fradet},
year={2024}
}
"""
_DESCRIPTION = """\
This metric computes the Levenshtein (edit) distance.
It directly calls the ``distance`` function of the "Levenshtein" package:
https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance
"""
_KWARGS_DESCRIPTION = """
This metric computes the Levenshtein distance, also commonly called "edit distance".
The Levenshtein distance measures the minimum number of single-character
substitutions, deletions and insertions required to transform one string into
another. It is a popular metric for text similarity.
This module directly calls the
[Levenshtein package](https://github.com/rapidfuzz/Levenshtein) for fast execution
speed.
Args:
predictions: list of prediction strings.
references: list of reference strings.
    **kwargs: keyword arguments to pass to the [Levenshtein.distance](https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance)
        function.
Returns:
    Dictionary with the average Levenshtein distance (lower is better) and the
    average similarity ratio within [0, 1] (higher is better).
Examples:
>>> levenshtein = evaluate.load("Natooz/Levenshtein")
>>> results = levenshtein.compute(
... predictions=[
... "foo", "baroo"
... ],
... references=[
... "foo", "bar"
... ],
... )
>>> print(results)
{"levenshtein": 1, "levenshtein_ratio": 0.875}
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Levenshtein(evaluate.Metric):
"""Module for the ``distance`` method of the "Levenshtein" package."""
def _info(self) -> evaluate.MetricInfo:
"""
Return the module info.
:return: module info.
"""
return evaluate.MetricInfo(
# This is the description that will appear on the modules page.
module_type="metric",
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
# This defines the format of each prediction and reference
features=datasets.Features(
{
"predictions": datasets.Value("string"),
"references": datasets.Value("string"),
}
),
# Homepage of the module for documentation
homepage="https://huggingface.co/spaces/Natooz/Levenshtein",
# Additional links to the codebase or references
codebase_urls=[
"https://github.com/rapidfuzz/Levenshtein",
],
reference_urls=[
"https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance"
],
)
def _compute(
self,
        predictions: Sequence[str] | None = None,
        references: Sequence[str] | None = None,
**kwargs,
) -> dict[str, float]:
"""
Return the average Levenshtein (edit) distance.
See the "Levenshtein" PyPi package documentation for the complete usage
information: https://rapidfuzz.github.io/Levenshtein/
"""
        if not predictions:
            msg = "At least one prediction/reference pair is required."
            raise ValueError(msg)
        if len(predictions) != len(references):
            msg = "The number of predictions must be equal to the number of references."
            raise ValueError(msg)
# Compute the distances
results, ratios = [], []
for prediction, reference in zip(predictions, references):
edit_distance = distance(prediction, reference, **kwargs)
results.append(edit_distance)
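            # Normalize by the combined length of the two strings; with the
            # default unit edit costs this keeps each ratio within [0, 1]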
ratios.append(edit_distance / (len(prediction) + len(reference)))
        # Return the average distance, and a similarity ratio defined as
        # 1 minus the average normalized distance (higher means more similar)
return {
"levenshtein": sum(results) / len(results),
"levenshtein_ratio": 1 - sum(ratios) / len(ratios),
}
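

# A minimal usage sketch (not part of the metric class): load the module through
# ``evaluate`` and score a small batch. It assumes the module is published under
# the "Natooz/Levenshtein" id used in the docstring above, and that the
# ``evaluate`` and ``Levenshtein`` packages are installed.
if __name__ == "__main__":
    levenshtein = evaluate.load("Natooz/Levenshtein")
    results = levenshtein.compute(
        predictions=["foo", "baroo"],
        references=["foo", "bar"],
    )
    print(results)  # expected: {'levenshtein': 1.0, 'levenshtein_ratio': 0.875}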