Upload 5 files
- README.md +61 -7
- app.py +40 -0
- levenshtein.py +123 -0
- pyproject.toml +114 -0
- requirements.txt +3 -0
README.md
CHANGED
@@ -1,14 +1,68 @@
 ---
-title: Levenshtein
-emoji:
-colorFrom:
-colorTo:
+title: Levenshtein distance
+emoji: ✍️
+colorFrom: blue
+colorTo: green
+tags:
+- evaluate
+- metric
+description: Levenshtein (edit) distance
 sdk: gradio
 sdk_version: 5.6.0
 app_file: app.py
 pinned: false
-license: apache-2.0
-short_description: Levenshtein (edit) distance metric
 ---

# Metric Card for the Levenshtein (edit) distance

## Metric Description

This metric computes the Levenshtein distance, also commonly called the "edit distance". The Levenshtein distance measures the minimum number of insertions, deletions and substitutions to perform on a string so that it becomes identical to a second one. It is a popular metric for text similarity.
This module directly calls the [Levenshtein package](https://github.com/rapidfuzz/Levenshtein) for fast execution speed.
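As a quick illustration of the underlying call, the package's `distance` function can also be used on its own (a minimal sketch; "kitten"/"sitting" is the textbook pair needing three edits):

```python
from Levenshtein import distance

# Two substitutions (k -> s, e -> i) and one insertion (g): 3 edits in total.
print(distance("kitten", "sitting"))  # 3
```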

## How to Use

### Inputs

- **predictions** *(string): sequence of prediction strings;*
- **references** *(string): sequence of reference strings;*
- **kwargs**: *keyword arguments to pass to the [Levenshtein.distance](https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance) method (see the sketch after this list).*
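A minimal sketch of the keyword-argument pass-through, assuming the `weights` parameter of `Levenshtein.distance` (insertion, deletion, substitution costs); other keyword arguments of that function should be forwarded the same way:

```python
import evaluate

levenshtein = evaluate.load("Natooz/Levenshtein")
# weights is forwarded as-is to Levenshtein.distance for every prediction/reference pair.
results = levenshtein.compute(
    predictions=["baroo"],
    references=["bar"],
    weights=(1, 1, 2),
)
print(results)
```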

### Output Values

A dictionary mapping `"levenshtein"` to the average Levenshtein distance (lower is better) and `"levenshtein_ratio"` to an average normalized ratio in [0, 1] (higher is better).
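Concretely, the two values mirror the `_compute` method added in `levenshtein.py` below; this is only a restatement of that logic for string inputs:

```python
from Levenshtein import distance

# Each edit distance is normalised by the summed lengths of the two strings;
# the reported ratio is 1 minus the mean of these normalised values.
predictions, references = ["foo", "baroo"], ["foo", "bar"]
distances = [distance(p, r) for p, r in zip(predictions, references)]
ratios = [d / (len(p) + len(r)) for d, p, r in zip(distances, predictions, references)]
print(sum(distances) / len(distances))  # 1.0   -> "levenshtein"
print(1 - sum(ratios) / len(ratios))    # 0.875 -> "levenshtein_ratio"
```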

### Examples

```Python
import evaluate

levenshtein = evaluate.load("Natooz/Levenshtein")
results = levenshtein.compute(
    predictions=[
        "foo", "baroo"  # 0 and 2 edits
    ],
    references=[
        "foo", "bar"
    ],
)
print(results)
# {"levenshtein": 1.0, "levenshtein_ratio": 0.875}
```

## Citation

```bibtex
@ARTICLE{1966SPhD...10..707L,
    author = {{Levenshtein}, V.~I.},
    title = "{Binary Codes Capable of Correcting Deletions, Insertions and Reversals}",
    journal = {Soviet Physics Doklady},
    year = 1966,
    month = feb,
    volume = {10},
    pages = {707},
    adsurl = {https://ui.adsabs.harvard.edu/abs/1966SPhD...10..707L},
    adsnote = {Provided by the SAO/NASA Astrophysics Data System}
}
```
app.py
ADDED
@@ -0,0 +1,40 @@
"""Application file."""

from pathlib import Path

import evaluate
import gradio as gr

module = evaluate.load("Natooz/levenshtein")

# Code taken and adapted from: https://github.com/huggingface/evaluate/blob/main/src/evaluate/utils/gradio.py
local_path = Path(__file__).parent
# if there are several input types, use first as default.
if isinstance(module.features, list):
    (feature_names, feature_types) = zip(*module.features[0].items())
else:
    (feature_names, feature_types) = zip(*module.features.items())
gradio_input_types = evaluate.utils.infer_gradio_input_types(feature_types)


def compute(data):
    return module.compute(**evaluate.utils.parse_gradio_data(data, gradio_input_types))


gradio_app = gr.Interface(
    fn=compute,
    inputs=gr.Dataframe(
        headers=feature_names,
        col_count=len(feature_names),
        row_count=1,
        datatype=evaluate.utils.json_to_string_type(gradio_input_types),
    ),
    outputs=gr.Textbox(label=module.name),
    description=module.info.description,
    title=f"Metric: {module.name}",
    article=evaluate.utils.parse_readme(local_path / "README.md"),
)


if __name__ == "__main__":
    gradio_app.launch()
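As a quick way to test the wiring without the UI, the `compute` helper can be fed the same kind of object the `gr.Dataframe` component passes to it, a pandas DataFrame. A sketch to run in the context of app.py; it assumes `evaluate.utils.parse_gradio_data` accepts a plain DataFrame, which is how the upstream utility is used above:

```python
import pandas as pd

# Columns follow feature_names ("predictions", "references"); compute() parses the
# DataFrame with parse_gradio_data and forwards the columns to the metric.
data = pd.DataFrame([["foo", "foo"], ["baroo", "bar"]], columns=list(feature_names))
print(compute(data))  # expected: {'levenshtein': 1.0, 'levenshtein_ratio': 0.875}
```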
levenshtein.py
ADDED
@@ -0,0 +1,123 @@
"""Levenshtein metric file."""

from __future__ import annotations

from typing import TYPE_CHECKING

import datasets
import evaluate

from Levenshtein import distance

if TYPE_CHECKING:
    from collections.abc import Sequence

_CITATION = """\
@InProceedings{huggingface:levenshtein,
title = {Levenshtein (edit) distance},
authors={Nathan Fradet},
year={2024}
}
"""

_DESCRIPTION = """\
This metric computes the Levenshtein (edit) distance.
It directly calls the "Levenshtein" package using the ``distance`` method:
https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance
"""


_KWARGS_DESCRIPTION = """
This metric computes the Levenshtein distance, also commonly called "edit distance".
The Levenshtein distance measures the minimum number of insertions, deletions and
substitutions to perform on a string so that it becomes identical to a second one. It
is a popular metric for text similarity.
This module directly calls the
[Levenshtein package](https://github.com/rapidfuzz/Levenshtein) for fast execution
speed.

Args:
    predictions: list of prediction strings.
    references: list of reference strings.
    **kwargs: keyword arguments to pass to the [Levenshtein.distance](https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance)
        method.
Returns:
    Dictionary mapping "levenshtein" to the average Levenshtein distance (lower is
    better) and "levenshtein_ratio" to an average normalized ratio in [0, 1] (higher
    is better).
Examples:
    >>> levenshtein = evaluate.load("Natooz/Levenshtein")
    >>> results = levenshtein.compute(
    ...     predictions=[
    ...         "foo", "baroo"
    ...     ],
    ...     references=[
    ...         "foo", "bar"
    ...     ],
    ... )
    >>> print(results)
    {'levenshtein': 1.0, 'levenshtein_ratio': 0.875}
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Levenshtein(evaluate.Metric):
    """Module for the ``distance`` method of the "Levenshtein" package."""

    def _info(self) -> evaluate.MetricInfo:
        """
        Return the module info.

        :return: module info.
        """
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string"),
                    "references": datasets.Value("string"),
                }
            ),
            # Homepage of the module for documentation
            homepage="https://huggingface.co/spaces/Natooz/Levenshtein",
            # Additional links to the codebase or references
            codebase_urls=[
                "https://github.com/rapidfuzz/Levenshtein",
            ],
            reference_urls=[
                "https://rapidfuzz.github.io/Levenshtein/levenshtein.html#Levenshtein.distance"
            ],
        )

    def _compute(
        self,
        predictions: Sequence[str] | None = None,
        references: Sequence[str] | None = None,
        **kwargs,
    ) -> dict[str, float]:
        """
        Return the average Levenshtein (edit) distance.

        See the "Levenshtein" PyPI package documentation for the complete usage
        information: https://rapidfuzz.github.io/Levenshtein/
        """
        if len(predictions) != len(references):
            msg = "The number of predictions must be equal to the number of references."
            raise ValueError(msg)

        # Compute the distances
        results, ratios = [], []
        for prediction, reference in zip(predictions, references):
            edit_distance = distance(prediction, reference, **kwargs)
            results.append(edit_distance)
            ratios.append(edit_distance / (len(prediction) + len(reference)))

        # Return average distance and ratio
        return {
            "levenshtein": sum(results) / len(results),
            "levenshtein_ratio": 1 - sum(ratios) / len(ratios),
        }
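Because the class only implements `_info` and `_compute`, it inherits the usual `evaluate` accumulation API from its base class; a minimal sketch of batched use (assuming the module is loaded from the Space as in the README):

```python
import evaluate

levenshtein = evaluate.load("Natooz/Levenshtein")
# add_batch() buffers prediction/reference pairs; compute() then runs _compute on
# everything collected so far.
for preds, refs in [(["foo"], ["foo"]), (["baroo"], ["bar"])]:
    levenshtein.add_batch(predictions=preds, references=refs)
print(levenshtein.compute())
# {'levenshtein': 1.0, 'levenshtein_ratio': 0.875}
```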
pyproject.toml
ADDED
@@ -0,0 +1,114 @@
[tool.ruff]
target-version = "py313"

[tool.ruff.lint]
extend-select = [
    "ARG",
    "A",
    "ANN",
    "B",
    "BLE",
    "C4",
    "COM",
    "D",
    "E",
    "EM",
    "EXE",
    "F",
    "FA",
    "FBT",
    "G",
    "I",
    "ICN",
    "INP",
    "INT",
    "ISC",
    "N",
    "NPY",
    "PERF",
    "PGH",
    "PTH",
    "PIE",
    # "PL",
    "PT",
    "Q",
    "RET",
    "RSE",
    "RUF",
    "S",
    # "SLF",
    "SIM",
    "T",
    "TCH",
    "TID",
    "UP",
    "W",
]

# Each rule exclusion should be explained here.
# By default, we think it is better to select groups of rules (above), and exclude
# specific problematic rules, instead of selecting specific rules. By doing so, if the
# ruff rule groups change, we have to check and handle the new rules or changes,
# making sure we stay up to date and keep the best practices.

# ANN003:
# Would mostly apply to args/kwargs that are passed to methods from dependencies, for
# which the signature can change depending on the version. This would either be too
# difficult to comply with and/or would add a lot of noqa exceptions. ANN002 is used
# as it adds very few "noqa" exceptions, but ANN003 would add too much complexity.

# ANN101 and ANN102:
# Yield errors for `self` in methods from classes, which is unnecessary.
# The existence of these rules is currently questioned; they are likely to be removed.
# https://github.com/astral-sh/ruff/issues/4396

# B905:
# The `strict` keyword argument for the `zip` built-in appeared with Python 3.10. As
# we support previous versions, we cannot comply (yet) with this rule. The exclusion
# should be removed when dropping support for Python 3.9. (A short illustration
# follows this file.)

# D107:
# We document classes at the class level (D101). This documentation should cover the
# way classes are initialized. So we do not document `__init__` methods.

# D203:
# "one-blank-line-before-class", incompatible with D211 (blank-line-before-class).
# We follow PEP 257 and other conventions by preferring D211 over D203.

# D212:
# "multi-line-summary-first-line", incompatible with D213
# (multi-line-summary-second-line).
# We follow PEP 257, which recommends putting the summary line on the second line,
# right after the opening quotes.

# FBT001 and FBT002:
# Refactoring all the methods to make boolean arguments keyword-only would add
# complexity and could break users' code. It's ok to have booleans as positional
# arguments with default values. For code readability though, we enable FBT003.

# COM812:
# Yields errors for one-line constructs without a trailing comma. Trailing commas are
# automatically set by ruff format anyway. This exclusion could be removed when this
# behavior is fixed in ruff.

# UP038:
# Recommends the `|` type union with `isinstance`, which is only supported since
# Python 3.10. The exclusion should be removed when dropping support for Python 3.9.

# (ISC001)
# May cause conflicts when used with the ruff formatter, whose docs recommend
# disabling it. We leave it enabled but keep this in mind.

ignore = [
    "ANN003",
    "ANN101",
    "ANN102",
    "B905",
    "COM812",
    "D107",
    "D203",
    "D212",
    "FBT001",
    "FBT002",
    "UP038",
]
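To make the B905 note above concrete: on Python 3.10+, `zip(..., strict=True)` would catch the length mismatch that `_compute` in `levenshtein.py` checks by hand (illustration only; the exact error message can vary between Python versions):

```python
# zip(..., strict=True) raises when the iterables differ in length (Python >= 3.10);
# the metric keeps a bare zip plus an explicit length check to stay 3.9-compatible.
predictions, references = ["foo", "baroo"], ["foo"]
try:
    list(zip(predictions, references, strict=True))
except ValueError as err:
    print(err)
```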
requirements.txt
ADDED
@@ -0,0 +1,3 @@
evaluate>=0.4.0
Levenshtein>=0.26.0
datasets