Add Levenshtein distance for faster debugging
Files changed:
- .gitignore +1 -0
- README.md +2 -0
- app.py +68 -69
- requirements.txt +2 -1
.gitignore
CHANGED
@@ -1,5 +1,6 @@
 .idea/
 .venv/
 .ruff_cache/
+__pycache__/
 
 flagged/
README.md
CHANGED
@@ -23,6 +23,8 @@ uv pip install -r requirements-dev.txt
 
 ## Development
 
+Run app:
+
 ```shell
 gradio app.py
 ```
app.py
CHANGED
@@ -4,6 +4,7 @@ from importlib.metadata import version
 
 import evaluate
 import polars as pl
+import polars_distance as pld
 import gradio as gr
 
 # Load evaluators
@@ -29,8 +30,8 @@ Follow them on social networks and **contact** if you need any help or have any
 """.strip()
 
 examples = [
-    ["evaluation_results.jsonl", False, True],
-    ["evaluation_results_batch.jsonl", True, …],
+    ["evaluation_results.jsonl", False, True, False],
+    ["evaluation_results_batch.jsonl", True, False, False],
 ]
 
 description_head = f"""
@@ -63,6 +64,7 @@ tech_libraries = f"""
 - evaluate: {version("evaluate")}
 - pandas: {version("pandas")}
 - polars: {version("polars")}
+- polars_distance: {version("polars_distance")}
 """.strip()
 
 
@@ -74,15 +76,7 @@ def compute_cer(prediction, reference):
     return round(cer.compute(predictions=[prediction], references=[reference]), 4)
 
 
-def compute_batch_wer(predictions, references):
-    return round(wer.compute(predictions=predictions, references=references), 4)
-
-
-def compute_batch_cer(predictions, references):
-    return round(cer.compute(predictions=predictions, references=references), 4)
-
-
-def inference(file_name, _batch_mode, _calculate_metrics):
+def process_file(file_name, _batch_mode, _calculate_distance, _calculate_metrics):
     if not file_name:
         raise gr.Error("Please paste your JSON file.")
 
@@ -120,67 +114,67 @@ def inference(file_name, _batch_mode, _calculate_metrics):
 
     # exclude inference_start, inference_end
     if _batch_mode:
-        df = df.drop(…
+        df = df.drop(
+            ["inference_total", "inference_start", "inference_end", "filenames"]
+        )
     else:
-        df = df.drop(…
+        df = df.drop(
+            ["inference_total", "inference_start", "inference_end", "filename"]
+        )
 
-        …
-            "predictions",
-            "references",
-        ]
-    else:
-        if _calculate_metrics:
-            df_pd["wer"] = df_pd.apply(
-                lambda row: compute_wer(row["prediction"], row["reference"]), axis=1,
-            )
-            df_pd["cer"] = df_pd.apply(
-                lambda row: compute_cer(row["prediction"], row["reference"]), axis=1,
-            )
-        …
-            "elapsed",
-            "duration",
-            "prediction",
-            "reference",
-        ]
-
-        df = pl.DataFrame(df_pd)
+    if _batch_mode:
+        predictions = []
+        references = []
+        for row in df.iter_rows(named=True):
+            for idx, prediction in enumerate(row["predictions"]):
+                reference = row["references"][idx]
+
+                predictions.append(prediction)
+                references.append(reference)
+
+        df = pl.DataFrame(
+            {
+                "prediction": predictions,
+                "reference": references,
+            }
+        )
+
+    if _calculate_metrics:
+        # Pandas is needed for applying functions
+        df_pd = df.to_pandas()
+
+        df_pd["wer"] = df_pd.apply(
+            lambda row: compute_wer(row["prediction"], row["reference"]),
+            axis=1,
+        )
+        df_pd["cer"] = df_pd.apply(
+            lambda row: compute_cer(row["prediction"], row["reference"]),
+            axis=1,
+        )
+
+        fields = [
+            "wer",
+            "cer",
+            "prediction",
+            "reference",
+        ]
+
+        df = pl.DataFrame(df_pd)
+    elif _calculate_distance:
+        df = df.with_columns(
+            pld.col("prediction").dist_str.levenshtein("reference").alias("distance")
+        )
+
+        fields = [
+            "distance",
+            "prediction",
+            "reference",
+        ]
+    else:
+        fields = [
+            "prediction",
+            "reference",
+        ]
 
     return df.select(fields)
@@ -212,21 +206,26 @@ with demo:
         label="Use batch mode",
     )
 
+    calculate_distance = gr.Checkbox(
+        label="Calculate Levenshtein distance",
+        value=False,
+    )
+
     calculate_metrics = gr.Checkbox(
         label="Calculate WER/CER metrics",
         value=False,
     )
 
     gr.Button("Show").click(
-        inference,
-        inputs=[jsonl_file, batch_mode, calculate_metrics],
+        process_file,
+        inputs=[jsonl_file, batch_mode, calculate_distance, calculate_metrics],
         outputs=df,
     )
 
     with gr.Row():
         gr.Examples(
             label="Choose an example",
-            inputs=[jsonl_file, batch_mode, calculate_metrics],
+            inputs=[jsonl_file, batch_mode, calculate_distance, calculate_metrics],
             examples=examples,
         )
 
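Why the Levenshtein path added in this commit is faster than the metrics path: the WER/CER scores come from the `evaluate` library, whose metrics are plain Python `compute()` calls rather than DataFrame expressions, so `process_file` has to convert the polars frame to pandas and run a row-wise `apply`. A minimal sketch of that pattern, with toy strings standing in for the app's JSONL input (the `wer`/`cer` objects mirror what app.py presumably loads under `# Load evaluators`):

```python
import evaluate
import pandas as pd

# Toy data; the real app reads predictions/references from a JSONL file.
df_pd = pd.DataFrame(
    {
        "prediction": ["he said hello", "the cat sat"],
        "reference": ["she said hello", "the cat sat down"],
    }
)

# evaluate metrics are plain Python objects, not DataFrame expressions,
# hence the pandas round-trip and row-wise apply in process_file().
wer = evaluate.load("wer")
cer = evaluate.load("cer")

df_pd["wer"] = df_pd.apply(
    lambda row: round(
        wer.compute(predictions=[row["prediction"]], references=[row["reference"]]), 4
    ),
    axis=1,
)
df_pd["cer"] = df_pd.apply(
    lambda row: round(
        cer.compute(predictions=[row["prediction"]], references=[row["reference"]]), 4
    ),
    axis=1,
)
print(df_pd)
```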
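The new `elif _calculate_distance:` branch skips that round-trip entirely: `polars-distance` exposes a `dist_str` expression namespace, and the expression below is the same one `process_file` now uses. A minimal sketch with toy data, assuming the pinned `polars==1.25.2` and `polars-distance==0.5.2`:

```python
import polars as pl
import polars_distance as pld

# Toy data standing in for the flattened prediction/reference columns.
df = pl.DataFrame(
    {
        "prediction": ["he said hello", "kot"],
        "reference": ["she said hello", "cat"],
    }
)

# Native polars expression: no pandas conversion and no Python-level apply,
# which is the "faster debugging" the commit title refers to.
df = df.with_columns(
    pld.col("prediction").dist_str.levenshtein("reference").alias("distance")
)
print(df.select(["distance", "prediction", "reference"]))
```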
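A side note on the batch-mode flattening in `process_file`: the explicit `iter_rows` loop could likely be replaced by polars' own `explode`, which unnests aligned list columns in lockstep. This is not what the commit does, just a possible simplification, assuming `predictions` and `references` are list columns of equal per-row length:

```python
import polars as pl

# Toy batch rows: each row holds aligned lists of predictions and references.
df = pl.DataFrame(
    {
        "predictions": [["a b", "c d"], ["e f"]],
        "references": [["a b", "c e"], ["e g"]],
    }
)

# explode on several columns unnests them element-wise, one output row per
# element, matching what the iter_rows loop in process_file builds by hand.
flat = df.explode(["predictions", "references"]).rename(
    {"predictions": "prediction", "references": "reference"}
)
print(flat)
```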
requirements.txt
CHANGED
@@ -1,5 +1,6 @@
 gradio==5.23.0
 
-polars==1.…
+polars==1.25.2
+polars-distance==0.5.2
 evaluate==0.4.3
 jiwer==3.1.0