Spaces:

ginic
/

phone_errors

Sleeping

App Files Files Community

ginic committed on Mar 19, 2024

Commit

a91c31a

1 Parent(s): 64ff291

Initial attempt at implementing phone distances

Browse files

Files changed (3) hide show

README.md +0 -2
phone_distance.py +61 -30
requirements.txt +3 -1

README.md CHANGED Viewed

@@ -1,7 +1,5 @@
 ---
 title: Phone Distance
-datasets:
--
 tags:
 - evaluate
 - metric

 ---
 title: Phone Distance
 tags:
 - evaluate
 - metric

phone_distance.py CHANGED Viewed

@@ -11,29 +11,40 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""TODO: Add a description here."""
 import evaluate
 import datasets
-# TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
 }
 """
-# TODO: Add description of the module here
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
 """
 # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
 Calculates how good are predictions given some references, using certain scores
 Args:
     predictions: list of predictions to score. Each predictions
@@ -47,19 +58,16 @@ Examples:
     Examples should be written in doctest format, and should illustrate how
     to use the function.
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
-    >>> print(results)
-    {'accuracy': 1.0}
 """
 # TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class PhoneDistance(evaluate.Metric):
-    """TODO: Short description of my evaluation module."""
     def _info(self):
         # TODO: Specifies the evaluate.EvaluationModuleInfo object
@@ -71,25 +79,48 @@ class PhoneDistance(evaluate.Metric):
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
             }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
             # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
         )
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-    def _compute(self, predictions, references):
-        """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
         return {
-            "accuracy": accuracy,
-        }

 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Edit distances between Unicode International Phonetic Alphabet strings.
+This is a thin Hugging Face wrapper around the panphon library's distance module.
+"""
 import evaluate
 import datasets
+import numpy as np
+import panphon.distance
 _CITATION = """\
+@inproceedings{Mortensen-et-al:2016,
+  author    = {David R. Mortensen and
+               Patrick Littell and
+               Akash Bharadwaj and
+               Kartik Goyal and
+               Chris Dyer and
+               Lori S. Levin},
+  title     = {PanPhon: {A} Resource for Mapping {IPA} Segments to Articulatory Feature Vectors},
+  booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
+  pages     = {3475--3484},
+  publisher = {{ACL}},
+  year      = {2016}
 }
 """
 _DESCRIPTION = """\
+TODO
 """
 # TODO: Add description of the arguments of the module here
 _KWARGS_DESCRIPTION = """
+TODO
 Calculates how good are predictions given some references, using certain scores
 Args:
     predictions: list of predictions to score. Each predictions
     Examples should be written in doctest format, and should illustrate how
     to use the function.
+    >>> my_new_module = evaluate.load("ginic/phone_distance")
 """
 # TODO: Define external resources urls if needed
+# BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class PhoneDistance(evaluate.Metric):
+    """Class for computing distance between Unicode IPA strings """
     def _info(self):
         # TODO: Specifies the evaluate.EvaluationModuleInfo object
             inputs_description=_KWARGS_DESCRIPTION,
             # This defines the format of each prediction and reference
             features=datasets.Features({
+                'predictions': datasets.Value('string', id="sequence"),
+                'references': datasets.Value('string', id="sequence"),
             }),
             # Additional links to the codebase or references
+            codebase_urls=["https://github.com/dmort27/panphon", "https://huggingface.co/spaces/ginic/phone_distance/tree/main"],
+            reference_urls=["https://pypi.org/project/panphon/", "https://arxiv.org/abs/2308.03917"]
         )
+    def _compute(self, predictions:list[str]|None=None, references:list[str]|None=None, feature_set:str="spe+", feature_model:str="segment", is_normalize_max_length:bool=False):
+        """Computes phoneme error rates, phone feature error rate (Hamming feature edit distance) and feature error rates between prediction and reference strings
+        Args:
+            predictions (list[str], optional): Predicted transcriptions. Defaults to None.
+            references (list[str], optional): Reference transcriptions. Defaults to None.
+            feature_set (str, optional): Feature set to use in the feature model, see panphone documentation for details. Defaults to "spe+".
+            feature_model (str, optional): panphon.distance.Distance feature parsing model to be used, choose from "strict", "permissive", "segment". Defaults to "segment".
+            is_normalize_max_length (bool, optional): Set to true to normalize phone feature error rates by maximum length (measure won't be a true metric). Defaults to False.
+        Returns:
+            _type_: _description_
+        """
+        distance_computer = panphone.distance.Distance(feature_set=feature_set, feature_model=feature_model)
+        phoneme_error_rates = []
+        feature_error_rates = []
+        hamming_distances = []
+        for p, r in zip(predictions, references):
+            if is_normalize_max_length:
+                hd = distance_computer.hamming_feature_edit_distance_div_maxlen(p, r)
+            else:
+                hd = distance_computer.hamming_feature_edit_distance(p, r)
+            hamming_distances.append(hd)
+            per = distance_computer.phone_error_rate(p, r)
+            phoneme_error_rates.append(per)
+            fer = distance_computer.feature_error_rate(p, r)
+            feature_error_rates.append(fer)
         return {
+            "phoneme_error_rates": phoneme_error_rates,
+            "mean_phoneme_error_rate": np.mean(phoneme_error_rates),
+            "phone_feature_error_rates": hamming_distances,
+            "mean_phone_feature_error_rates": np.mean(hamming_distances),
+            "feature_error_rates": feature_error_rates,
+            "mean_feature_error_rates": np.mean(feature_error_rates)
+        }

requirements.txt CHANGED Viewed

	@@ -1 +1,3 @@
1	- git+https://github.com/huggingface/evaluate@main

+git+https://github.com/huggingface/evaluate@main
+numpy
+panphon==0.20.0