Spaces:
Running
Running
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
"""Edit distances between Unicode International Phonetic Alphabet strings. | |
This is basically a Hugging Face wrapper around the panphon library's distance module.
""" | |
import datasets
import evaluate
import numpy as np
import panphon.distance
_CITATION = """\ | |
@inproceedings{Mortensen-et-al:2016, | |
author = {David R. Mortensen and | |
Patrick Littell and | |
Akash Bharadwaj and | |
Kartik Goyal and | |
Chris Dyer and | |
Lori S. Levin}, | |
title = {PanPhon: {A} Resource for Mapping {IPA} Segments to Articulatory Feature Vectors}, | |
booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers}, | |
pages = {3475--3484}, | |
publisher = {{ACL}}, | |
year = {2016} | |
} | |
""" | |
_DESCRIPTION = """\ | |
TODO | |
""" | |
# TODO: Add description of the arguments of the module here | |
_KWARGS_DESCRIPTION = """ | |
TODO | |
Calculates how good are predictions given some references, using certain scores | |
Args: | |
predictions: list of predictions to score. Each predictions | |
should be a string with tokens separated by spaces. | |
references: list of reference for each prediction. Each | |
reference should be a string with tokens separated by spaces. | |
Returns: | |
accuracy: description of the first score, | |
another_score: description of the second score, | |
Examples: | |
Examples should be written in doctest format, and should illustrate how | |
to use the function. | |
>>> my_new_module = evaluate.load("ginic/phone_distance") | |
""" | |
# TODO: Define external resources urls if needed | |
# BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt" | |
class PhoneDistance(evaluate.Metric):
    """Metric computing distances between Unicode IPA strings with panphon."""

    def _info(self):
        """Return the metadata for this metric: descriptions, citation, input schema and links."""
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Each prediction and each reference is a single string.
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            # Additional links to the codebase or references
            codebase_urls=[
                "https://github.com/dmort27/panphon",
                "https://huggingface.co/spaces/ginic/phone_distance/tree/main",
            ],
            reference_urls=[
                "https://pypi.org/project/panphon/",
                "https://arxiv.org/abs/2308.03917",
            ],
        )

    def _compute(
        self,
        predictions: list[str] | None = None,
        references: list[str] | None = None,
        feature_set: str = "spe+",
        feature_model: str = "segment",
        is_normalize_max_length: bool = False,
    ) -> dict:
        """Compute phoneme error rates, phone feature error rates (Hamming feature
        edit distance) and feature error rates between prediction and reference strings.

        Args:
            predictions (list[str], optional): Predicted transcriptions. Must be supplied
                in practice: the None default cannot be paired with references.
            references (list[str], optional): Reference transcriptions, paired with
                predictions by position. Must be supplied in practice.
            feature_set (str, optional): Feature set to use in the feature model, see
                panphon documentation for details. Defaults to "spe+".
            feature_model (str, optional): panphon.distance.Distance feature parsing model
                to be used, choose from "strict", "permissive", "segment". Defaults to "segment".
            is_normalize_max_length (bool, optional): Set to true to normalize phone feature
                error rates by maximum length (measure won't be a true metric). Defaults to False.

        Returns:
            dict: For each of the three measures, the list of per-pair values and their
            mean, under the keys "phoneme_error_rates", "mean_phoneme_error_rate",
            "phone_feature_error_rates", "mean_phone_feature_error_rates",
            "feature_error_rates" and "mean_feature_error_rates".
        """
        distance_computer = panphon.distance.Distance(feature_set=feature_set, feature_model=feature_model)

        # The choice of Hamming variant does not depend on the pair, so pick it once.
        if is_normalize_max_length:
            hamming_fn = distance_computer.hamming_feature_edit_distance_div_maxlen
        else:
            hamming_fn = distance_computer.hamming_feature_edit_distance

        # Materialize the pairs so they can be traversed once per measure,
        # even when the inputs are one-shot iterables.
        pairs = list(zip(predictions, references))

        hamming_distances = [hamming_fn(p, r) for p, r in pairs]
        phoneme_error_rates = [distance_computer.phoneme_error_rate(p, r) for p, r in pairs]
        feature_error_rates = [distance_computer.feature_error_rate(p, r) for p, r in pairs]

        return {
            "phoneme_error_rates": phoneme_error_rates,
            "mean_phoneme_error_rate": np.mean(phoneme_error_rates),
            "phone_feature_error_rates": hamming_distances,
            "mean_phone_feature_error_rates": np.mean(hamming_distances),
            "feature_error_rates": feature_error_rates,
            "mean_feature_error_rates": np.mean(feature_error_rates),
        }