ginic committed on
Commit
a91c31a
·
1 Parent(s): 64ff291

Initial attempt at implementing phone distances

Browse files
Files changed (3) hide show
  1. README.md +0 -2
  2. phone_distance.py +61 -30
  3. requirements.txt +3 -1
README.md CHANGED
@@ -1,7 +1,5 @@
1
  ---
2
  title: Phone Distance
3
- datasets:
4
- -
5
  tags:
6
  - evaluate
7
  - metric
 
1
  ---
2
  title: Phone Distance
 
 
3
  tags:
4
  - evaluate
5
  - metric
phone_distance.py CHANGED
@@ -11,29 +11,40 @@
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
- """TODO: Add a description here."""
 
 
15
 
16
  import evaluate
17
  import datasets
 
 
18
 
19
 
20
- # TODO: Add BibTeX citation
21
  _CITATION = """\
22
- @InProceedings{huggingface:module,
23
- title = {A great new module},
24
- authors={huggingface, Inc.},
25
- year={2020}
 
 
 
 
 
 
 
 
26
  }
27
  """
28
 
29
- # TODO: Add description of the module here
30
  _DESCRIPTION = """\
31
- This new module is designed to solve this great ML task and is crafted with a lot of care.
32
  """
33
 
34
 
35
  # TODO: Add description of the arguments of the module here
36
  _KWARGS_DESCRIPTION = """
 
37
  Calculates how good are predictions given some references, using certain scores
38
  Args:
39
  predictions: list of predictions to score. Each predictions
@@ -47,19 +58,16 @@ Examples:
47
  Examples should be written in doctest format, and should illustrate how
48
  to use the function.
49
 
50
- >>> my_new_module = evaluate.load("my_new_module")
51
- >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
52
- >>> print(results)
53
- {'accuracy': 1.0}
54
  """
55
 
56
  # TODO: Define external resources urls if needed
57
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
58
 
59
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
  class PhoneDistance(evaluate.Metric):
62
- """TODO: Short description of my evaluation module."""
63
 
64
  def _info(self):
65
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
@@ -71,25 +79,48 @@ class PhoneDistance(evaluate.Metric):
71
  inputs_description=_KWARGS_DESCRIPTION,
72
  # This defines the format of each prediction and reference
73
  features=datasets.Features({
74
- 'predictions': datasets.Value('int64'),
75
- 'references': datasets.Value('int64'),
76
  }),
77
- # Homepage of the module for documentation
78
- homepage="http://module.homepage",
79
  # Additional links to the codebase or references
80
- codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
81
- reference_urls=["http://path.to.reference.url/new_module"]
82
  )
83
 
84
- def _download_and_prepare(self, dl_manager):
85
- """Optional: download external resources useful to compute the scores"""
86
- # TODO: Download external resources if needed
87
- pass
 
 
 
 
 
88
 
89
- def _compute(self, predictions, references):
90
- """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  return {
94
- "accuracy": accuracy,
95
- }
 
 
 
 
 
 
 
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
+ """Edit distances between Unicode International Phonetic Alphabet strings.
15
+ This is basically a Hugging Face wrapper around the panphon library's distance module.
16
+ """
17
 
18
  import evaluate
19
  import datasets
20
+ import numpy as np
21
+ import panphone.distance
22
 
23
 
 
24
  _CITATION = """\
25
+ @inproceedings{Mortensen-et-al:2016,
26
+ author = {David R. Mortensen and
27
+ Patrick Littell and
28
+ Akash Bharadwaj and
29
+ Kartik Goyal and
30
+ Chris Dyer and
31
+ Lori S. Levin},
32
+ title = {PanPhon: {A} Resource for Mapping {IPA} Segments to Articulatory Feature Vectors},
33
+ booktitle = {Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers},
34
+ pages = {3475--3484},
35
+ publisher = {{ACL}},
36
+ year = {2016}
37
  }
38
  """
39
 
 
40
  _DESCRIPTION = """\
41
+ TODO
42
  """
43
 
44
 
45
  # TODO: Add description of the arguments of the module here
46
  _KWARGS_DESCRIPTION = """
47
+ TODO
48
  Calculates how good are predictions given some references, using certain scores
49
  Args:
50
  predictions: list of predictions to score. Each predictions
 
58
  Examples should be written in doctest format, and should illustrate how
59
  to use the function.
60
 
61
+ >>> my_new_module = evaluate.load("ginic/phone_distance")
 
 
 
62
  """
63
 
64
  # TODO: Define external resources urls if needed
65
+ # BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
66
 
67
 
68
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
69
  class PhoneDistance(evaluate.Metric):
70
+ """Class for computing distance between Unicode IPA strings """
71
 
72
  def _info(self):
73
  # TODO: Specifies the evaluate.EvaluationModuleInfo object
 
79
  inputs_description=_KWARGS_DESCRIPTION,
80
  # This defines the format of each prediction and reference
81
  features=datasets.Features({
82
+ 'predictions': datasets.Value('string', id="sequence"),
83
+ 'references': datasets.Value('string', id="sequence"),
84
  }),
 
 
85
  # Additional links to the codebase or references
86
+ codebase_urls=["https://github.com/dmort27/panphon", "https://huggingface.co/spaces/ginic/phone_distance/tree/main"],
87
+ reference_urls=["https://pypi.org/project/panphon/", "https://arxiv.org/abs/2308.03917"]
88
  )
89
 
90
+ def _compute(self, predictions:list[str]|None=None, references:list[str]|None=None, feature_set:str="spe+", feature_model:str="segment", is_normalize_max_length:bool=False):
91
+ """Computes phoneme error rates, phone feature error rate (Hamming feature edit distance) and feature error rates between prediction and reference strings
92
+
93
+ Args:
94
+ predictions (list[str], optional): Predicted transcriptions. Defaults to None.
95
+ references (list[str], optional): Reference transcriptions. Defaults to None.
96
+ feature_set (str, optional): Feature set to use in the feature model, see panphon documentation for details. Defaults to "spe+".
97
+ feature_model (str, optional): panphon.distance.Distance feature parsing model to be used, choose from "strict", "permissive", "segment". Defaults to "segment".
98
+ is_normalize_max_length (bool, optional): Set to true to normalize phone feature error rates by maximum length (measure won't be a true metric). Defaults to False.
99
 
100
+ Returns:
101
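+ dict: Per-pair score lists under "phoneme_error_rates", "phone_feature_error_rates" (Hamming feature edit distances) and "feature_error_rates", plus their numpy means under "mean_phoneme_error_rate", "mean_phone_feature_error_rates" and "mean_feature_error_rates".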
102
+ """
103
+ distance_computer = panphone.distance.Distance(feature_set=feature_set, feature_model=feature_model)
104
+ phoneme_error_rates = []
105
+ feature_error_rates = []
106
+ hamming_distances = []
107
+ for p, r in zip(predictions, references):
108
+ if is_normalize_max_length:
109
+ hd = distance_computer.hamming_feature_edit_distance_div_maxlen(p, r)
110
+ else:
111
+ hd = distance_computer.hamming_feature_edit_distance(p, r)
112
+ hamming_distances.append(hd)
113
+ per = distance_computer.phone_error_rate(p, r)
114
+ phoneme_error_rates.append(per)
115
+ fer = distance_computer.feature_error_rate(p, r)
116
+ feature_error_rates.append(fer)
117
+
118
  return {
119
+ "phoneme_error_rates": phoneme_error_rates,
120
+ "mean_phoneme_error_rate": np.mean(phoneme_error_rates),
121
+ "phone_feature_error_rates": hamming_distances,
122
+ "mean_phone_feature_error_rates": np.mean(hamming_distances),
123
+ "feature_error_rates": feature_error_rates,
124
+ "mean_feature_error_rates": np.mean(feature_error_rates)
125
+ }
126
+
requirements.txt CHANGED
@@ -1 +1,3 @@
1
- git+https://github.com/huggingface/evaluate@main
 
 
 
1
+ git+https://github.com/huggingface/evaluate@main
2
+ numpy
3
+ panphon==0.20.0