Harveenchadha
/

hindi_model_with_lm_vakyansh

@@ -1,14 +1,22 @@
 #!/usr/bin/env python3
 import argparse
 import re
 from typing import Dict
-import torch
 from datasets import Audio, Dataset, load_dataset, load_metric
 from transformers import AutoFeatureExtractor, pipeline
 def log_results(result: Dataset, args: Dict[str, str]):
     """DO NOT CHANGE. This function computes and logs the result metrics."""
@@ -50,9 +58,9 @@ def log_results(result: Dataset, args: Dict[str, str]):
 def normalize_text(text: str) -> str:
     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
-    chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
-    text = re.sub(chars_to_ignore_regex, "", text.lower())
     # In addition, we can normalize the target text, e.g. removing new lines characters etc...
     # note that order is important here!
@@ -63,6 +71,12 @@ def normalize_text(text: str) -> str:
     return text
 def main(args):
     # load dataset
@@ -79,9 +93,7 @@ def main(args):
     dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
     # load eval pipeline
-    if args.device is None:
-        args.device = 0 if torch.cuda.is_available() else -1
-    asr = pipeline("automatic-speech-recognition", model=args.model_id, device=args.device)
     # map function to decode audio
     def map_to_pred(batch):
@@ -89,8 +101,10 @@ def main(args):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )
-        batch["prediction"] = prediction["text"].replace('<s>','')
         batch["target"] = normalize_text(batch["sentence"])
         return batch
     # run inference on all examples
@@ -126,12 +140,6 @@ if __name__ == "__main__":
     parser.add_argument(
         "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
     )
-    parser.add_argument(
-        "--device",
-        type=int,
-        default=None,
-        help="The device to run the pipeline on. -1 for CPU (default), 0 for the first GPU and so on.",
-    )
     args = parser.parse_args()
     main(args)

 #!/usr/bin/env python3
+# Requires the Indic NLP Library: pip install indic-nlp-library
+from indicnlp.tokenize.indic_tokenize import trivial_tokenize
+from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
 import argparse
 import re
 from typing import Dict
 from datasets import Audio, Dataset, load_dataset, load_metric
 from transformers import AutoFeatureExtractor, pipeline
+indic_normalizer_factory = IndicNormalizerFactory()  # built once at module level
+indic_normalizer = indic_normalizer_factory.get_normalizer('hi')  # 'hi' normalizer reused by normalize_text_indic()
 def log_results(result: Dataset, args: Dict[str, str]):
     """DO NOT CHANGE. This function computes and logs the result metrics."""
 def normalize_text(text: str) -> str:
     """DO ADAPT FOR YOUR USE CASE. this function normalizes the target text."""
+    chars_to_ignore_regex = '[।,?.!\-\;\:"“%‘”�—’…–]'  # noqa: W605 IMPORTANT: this should correspond to the chars that were ignored during training
+    text = re.sub(chars_to_ignore_regex, "", text.lower().strip())
     # In addition, we can normalize the target text, e.g. removing new lines characters etc...
     # note that order is important here!
     return text
+def normalize_text_indic(text:str) -> str:
+    lang='hi'
+    normalized = indic_normalizer.normalize(text)
+    processed = ' '.join(trivial_tokenize(normalized, lang))
+    return processed
 def main(args):
     # load dataset
     dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
     # load eval pipeline
+    asr = pipeline("automatic-speech-recognition", model=args.model_id)
     # map function to decode audio
     def map_to_pred(batch):
             batch["audio"]["array"], chunk_length_s=args.chunk_length_s, stride_length_s=args.stride_length_s
         )
+        batch["prediction"] = prediction["text"]
         batch["target"] = normalize_text(batch["sentence"])
+        batch["prediction"] = normalize_text_indic(batch["prediction"] )
+        batch["target"] = normalize_text_indic(batch["target"] )
         return batch
     # run inference on all examples
     parser.add_argument(
         "--log_outputs", action="store_true", help="If defined, write outputs to log file for analysis."
     )
     args = parser.parse_args()
     main(args)