File size: 5,361 Bytes
734a7ea d29fa84 d42db5f 81cf146 f0e249a 734a7ea 38f4de1 734a7ea d29fa84 734a7ea c0b2049 38f4de1 734a7ea d42db5f 734a7ea d42db5f 8e57d14 f0e249a efdb44e f0e249a 6b61ee0 f0e249a 734a7ea efdb44e 734a7ea efdb44e d29fa84 8e57d14 f0e249a 8e57d14 f0e249a 18f99c6 d29fa84 8e57d14 d29fa84 6eb8cb9 6b61ee0 8e57d14 6b61ee0 d29fa84 43ce49e d29fa84 ff91a06 56a90e5 ff91a06 d29fa84 6eb8cb9 d29fa84 8e57d14 c03de5c 77775b5 ff91a06 c03de5c d29fa84 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
import random
from umsc import UgMultiScriptConverter
import string
import epitran
from difflib import SequenceMatcher
import pandas as pd
# # For googletrans 4.0.0-rc1
# import httpcore
# setattr(httpcore, 'SyncHTTPTransport', 'AsyncHTTPProxy')
# from googletrans import Translator, LANGCODES
## Global Vars
# Built-in Uyghur sample texts, written as Arabic-script string literals.
# short_texts: pool that generate_short_text() draws a random entry from.
short_texts = [
"ياخشىمۇسىز",
"تىشلىقمۇ",
"بەلكىم",
"خەيرلىك كۈن",
"خەير خوش",
"كەچۈرۈڭ",
"رەھمەت",
"ئەرزىمەيدۇ",
"ياردەملىشىڭ",
"توختا",
"چۈشەندىم",
"ھەئە",
"ياق"
]
# long_texts: pool that generate_long_text() draws a random entry from.
# The list is mutable on purpose: more entries are appended to it at import
# time from a CSV file further down in this module.
long_texts = [
"مەكتەپكە بارغاندا تېخىمۇ بىلىملىك بولۇمەن.",
"يېزا مەنزىرىسى ھەقىقەتەن گۈزەل.",
"بىزنىڭ ئۆيدە تۆت تەكچە، تۆتىلىسى تەك-تەكچە",
"قىلىچ قان تامغۇزسا، بەگ ئەل ئالىدۇ؛ قەلەمدىن سىياھتانسا، ئالتۇن كېلىدۇ.",
"ئۇ بىر كۆزگە كۆرۈنگەن ناخشىچى",
"بۇ پۇتبول مۇسابىقىسىنىڭ ئاخىرلىشىشى."
]
# Grow the long-text pool with the first column of 'uyghur_texts.csv'.
# The file is read with header=None, so its first row is treated as data.
# The path is relative, so it resolves against the current working directory.
df = pd.read_csv('uyghur_texts.csv', header=None)
long_texts.extend(df.iloc[:, 0].tolist())
# # Initialize the translator
# translator = Translator()
# translation_choices = [L for L in LANGCODES]
# Initialize the Uyghur script converters. Each instance is used as a
# callable on a string elsewhere in this module.
# NOTE(review): 'UAS' / 'ULS' are presumably umsc's codes for Uyghur Arabic
# Script and Uyghur Latin Script (source, target) — confirm against umsc docs.
ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
# Initialize Epitran for Uyghur (Arabic script); its .transliterate() method
# is used below to turn text into IPA.
ipa_converter = epitran.Epitran('uig-Arab')
## Front-End Utils
def generate_short_text(script_choice):
    """Return a randomly chosen entry of ``short_texts``.

    When ``script_choice`` is exactly "Uyghur Latin" the entry is passed
    through ``ug_arab_to_latn`` first; for any other value it is returned
    as stored.
    """
    sample = random.choice(short_texts)
    if script_choice == "Uyghur Latin":
        return ug_arab_to_latn(sample)
    return sample
def generate_long_text(script_choice):
    """Return a randomly chosen entry of ``long_texts``.

    When ``script_choice`` is exactly "Uyghur Latin" the entry is passed
    through ``ug_arab_to_latn`` first; for any other value it is returned
    as stored.
    """
    sample = random.choice(long_texts)
    if script_choice == "Uyghur Latin":
        return ug_arab_to_latn(sample)
    return sample
def translate_text(input_text, script_choice, target_language):
    """
    Translate Uyghur text to the target language.

    Args:
        input_text: The Uyghur text to translate.
        script_choice: When equal to 'Uyghur Latin', the text is first passed
            through ``ug_latn_to_arab``; any other value leaves it unchanged.
        target_language: Key looked up in ``LANGCODES`` to obtain the
            destination language code.

    Returns:
        The ``.text`` attribute of the object returned by
        ``translator.translate``.

    Raises:
        RuntimeError: If the module-level ``translator`` / ``LANGCODES``
            objects are not defined. Their googletrans-based setup is
            commented out in this module, so without re-enabling it every
            call would otherwise die with an opaque ``NameError``.
        KeyError: If ``target_language`` is not a key of ``LANGCODES``.
    """
    # Resolve the optional translation backend up front so a missing setup is
    # reported clearly, before any work is done on the input text.
    try:
        active_translator = translator
        language_codes = LANGCODES
    except NameError as err:
        raise RuntimeError(
            "Translation is unavailable: the googletrans Translator/LANGCODES "
            "setup is not enabled in this module."
        ) from err

    if script_choice == 'Uyghur Latin':
        input_text = ug_latn_to_arab(input_text)
    translated_text = active_translator.translate(
        input_text, src="ug", dest=language_codes[target_language]
    )
    return translated_text.text
## ASR Utils
def remove_punctuation(text):
    """Return ``text`` with every punctuation character deleted.

    The characters removed are ``string.punctuation`` plus the additional
    marks listed in ``uyghur_marks`` below; everything else, including
    whitespace, is kept as is.
    """
    uyghur_marks = "–؛;،؟?«»‹›−—¬”“"  # Additional custom uyghur punctuation
    unwanted = string.punctuation + uyghur_marks
    # A translate table that maps a code point to None deletes that character.
    deletion_table = dict.fromkeys(map(ord, unwanted))
    return text.translate(deletion_table)
# def load_and_resample_audio(audio_data, target_rate):
# """Load audio and resample based on target sample rate"""
# if isinstance(audio_data, tuple):
# # microphone
# sampling_rate, audio_input = audio_data
# audio_input = (audio_input / 32768.0).astype(np.float32)
# elif isinstance(audio_data, str):
# # file upload
# audio_input, sampling_rate = torchaudio.load(audio_data)
# else:
# return "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))
# # Resample if needed
# if sampling_rate != target_rate:
# resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
# audio_input = resampler(audio_input)
# return audio_input, target_rate
def calculate_pronunciation_accuracy(reference_text, output_text, script_choice):
    """
    Calculate pronunciation accuracy between reference and ASR output text using Epitran.

    Both texts are stripped of punctuation and transliterated to IPA, and the
    two IPA strings are aligned with ``difflib.SequenceMatcher``. The
    alignment is done on the IPA strings themselves, so the opcode indices
    are valid offsets into the IPA text that gets highlighted.

    Args:
        reference_text: The text the speaker was asked to read.
        output_text: The text produced by the ASR model.
            NOTE(review): only ``reference_text`` is converted when
            ``script_choice`` is 'Uyghur Latin'; this assumes ``output_text``
            is already in Arabic script — confirm against the caller.
        script_choice: When equal to 'Uyghur Latin', ``reference_text`` is
            converted with ``ug_latn_to_arab`` before IPA transliteration.

    Returns:
        A tuple ``(reference_ipa, output_ipa, comparison_md,
        pronunciation_accuracy)``: the two IPA strings, an HTML snippet that
        shows the reference IPA with matched parts in green and mismatched
        parts in red, and the match ratio as a percentage (0-100).
    """
    # make sure input text is arabic script for IPA conversion
    if script_choice == 'Uyghur Latin':
        reference_text = ug_latn_to_arab(reference_text)

    # Remove punctuation from both texts, then transliterate them to IPA
    reference_ipa = ipa_converter.transliterate(remove_punctuation(reference_text))
    output_ipa = ipa_converter.transliterate(remove_punctuation(output_text))

    # Align the IPA strings (not the script text): the opcode offsets below
    # are used to slice reference_ipa, so they must refer to that string.
    matcher = SequenceMatcher(None, reference_ipa, output_ipa)
    pronunciation_accuracy = matcher.ratio() * 100  # fraction -> percentage

    # Generate Markdown-compatible styled text
    parts = [
        "<h4>Pronunciation Feedback (mistakes in red)</h4>\n",  # Small header
        "<div style='margin-top: 10px;'>\n",  # Add some spacing
    ]
    for opcode, i1, i2, _j1, _j2 in matcher.get_opcodes():
        ref_segment = reference_ipa[i1:i2]
        # 'equal' is a match; 'replace', 'delete' and 'insert' are mistakes.
        # For 'insert' the reference segment is empty, so nothing is shown.
        color = "green" if opcode == 'equal' else "red"
        parts.append(
            f'<span style="color: {color}; font-size: 20px;">{ref_segment}</span>'
        )
    parts.append("</div>")
    comparison_md = "".join(parts)

    return reference_ipa, output_ipa, comparison_md, pronunciation_accuracy
|