"""Utilities for generating Uyghur practice text and scoring pronunciation."""

import random
import string
from difflib import SequenceMatcher

import epitran
import pandas as pd
from umsc import UgMultiScriptConverter

# # For googletrans 4.0.0-rc1
# import httpcore
# setattr(httpcore, 'SyncHTTPTransport', 'AsyncHTTPProxy')
# from googletrans import Translator, LANGCODES

## Global Vars

# Lists of Uyghur short and long texts
short_texts = [
    "ياخشىمۇسىز",
    "تىشلىقمۇ",
    "بەلكىم",
    "خەيرلىك كۈن",
    "خەير خوش",
    "كەچۈرۈڭ",
    "رەھمەت",
    "ئەرزىمەيدۇ",
    "ياردەملىشىڭ",
    "توختا",
    "چۈشەندىم",
    "ھەئە",
    "ياق",
]

long_texts = [
    "مەكتەپكە بارغاندا تېخىمۇ بىلىملىك بولۇمەن.",
    "يېزا مەنزىرىسى ھەقىقەتەن گۈزەل.",
    "بىزنىڭ ئۆيدە تۆت تەكچە، تۆتىلىسى تەك-تەكچە",
    "قىلىچ قان تامغۇزسا، بەگ ئەل ئالىدۇ؛ قەلەمدىن سىياھتانسا، ئالتۇن كېلىدۇ.",
    "ئۇ بىر كۆزگە كۆرۈنگەن ناخشىچى",
    "بۇ پۇتبول مۇسابىقىسىنىڭ ئاخىرلىشىشى.",
]

# Load some more Uyghur text to extend the long texts (first CSV column only)
df = pd.read_csv('uyghur_texts.csv', header=None)
long_texts += df.iloc[:, 0].tolist()

# # Initialize the translator
# translator = Translator()
# translation_choices = [L for L in LANGCODES]

# Translation is disabled while the googletrans setup above is commented out.
# These placeholders let translate_text fail with a clear error instead of a
# NameError; remove them when re-enabling googletrans.
translator = None
LANGCODES = None

# Initialize Uyghur script converters (Arabic script <-> Latin script)
ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')

# Initialize Epitran for Uyghur (Arabic script)
ipa_converter = epitran.Epitran('uig-Arab')

# Additional custom Uyghur punctuation on top of ASCII punctuation
_EXTRA_PUNCTUATION = "–؛;،؟?«»‹›−—¬”“"
# Built once at import time so remove_punctuation does not rebuild it per call
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + _EXTRA_PUNCTUATION)


## Front-End Utils

def generate_short_text(script_choice):
    """Return a random short Uyghur text.

    The text is converted to Latin script when ``script_choice`` is
    ``"Uyghur Latin"``; otherwise it is returned in Arabic script.
    """
    text = random.choice(short_texts)
    return ug_arab_to_latn(text) if script_choice == "Uyghur Latin" else text


def generate_long_text(script_choice):
    """Return a random long Uyghur text.

    The text is converted to Latin script when ``script_choice`` is
    ``"Uyghur Latin"``; otherwise it is returned as stored.
    """
    text = random.choice(long_texts)
    return ug_arab_to_latn(text) if script_choice == "Uyghur Latin" else text


def translate_text(input_text, script_choice, target_language):
    """Translate Uyghur text to the target language.

    Args:
        input_text: Uyghur text to translate.
        script_choice: ``"Uyghur Latin"`` if ``input_text`` is in Latin
            script; it is then converted to Arabic script first.
        target_language: Key into ``LANGCODES`` naming the destination
            language.

    Returns:
        The translated text.

    Raises:
        RuntimeError: If the translator has not been initialized (the
            googletrans setup at the top of this module is commented out).
    """
    if translator is None or LANGCODES is None:
        raise RuntimeError(
            "Translation is unavailable: the googletrans translator is not initialized."
        )

    if script_choice == 'Uyghur Latin':
        input_text = ug_latn_to_arab(input_text)

    translated_text = translator.translate(
        input_text, src="ug", dest=LANGCODES[target_language]
    )
    return translated_text.text


## ASR Utils

def remove_punctuation(text):
    """Return ``text`` with ASCII and extra Uyghur punctuation removed."""
    return text.translate(_PUNCTUATION_TABLE)


# def load_and_resample_audio(audio_data, target_rate):
#     """Load audio and resample based on target sample rate"""
#     if isinstance(audio_data, tuple):
#         # microphone
#         sampling_rate, audio_input = audio_data
#         audio_input = (audio_input / 32768.0).astype(np.float32)
#     elif isinstance(audio_data, str):
#         # file upload
#         audio_input, sampling_rate = torchaudio.load(audio_data)
#     else:
#         return "<>".format(type(audio_data))
#     # Resample if needed
#     if sampling_rate != target_rate:
#         resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
#         audio_input = resampler(audio_input)
#     return audio_input, target_rate


def calculate_pronunciation_accuracy(reference_text, output_text, script_choice):
    """Score ASR output against a reference text and build visual feedback.

    Both texts are stripped of punctuation and compared character by
    character with ``difflib.SequenceMatcher``. IPA transliterations of both
    texts are produced with Epitran and returned alongside the score; the
    score itself is computed on the cleaned text, not on the IPA strings.

    Args:
        reference_text: The text the speaker was asked to read.
        output_text: The ASR transcription. It is passed to the Arabic-script
            IPA converter without script conversion.
        script_choice: ``"Uyghur Latin"`` if ``reference_text`` is in Latin
            script; the feedback is then rendered in Latin script as well.

    Returns:
        A tuple ``(reference_ipa, output_ipa, comparison_md,
        pronunciation_accuracy)`` where ``comparison_md`` is an HTML snippet
        highlighting the reference text and ``pronunciation_accuracy`` is a
        percentage in the range 0-100.
    """
    use_latin = script_choice == 'Uyghur Latin'

    # Make sure the reference is Arabic script for IPA conversion
    if use_latin:
        reference_text = ug_latn_to_arab(reference_text)

    # Remove punctuation from both texts
    reference_text_clean = remove_punctuation(reference_text)
    output_text_clean = remove_punctuation(output_text)

    # Transliterate both texts to IPA
    reference_ipa = ipa_converter.transliterate(reference_text_clean)
    output_ipa = ipa_converter.transliterate(output_text_clean)

    # ratio() is the fraction of matching characters; scale to a percentage
    matcher = SequenceMatcher(None, reference_text_clean, output_text_clean)
    pronunciation_accuracy = matcher.ratio() * 100

    # Build the styled comparison: a small header, then a spaced container
    feedback_parts = [
        "<h4>Pronunciation Feedback</h4>\n",
        "<div style='margin-top: 8px;'>\n",
    ]
    for opcode, i1, i2, _j1, _j2 in matcher.get_opcodes():
        # The opcode indices refer to the Arabic-script reference, so slice
        # that string first and only then convert the segment for display.
        ref_segment = reference_text_clean[i1:i2]
        if not ref_segment:
            # 'insert' opcodes have no reference characters to highlight
            continue
        if use_latin:
            # NOTE: converting per segment keeps highlights aligned; context
            # at segment boundaries is not visible to the converter.
            ref_segment = ug_arab_to_latn(ref_segment)

        # Matching characters are green; mismatched or missing ones are red
        color = "green" if opcode == 'equal' else "red"
        feedback_parts.append(f'<span style="color: {color};">{ref_segment}</span>')
    feedback_parts.append("</div>")
    comparison_md = "".join(feedback_parts)

    return reference_ipa, output_ipa, comparison_md, pronunciation_accuracy