Abdurahman
committed on
Commit
·
f0e249a
1
Parent(s):
f4d4d8e
app
Browse files
asr.py
CHANGED
@@ -2,7 +2,6 @@ import numpy as np
|
|
2 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
3 |
import torch
|
4 |
import torchaudio
|
5 |
-
from umsc import UgMultiScriptConverter
|
6 |
import util
|
7 |
|
8 |
# Model ID and setup
|
@@ -46,16 +45,12 @@ def asr(audio_data, target_rate = 16000):
|
|
46 |
def check_pronunciation(input_text, script, user_audio):
|
47 |
# Transcripts from user input audio
|
48 |
transcript_ugLatn_box = asr(user_audio)
|
49 |
-
|
50 |
-
transcript_ugArab_box = ug_latn_to_arab(transcript_ugLatn_box)
|
51 |
-
|
52 |
-
if script == 'Uyghur Latin':
|
53 |
-
input_text = ug_latn_to_arab(input_text) # make sure input text is arabic script for IPA conversion
|
54 |
|
55 |
# Get IPA and Pronunciation Feedback
|
56 |
machine_pronunciation, user_pronunciation, pronunciation_match, pronunciation_score = util.calculate_pronunciation_accuracy(
|
57 |
reference_text = input_text,
|
58 |
output_text = transcript_ugArab_box,
|
59 |
-
|
60 |
|
61 |
return transcript_ugArab_box, transcript_ugLatn_box, machine_pronunciation, user_pronunciation, pronunciation_match, pronunciation_score
|
|
|
2 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
3 |
import torch
|
4 |
import torchaudio
|
|
|
5 |
import util
|
6 |
|
7 |
# Model ID and setup
|
|
|
45 |
def check_pronunciation(input_text, script, user_audio):
|
46 |
# Transcripts from user input audio
|
47 |
transcript_ugLatn_box = asr(user_audio)
|
48 |
+
transcript_ugArab_box = util.ug_latn_to_arab(transcript_ugLatn_box)
|
|
|
|
|
|
|
|
|
49 |
|
50 |
# Get IPA and Pronunciation Feedback
|
51 |
machine_pronunciation, user_pronunciation, pronunciation_match, pronunciation_score = util.calculate_pronunciation_accuracy(
|
52 |
reference_text = input_text,
|
53 |
output_text = transcript_ugArab_box,
|
54 |
+
script=script)
|
55 |
|
56 |
return transcript_ugArab_box, transcript_ugLatn_box, machine_pronunciation, user_pronunciation, pronunciation_match, pronunciation_score
|
tts.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from transformers import VitsModel, AutoTokenizer
|
2 |
import torch
|
3 |
-
from umsc import UgMultiScriptConverter
|
4 |
import scipy.io.wavfile
|
|
|
5 |
|
6 |
# Model ID and setup
|
7 |
model_id = "facebook/mms-tts-uig-script_arabic"
|
@@ -17,9 +17,8 @@ def generate_audio(input_text, script):
|
|
17 |
Generate audio for the given input text and script
|
18 |
"""
|
19 |
# Convert text to Uyghur Arabic if needed
|
20 |
-
ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
|
21 |
if script != "Uyghur Arabic":
|
22 |
-
input_text = ug_latn_to_arab(input_text)
|
23 |
|
24 |
# Tokenize and move inputs to the same device as the model
|
25 |
tts_inputs = tts_tokenizer(input_text, return_tensors="pt").to(device)
|
|
|
1 |
from transformers import VitsModel, AutoTokenizer
|
2 |
import torch
|
|
|
3 |
import scipy.io.wavfile
|
4 |
+
import util
|
5 |
|
6 |
# Model ID and setup
|
7 |
model_id = "facebook/mms-tts-uig-script_arabic"
|
|
|
17 |
Generate audio for the given input text and script
|
18 |
"""
|
19 |
# Convert text to Uyghur Arabic if needed
|
|
|
20 |
if script != "Uyghur Arabic":
|
21 |
+
input_text = util.ug_latn_to_arab(input_text)
|
22 |
|
23 |
# Tokenize and move inputs to the same device as the model
|
24 |
tts_inputs = tts_tokenizer(input_text, return_tensors="pt").to(device)
|
util.py
CHANGED
@@ -4,6 +4,7 @@ import string
|
|
4 |
import epitran
|
5 |
from difflib import SequenceMatcher
|
6 |
|
|
|
7 |
# Lists of Uyghur short and long texts
|
8 |
short_texts = [
|
9 |
"سالام", "رەھمەت", "ياخشىمۇسىز", "خۇش كېپسىز", "خەيرلىك كۈن", "خەير خوش"
|
@@ -15,8 +16,15 @@ long_texts = [
|
|
15 |
"قىلىچ قان تامغۇزسا، بەگ ئەل ئالىدۇ؛ قەلەمدىن سىياھتانسا، ئالتۇن كېلىدۇ."
|
16 |
]
|
17 |
|
18 |
-
#
|
19 |
ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
def generate_short_text(script_choice):
|
21 |
"""Generate a random Uyghur short text based on the type."""
|
22 |
text = random.choice(short_texts)
|
@@ -27,7 +35,15 @@ def generate_long_text(script_choice):
|
|
27 |
text = random.choice(long_texts)
|
28 |
return ug_arab_to_latn(text) if script_choice == "Uyghur Latin" else text
|
29 |
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
# def load_and_resample_audio(audio_data, target_rate):
|
32 |
# """Load audio and resample based on target sample rate"""
|
33 |
# if isinstance(audio_data, tuple):
|
@@ -46,7 +62,7 @@ def generate_long_text(script_choice):
|
|
46 |
|
47 |
# return audio_input, target_rate
|
48 |
|
49 |
-
def calculate_pronunciation_accuracy(reference_text, output_text,
|
50 |
"""
|
51 |
Calculate pronunciation accuracy between reference and ASR output text using Epitran.
|
52 |
|
@@ -60,8 +76,10 @@ def calculate_pronunciation_accuracy(reference_text, output_text, language_code=
|
|
60 |
str: IPA transliteration of the reference text.
|
61 |
str: IPA transliteration of the output text.
|
62 |
"""
|
63 |
-
|
64 |
-
|
|
|
|
|
65 |
|
66 |
# Remove punctuation from both texts
|
67 |
reference_text_clean = remove_punctuation(reference_text)
|
@@ -93,9 +111,3 @@ def calculate_pronunciation_accuracy(reference_text, output_text, language_code=
|
|
93 |
|
94 |
return reference_ipa, output_ipa, comparison_md, pronunciation_accuracy
|
95 |
|
96 |
-
def remove_punctuation(text):
|
97 |
-
"""Helper function to remove punctuation from text."""
|
98 |
-
extra_punctuation = "–؛;،؟?«»‹›−—¬”“" # Add your additional custom punctuation from the training set here
|
99 |
-
all_punctuation = string.punctuation + extra_punctuation
|
100 |
-
|
101 |
-
return text.translate(str.maketrans('', '', all_punctuation))
|
|
|
4 |
import epitran
|
5 |
from difflib import SequenceMatcher
|
6 |
|
7 |
+
## Global Vars
|
8 |
# Lists of Uyghur short and long texts
|
9 |
short_texts = [
|
10 |
"سالام", "رەھمەت", "ياخشىمۇسىز", "خۇش كېپسىز", "خەيرلىك كۈن", "خەير خوش"
|
|
|
16 |
"قىلىچ قان تامغۇزسا، بەگ ئەل ئالىدۇ؛ قەلەمدىن سىياھتانسا، ئالتۇن كېلىدۇ."
|
17 |
]
|
18 |
|
19 |
+
# Initialize uyghur script converter
|
20 |
ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
|
21 |
+
ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
|
22 |
+
|
23 |
+
# Initialize Epitran for Uyghur (Arabic script)
|
24 |
+
ipa_converter = epitran.Epitran(language_code='uig-Arab')
|
25 |
+
|
26 |
+
|
27 |
+
## Front-End Utils
|
28 |
def generate_short_text(script_choice):
|
29 |
"""Generate a random Uyghur short text based on the type."""
|
30 |
text = random.choice(short_texts)
|
|
|
35 |
text = random.choice(long_texts)
|
36 |
return ug_arab_to_latn(text) if script_choice == "Uyghur Latin" else text
|
37 |
|
38 |
+
|
39 |
+
## ASR Utils
|
40 |
+
def remove_punctuation(text):
|
41 |
+
"""Helper function to remove punctuation from text."""
|
42 |
+
extra_punctuation = "–؛;،؟?«»‹›−—¬”“" # Add your additional custom punctuation from the training set here
|
43 |
+
all_punctuation = string.punctuation + extra_punctuation
|
44 |
+
|
45 |
+
return text.translate(str.maketrans('', '', all_punctuation))
|
46 |
+
|
47 |
# def load_and_resample_audio(audio_data, target_rate):
|
48 |
# """Load audio and resample based on target sample rate"""
|
49 |
# if isinstance(audio_data, tuple):
|
|
|
62 |
|
63 |
# return audio_input, target_rate
|
64 |
|
65 |
+
def calculate_pronunciation_accuracy(reference_text, output_text, script):
|
66 |
"""
|
67 |
Calculate pronunciation accuracy between reference and ASR output text using Epitran.
|
68 |
|
|
|
76 |
str: IPA transliteration of the reference text.
|
77 |
str: IPA transliteration of the output text.
|
78 |
"""
|
79 |
+
|
80 |
+
|
81 |
+
if script == 'Uyghur Latin':
|
82 |
+
reference_text = ug_latn_to_arab(reference_text) # make sure input text is arabic script for IPA conversion
|
83 |
|
84 |
# Remove punctuation from both texts
|
85 |
reference_text_clean = remove_punctuation(reference_text)
|
|
|
111 |
|
112 |
return reference_ipa, output_ipa, comparison_md, pronunciation_accuracy
|
113 |
|
|
|
|
|
|
|
|
|
|
|
|