Update app.py
app.py
CHANGED
@@ -2,11 +2,13 @@ import gradio as gr
 from gtts import gTTS
 import tempfile
 import os
-import difflib
 import torch
 import re
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torchaudio
+import difflib
+import pandas as pd
+from Levenshtein import distance as lev_distance
 
 # Load AI4Bharat Hindi model & processor
 MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
@@ -17,13 +19,52 @@ def play_text(text):
     tts = gTTS(text=text, lang='hi', slow=False)
     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
     tts.save(temp_file.name)
-
-    os.system(f"start {temp_file.name}")
+    os.system(f"start {temp_file.name}")  # Windows only
     return "✅ Text is being read out. Please listen and read it yourself."
 
+def get_error_type(asr_word, correct_word):
+    # An empty slot on either side means a missing or extra word
+    if not asr_word:
+        return "Missing word"
+    if not correct_word:
+        return "Extra word"
+    # Spelling error: small Levenshtein distance
+    if lev_distance(asr_word, correct_word) <= 2:
+        return "Spelling mistake"
+    # Matra/phonetic error: shared characters but wrong form
+    set1, set2 = set(asr_word), set(correct_word)
+    if set1 & set2:
+        return "Phonetic/Matra error"
+    return "Substitution/Distorted"
+
+def compare_hindi_sentences(expected, transcribed):
+    # Split by whitespace for Hindi
+    expected_words = expected.strip().split()
+    transcribed_words = transcribed.strip().split()
+
+    matcher = difflib.SequenceMatcher(None, transcribed_words, expected_words)
+    errors = []
+
+    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
+        if opcode == "equal":
+            continue
+        elif opcode == "replace":
+            for k in range(max(i2 - i1, j2 - j1)):
+                asr_word = transcribed_words[i1 + k] if i1 + k < i2 else ""
+                correct_word = expected_words[j1 + k] if j1 + k < j2 else ""
+                error_type = get_error_type(asr_word, correct_word)
+                errors.append((asr_word, correct_word, error_type))
+        elif opcode == "insert":
+            for k in range(j1, j2):
+                errors.append(("", expected_words[k], "Missing word"))
+        elif opcode == "delete":
+            for k in range(i1, i2):
+                errors.append((transcribed_words[k], "", "Extra word"))
+    return errors
+
 def transcribe_audio(audio_path, original_text):
     try:
-        # 1. Load
+        # 1. Load and pre-process audio
         waveform, sample_rate = torchaudio.load(audio_path)
         if waveform.shape[0] > 1:
             waveform = waveform.mean(dim=0, keepdim=True)
@@ -31,39 +72,37 @@ def transcribe_audio(audio_path, original_text):
         transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = transform(waveform)
 
-        #
-        GAIN = 1.5
+        # Amplify voice intensity
+        GAIN = 1.5
         waveform = waveform * GAIN
-        waveform = torch.clamp(waveform, -1.0, 1.0)
+        waveform = torch.clamp(waveform, -1.0, 1.0)
 
         input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
-
+
         # 2. Transcribe with AI4Bharat model
         with torch.no_grad():
             logits = model(input_values).logits
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.decode(predicted_ids[0])
 
-        # 3.
-        original_words = original_text.strip().split()
-        transcribed_words = transcription.strip().split()
-        matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
-        accuracy = round(matcher.ratio() * 100, 2)
+        # 3. Error analysis (as table)
+        errors = compare_hindi_sentences(original_text, transcription)
+        df_errors = pd.DataFrame(errors, columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
 
-        # Speaking speed
+        # Speaking speed
+        transcribed_words = transcription.strip().split()
         duration = waveform.shape[1] / 16000
         speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
 
         result = {
             "📝 Transcribed Text": transcription,
-            "🎯 Accuracy (%)": accuracy,
-            "⏱️ Speaking Speed (words/sec)": speed
+            "⏱️ Speaking Speed (words/sec)": speed,
         }
-        return result
-    except Exception as e:
-        return {"error": str(e)}
+        # Return table as a separate output (Gradio Dataframe)
+        return result, df_errors
 
-
+    except Exception as e:
+        return {"error": str(e)}, pd.DataFrame(columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
 
 with gr.Blocks() as app:
     gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")
@@ -79,7 +118,12 @@ with gr.Blocks() as app:
 
     submit_button = gr.Button("✅ Submit Recording for Checking")
     output = gr.JSON(label="Results")
+    error_table = gr.Dataframe(headers=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"], label="गलती तालिका (Error Table)")
 
-    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=output)
+    submit_button.click(
+        transcribe_audio,
+        inputs=[audio_input, input_text],
+        outputs=[output, error_table]
+    )
 
 app.launch()
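As a sanity check on the new alignment logic, here is a minimal standalone sketch of what compare_hindi_sentences sees for one reading; the two Hindi sentences are made-up examples, not data from the app:

import difflib
from Levenshtein import distance as lev_distance

# Expected sentence vs. a hypothetical ASR transcription of the learner's reading
expected = "मैं रोज़ सुबह स्कूल जाता हूँ"
transcribed = "मैं सुबह स्कुल जाता हूँ"

expected_words = expected.split()
transcribed_words = transcribed.split()

# Same argument order as in the commit: transcription first, expected second
matcher = difflib.SequenceMatcher(None, transcribed_words, expected_words)
for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
    print(opcode, transcribed_words[i1:i2], expected_words[j1:j2])
# equal   ['मैं'] ['मैं']
# insert  [] ['रोज़']           -> reported as "Missing word"
# equal   ['सुबह'] ['सुबह']
# replace ['स्कुल'] ['स्कूल']   -> handed to get_error_type
# equal   ['जाता', 'हूँ'] ['जाता', 'हूँ']

print(lev_distance('स्कुल', 'स्कूल'))  # 1 -> within the <= 2 "Spelling mistake" threshold

One consequence of the ordering inside get_error_type: the Levenshtein check runs before the shared-character check, so a single-matra slip such as स्कुल/स्कूल is labelled "Spelling mistake", never "Phonetic/Matra error". If matra confusions should take priority, those two checks would need to swap places.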
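The two-value return from transcribe_audio lines up positionally with outputs=[output, error_table]: Gradio hands the first returned value to the gr.JSON component and the second to the gr.Dataframe. A stripped-down sketch of the same pattern (demo_fn and these components are illustrative, not part of the app):

import gradio as gr
import pandas as pd

def demo_fn(text):
    # First return value -> gr.JSON, second -> gr.Dataframe, matched by position
    return {"echo": text}, pd.DataFrame({"word": text.split()})

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Input")
    btn = gr.Button("Run")
    js = gr.JSON(label="Dict output")
    tbl = gr.Dataframe(label="Table output")
    btn.click(demo_fn, inputs=inp, outputs=[js, tbl])

demo.launch()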
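One caveat the new "# Windows only" comment points at: os.system(f"start {temp_file.name}") opens the mp3 on the machine running the script, so on a hosted (Linux) Space the visitor hears nothing. A more portable sketch, assuming the UI were reworked to use a gr.Audio(type="filepath") output in place of the status string, would return the file path and let the browser play it:

import tempfile
from gtts import gTTS

def play_text(text):
    tts = gTTS(text=text, lang='hi', slow=False)
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    tts.save(temp_file.name)
    # Return the path for a gr.Audio(type="filepath") component to play
    # in the browser, rather than launching a player on the server
    return temp_file.name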