NLPV committed
Commit ba18501 · verified · Parent: a59a577

Update app.py

Files changed (1):
  1. app.py +65 -21
app.py CHANGED
@@ -2,11 +2,13 @@ import gradio as gr
 from gtts import gTTS
 import tempfile
 import os
-import difflib
 import torch
 import re
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 import torchaudio
+import difflib
+import pandas as pd
+from Levenshtein import distance as lev_distance
 
 # Load AI4Bharat Hindi model & processor
 MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
@@ -17,13 +19,52 @@ def play_text(text):
     tts = gTTS(text=text, lang='hi', slow=False)
     temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
     tts.save(temp_file.name)
-    # Windows: "start", Mac: "afplay", Linux: "mpg123" (edit as needed)
-    os.system(f"start {temp_file.name}")
+    os.system(f"start {temp_file.name}")  # Windows only
     return "✅ Text is being read out. Please listen and read it yourself."
 
+def get_error_type(asr_word, correct_word):
+    # Both words missing or extra
+    if not asr_word:
+        return "Missing word"
+    if not correct_word:
+        return "Extra word"
+    # Spelling error: small Levenshtein
+    if lev_distance(asr_word, correct_word) <= 2:
+        return "Spelling mistake"
+    # Matra/phonetic error: shared chars but wrong form
+    set1, set2 = set(asr_word), set(correct_word)
+    if set1 & set2:
+        return "Phonetic/Matra error"
+    return "Substitution/Distorted"
+
+def compare_hindi_sentences(expected, transcribed):
+    # Split by whitespace for Hindi
+    expected_words = expected.strip().split()
+    transcribed_words = transcribed.strip().split()
+
+    matcher = difflib.SequenceMatcher(None, transcribed_words, expected_words)
+    errors = []
+
+    for opcode, i1, i2, j1, j2 in matcher.get_opcodes():
+        if opcode == "equal":
+            continue
+        elif opcode == "replace":
+            for k in range(max(i2 - i1, j2 - j1)):
+                asr_word = transcribed_words[i1 + k] if i1 + k < i2 else ""
+                correct_word = expected_words[j1 + k] if j1 + k < j2 else ""
+                error_type = get_error_type(asr_word, correct_word)
+                errors.append((asr_word, correct_word, error_type))
+        elif opcode == "insert":
+            for k in range(j1, j2):
+                errors.append(("", expected_words[k], "Missing word"))
+        elif opcode == "delete":
+            for k in range(i1, i2):
+                errors.append((transcribed_words[k], "", "Extra word"))
+    return errors
+
 def transcribe_audio(audio_path, original_text):
     try:
-        # 1. Load audio & convert to mono, 16kHz if needed
+        # 1. Load and pre-process audio
         waveform, sample_rate = torchaudio.load(audio_path)
         if waveform.shape[0] > 1:
             waveform = waveform.mean(dim=0, keepdim=True)
@@ -31,39 +72,37 @@ def transcribe_audio(audio_path, original_text):
         transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
         waveform = transform(waveform)
 
-        # --- Amplify voice intensity here ---
-        GAIN = 1.5  # You can adjust this value (1.0 = unchanged, 2.0 = double)
+        # Amplify voice intensity
+        GAIN = 1.5
         waveform = waveform * GAIN
-        waveform = torch.clamp(waveform, -1.0, 1.0)  # Avoid clipping/distortion
+        waveform = torch.clamp(waveform, -1.0, 1.0)
 
         input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
-
+
         # 2. Transcribe with AI4Bharat model
         with torch.no_grad():
             logits = model(input_values).logits
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.decode(predicted_ids[0])
 
-        # 3. Calculate accuracy etc.
-        original_words = re.findall(r'\w+', original_text.strip())
-        transcribed_words = re.findall(r'\w+', transcription.strip())
-        matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
-        accuracy = round(matcher.ratio() * 100, 2)
+        # 3. Error analysis (as table)
+        errors = compare_hindi_sentences(original_text, transcription)
+        df_errors = pd.DataFrame(errors, columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
 
-        # Speaking speed approximation (needs duration, which torchaudio gives)
+        # Speaking speed
+        transcribed_words = transcription.strip().split()
         duration = waveform.shape[1] / 16000
         speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
 
         result = {
             "📝 Transcribed Text": transcription,
-            "🎯 Accuracy (%)": accuracy,
-            "⏱️ Speaking Speed (words/sec)": speed
+            "⏱️ Speaking Speed (words/sec)": speed,
         }
-        return result
-    except Exception as e:
-        return {"error": str(e)}
+        # Return table as a separate output (Gradio Dataframe)
+        return result, df_errors
 
-    return {"error": str(e)}
+    except Exception as e:
+        return {"error": str(e)}, pd.DataFrame(columns=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"])
 
 with gr.Blocks() as app:
     gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")
@@ -79,7 +118,12 @@ with gr.Blocks() as app:
 
     submit_button = gr.Button("✅ Submit Recording for Checking")
     output = gr.JSON(label="Results")
+    error_table = gr.Dataframe(headers=["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"], label="गलती तालिका (Error Table)")
 
-    submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])
+    submit_button.click(
+        transcribe_audio,
+        inputs=[audio_input, input_text],
+        outputs=[output, error_table]
+    )
 
 app.launch()
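
Reviewer note: the committed os.system(f"start {temp_file.name}") call only works on Windows, as the new inline comment concedes and the deleted comment already hinted. A minimal cross-platform sketch under that same assumption set (afplay ships with macOS; mpg123 must be installed on Linux); play_file is a hypothetical helper, not part of this commit:

import os
import platform
import subprocess

def play_file(path):
    # Hypothetical replacement for os.system(f"start {path}"): dispatch on OS.
    system = platform.system()
    if system == "Windows":
        os.startfile(path)                # opens with the default .mp3 handler
    elif system == "Darwin":
        subprocess.run(["afplay", path])  # built-in macOS audio player
    else:
        subprocess.run(["mpg123", path])  # assumes mpg123 is installed

Since this runs inside a Gradio app, it may be simpler still to return the mp3 path through a gr.Audio component so playback happens in the listener's browser rather than on the server.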
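
For reference, a quick sanity check of the new error classifier; the Hindi words below are illustrative only, assuming get_error_type and compare_hindi_sentences are importable from app.py:

expected = "मुझे किताब पढ़ना पसंद है"
transcribed = "मुझे कितब पढ़ना है"
for asr_word, correct_word, error_type in compare_hindi_sentences(expected, transcribed):
    print(asr_word or "-", "→", correct_word or "-", ":", error_type)
# कितब → किताब : Spelling mistake   (Levenshtein distance 1)
# -    → पसंद  : Missing word       (present in the expected text only)

Note that lev_distance operates on Unicode code points, so a dropped matra (ा in कितब vs किताब) counts as a single edit and lands in the "Spelling mistake" bucket rather than "Phonetic/Matra error".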