NLPV committed on
Commit
81a3a36
·
verified ·
1 Parent(s): 5bac4c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -28
app.py CHANGED
@@ -1,40 +1,52 @@
1
  import gradio as gr
2
  from gtts import gTTS
3
- import time
4
- import difflib
5
  import tempfile
6
  import os
7
- import speech_recognition as sr
8
- from faster_whisper import WhisperModel
 
 
 
 
 
 
 
 
9
 
10
- # Function to play the text (optional)
11
  def play_text(text):
12
  tts = gTTS(text=text, lang='hi', slow=False)
13
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
14
  tts.save(temp_file.name)
15
- os.system(f"start {temp_file.name}") # Windows
 
16
  return "✅ Text is being read out. Please listen and read it yourself."
17
 
18
- # Load model once (outside function for efficiency)
19
- model = WhisperModel("small", compute_type="float32") # Or "medium" for better accuracy
20
-
21
- def transcribe_audio(audio, original_text):
22
  try:
23
- # Run inference
24
- segments, info = model.transcribe(audio, language='hi')
 
 
 
 
 
 
25
 
26
- transcription = " ".join([segment.text for segment in segments])
 
 
 
 
27
 
28
- # Clean and split the text better
29
- import re
30
  original_words = re.findall(r'\w+', original_text.strip())
31
  transcribed_words = re.findall(r'\w+', transcription.strip())
32
-
33
  matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
34
  accuracy = round(matcher.ratio() * 100, 2)
35
 
36
- # Speaking speed (approximate)
37
- speed = round(len(transcribed_words) / info.duration, 2)
 
38
 
39
  result = {
40
  "📝 Transcribed Text": transcription,
@@ -45,11 +57,9 @@ def transcribe_audio(audio, original_text):
45
  except Exception as e:
46
  return {"error": str(e)}
47
 
48
-
49
- # Gradio App
50
  with gr.Blocks() as app:
51
- gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App")
52
-
53
  with gr.Row():
54
  input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
55
  play_button = gr.Button("🔊 Listen to Text")
@@ -58,14 +68,10 @@ with gr.Blocks() as app:
58
 
59
  gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
60
  audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
61
-
62
  submit_button = gr.Button("✅ Submit Recording for Checking")
63
-
64
  output = gr.JSON(label="Results")
65
-
66
  submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])
67
 
68
- # Launch the app
69
  app.launch()
70
-
71
-
 
1
  import gradio as gr
2
  from gtts import gTTS
 
 
3
  import tempfile
4
  import os
5
+ import difflib
6
+ import torch
7
+ import re
8
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
9
+ import torchaudio
10
+
11
# Load the AI4Bharat IndicWav2Vec Hindi ASR model and its processor once at
# module import time, so every transcription request reuses the same weights
# instead of reloading them per call.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)  # feature extractor + tokenizer
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)  # CTC acoustic model (CPU inference here)
15
 
 
16
def play_text(text):
    """Synthesize *text* as Hindi speech with gTTS and play it on the host machine.

    Args:
        text: Hindi text to be read aloud.

    Returns:
        A status string for display in the Gradio UI.
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    # delete=False so the external player can still open the file after gTTS
    # has finished writing it; suffix keeps players happy about the format.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    tts.save(temp_file.name)
    # Pick a player per platform instead of the Windows-only `os.system("start ...")`,
    # which also misparses paths containing spaces (quoted arg becomes the window title).
    import sys
    import subprocess
    if sys.platform.startswith('win'):
        os.startfile(temp_file.name)  # opens with the default .mp3 handler, no shell parsing
    elif sys.platform == 'darwin':
        subprocess.Popen(['afplay', temp_file.name])  # list form: no shell injection
    else:
        subprocess.Popen(['mpg123', temp_file.name])  # common Linux CLI player
    return "✅ Text is being read out. Please listen and read it yourself."
23
 
24
+ def transcribe_audio(audio_path, original_text):
 
 
 
25
  try:
26
+ # 1. Load audio & convert to mono, 16kHz if needed
27
+ waveform, sample_rate = torchaudio.load(audio_path)
28
+ if waveform.shape[0] > 1:
29
+ waveform = waveform.mean(dim=0, keepdim=True)
30
+ if sample_rate != 16000:
31
+ transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
32
+ waveform = transform(waveform)
33
+ input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
34
 
35
+ # 2. Transcribe with AI4Bharat model
36
+ with torch.no_grad():
37
+ logits = model(input_values).logits
38
+ predicted_ids = torch.argmax(logits, dim=-1)
39
+ transcription = processor.decode(predicted_ids[0])
40
 
41
+ # 3. Calculate accuracy etc.
 
42
  original_words = re.findall(r'\w+', original_text.strip())
43
  transcribed_words = re.findall(r'\w+', transcription.strip())
 
44
  matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
45
  accuracy = round(matcher.ratio() * 100, 2)
46
 
47
+ # Speaking speed approximation (needs duration, which torchaudio gives)
48
+ duration = waveform.shape[1] / 16000
49
+ speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
50
 
51
  result = {
52
  "📝 Transcribed Text": transcription,
 
57
  except Exception as e:
58
  return {"error": str(e)}
59
 
 
 
60
  with gr.Blocks() as app:
61
+ gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")
62
+
63
  with gr.Row():
64
  input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
65
  play_button = gr.Button("🔊 Listen to Text")
 
68
 
69
  gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
70
  audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
71
+
72
  submit_button = gr.Button("✅ Submit Recording for Checking")
 
73
  output = gr.JSON(label="Results")
74
+
75
  submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])
76
 
 
77
  app.launch()