NLPV committed on
Commit
81a3a36
·
verified ·
1 Parent(s): 5bac4c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -28
app.py CHANGED
@@ -1,40 +1,52 @@
1
  import gradio as gr
2
  from gtts import gTTS
3
- import time
4
- import difflib
5
  import tempfile
6
  import os
7
- import speech_recognition as sr
8
- from faster_whisper import WhisperModel
 
 
 
 
 
 
 
 
9
 
10
- # Function to play the text (optional)
11
  def play_text(text):
12
  tts = gTTS(text=text, lang='hi', slow=False)
13
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
14
  tts.save(temp_file.name)
15
- os.system(f"start {temp_file.name}") # Windows
 
16
  return "✅ Text is being read out. Please listen and read it yourself."
17
 
18
- # Load model once (outside function for efficiency)
19
- model = WhisperModel("small", compute_type="float32") # Or "medium" for better accuracy
20
-
21
- def transcribe_audio(audio, original_text):
22
  try:
23
- # Run inference
24
- segments, info = model.transcribe(audio, language='hi')
 
 
 
 
 
 
25
 
26
- transcription = " ".join([segment.text for segment in segments])
 
 
 
 
27
 
28
- # Clean and split the text better
29
- import re
30
  original_words = re.findall(r'\w+', original_text.strip())
31
  transcribed_words = re.findall(r'\w+', transcription.strip())
32
-
33
  matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
34
  accuracy = round(matcher.ratio() * 100, 2)
35
 
36
- # Speaking speed (approximate)
37
- speed = round(len(transcribed_words) / info.duration, 2)
 
38
 
39
  result = {
40
  "📝 Transcribed Text": transcription,
@@ -45,11 +57,9 @@ def transcribe_audio(audio, original_text):
45
  except Exception as e:
46
  return {"error": str(e)}
47
 
48
-
49
- # Gradio App
50
  with gr.Blocks() as app:
51
- gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App")
52
-
53
  with gr.Row():
54
  input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
55
  play_button = gr.Button("🔊 Listen to Text")
@@ -58,14 +68,10 @@ with gr.Blocks() as app:
58
 
59
  gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
60
  audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
61
-
62
  submit_button = gr.Button("✅ Submit Recording for Checking")
63
-
64
  output = gr.JSON(label="Results")
65
-
66
  submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])
67
 
68
- # Launch the app
69
  app.launch()
70
-
71
-
 
1
  import gradio as gr
2
  from gtts import gTTS
 
 
3
  import tempfile
4
  import os
5
+ import difflib
6
+ import torch
7
+ import re
8
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
9
+ import torchaudio
10
+
11
# Load the AI4Bharat IndicWav2Vec Hindi ASR model and its processor once at
# module import time, so every transcription request reuses the same weights
# instead of reloading them per call.
MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)  # feature extractor + tokenizer
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)  # CTC acoustic model (CPU inference here)
15
 
 
16
def play_text(text):
    """Synthesize *text* as Hindi speech with gTTS and play it on the host machine.

    Args:
        text: Hindi text to be read aloud.

    Returns:
        A status string for display in the Gradio UI.
    """
    tts = gTTS(text=text, lang='hi', slow=False)
    # delete=False so the external player can still open the file after gTTS
    # has finished writing it; suffix keeps players happy about the format.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    tts.save(temp_file.name)
    # Pick a player per platform instead of the Windows-only `os.system("start ...")`,
    # which also misparses paths containing spaces (quoted arg becomes the window title).
    import sys
    import subprocess
    if sys.platform.startswith('win'):
        os.startfile(temp_file.name)  # opens with the default .mp3 handler, no shell parsing
    elif sys.platform == 'darwin':
        subprocess.Popen(['afplay', temp_file.name])  # list form: no shell injection
    else:
        subprocess.Popen(['mpg123', temp_file.name])  # common Linux CLI player
    return "✅ Text is being read out. Please listen and read it yourself."
23
 
24
+ def transcribe_audio(audio_path, original_text):
 
 
 
25
  try:
26
+ # 1. Load audio & convert to mono, 16kHz if needed
27
+ waveform, sample_rate = torchaudio.load(audio_path)
28
+ if waveform.shape[0] > 1:
29
+ waveform = waveform.mean(dim=0, keepdim=True)
30
+ if sample_rate != 16000:
31
+ transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
32
+ waveform = transform(waveform)
33
+ input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
34
 
35
+ # 2. Transcribe with AI4Bharat model
36
+ with torch.no_grad():
37
+ logits = model(input_values).logits
38
+ predicted_ids = torch.argmax(logits, dim=-1)
39
+ transcription = processor.decode(predicted_ids[0])
40
 
41
+ # 3. Calculate accuracy etc.
 
42
  original_words = re.findall(r'\w+', original_text.strip())
43
  transcribed_words = re.findall(r'\w+', transcription.strip())
 
44
  matcher = difflib.SequenceMatcher(None, original_words, transcribed_words)
45
  accuracy = round(matcher.ratio() * 100, 2)
46
 
47
+ # Speaking speed approximation (needs duration, which torchaudio gives)
48
+ duration = waveform.shape[1] / 16000
49
+ speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
50
 
51
  result = {
52
  "📝 Transcribed Text": transcription,
 
57
  except Exception as e:
58
  return {"error": str(e)}
59
 
 
 
60
  with gr.Blocks() as app:
61
+ gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")
62
+
63
  with gr.Row():
64
  input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
65
  play_button = gr.Button("🔊 Listen to Text")
 
68
 
69
  gr.Markdown("### 🎤 Now upload or record yourself reading the text aloud below:")
70
  audio_input = gr.Audio(type="filepath", label="Upload or Record Your Voice")
71
+
72
  submit_button = gr.Button("✅ Submit Recording for Checking")
 
73
  output = gr.JSON(label="Results")
74
+
75
  submit_button.click(transcribe_audio, inputs=[audio_input, input_text], outputs=[output])
76
 
 
77
  app.launch()