Spaces:

NLPV
/

ReadabilityTest

Sleeping

App Files Files Community

NLPV committed on Jul 14

Commit

e3afeb6

verified ·

1 Parent(s): 89b33e7

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -25

app.py CHANGED Viewed

@@ -1,24 +1,18 @@
 import gradio as gr
 from gtts import gTTS
 import tempfile
-import os
-import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-import torchaudio
 import difflib
 import pandas as pd
 from Levenshtein import distance as lev_distance
-# Load AI4Bharat Hindi model & processor
-MODEL_NAME = "ai4bharat/indicwav2vec-hindi"
-processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
 def play_text(text):
     """Convert Hindi ``text`` to speech and return the path of the MP3 file.

     Args:
         text: Hindi text to synthesize.

     Returns:
         Path of a temporary ``.mp3`` file containing the speech, or ``None``
         when ``text`` is empty or whitespace-only. The file is created with
         ``delete=False`` and therefore remains on disk after the call.
     """
     # With nothing to speak, return early rather than passing an empty string
     # to gTTS (NOTE(review): gTTS is expected to reject empty text - confirm).
     if not text or not text.strip():
         return None
     tts = gTTS(text=text, lang='hi', slow=False)
     # Only the unique file name is needed; close our own handle before gTTS
     # writes to that path so no file descriptor is leaked per call.
     with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
         mp3_path = temp_file.name
     tts.save(mp3_path)
     # The returned path is presumably consumed by a Gradio audio output.
     return mp3_path
 def get_error_type(asr_word, correct_word):
@@ -55,37 +49,43 @@ def compare_hindi_sentences(expected, transcribed):
                 errors.append((transcribed_words[k], "", "Extra word"))
     return errors
 def transcribe_audio(audio_path, original_text):
     """Transcribe a recording with the wav2vec2 model and compare it to the text.

     Args:
         audio_path: Path of the audio file to transcribe.
         original_text: The reference text the speaker was asked to read.

     Returns:
         A ``(summary, errors)`` tuple. ``summary`` is a dict holding the
         transcription and the speaking speed in words per second, or a single
         ``"error"`` entry when something went wrong. ``errors`` is a DataFrame
         built from ``compare_hindi_sentences`` (empty on failure).
     """
     error_columns = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]
     # No recording supplied: report it clearly instead of letting the audio
     # loader fail with a library-specific message.
     if not audio_path:
         return (
             {"error": "No audio provided. Please record or upload audio first."},
             pd.DataFrame(columns=error_columns),
         )
     try:
         waveform, sample_rate = torchaudio.load(audio_path)
         # Average over the first axis when it has more than one entry
         # (presumably the channel axis, i.e. a stereo -> mono down-mix).
         if waveform.shape[0] > 1:
             waveform = waveform.mean(dim=0, keepdim=True)
         # The processor below is called with sampling_rate=16000.
         if sample_rate != 16000:
             transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
             waveform = transform(waveform)
         # Peak-normalize. An all-zero (silent) recording has peak 0, and
         # dividing by it would fill the waveform with NaN, so leave it as is.
         peak = waveform.abs().max()
         if peak > 0:
             waveform = waveform / peak
         input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values
         with torch.no_grad():
             logits = model(input_values).logits
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.decode(predicted_ids[0])
         # Error analysis
         errors = compare_hindi_sentences(original_text, transcription)
         df_errors = pd.DataFrame(errors, columns=error_columns)
         # Speaking speed: words per second over the (resampled) clip length.
         transcribed_words = transcription.strip().split()
         duration = waveform.shape[1] / 16000
         speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
         result = {
             "📝 Transcribed Text": transcription,
             "⏱️ Speaking Speed (words/sec)": speed,
         }
         return result, df_errors
     except Exception as e:
         # UI boundary: surface the failure as data rather than crashing.
         return {"error": str(e)}, pd.DataFrame(columns=error_columns)
 with gr.Blocks() as app:
-    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (AI4Bharat Model)")
     with gr.Row():
         input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
         play_button = gr.Button("🔊 Listen to Text")

 import gradio as gr
 from gtts import gTTS
 import tempfile
 import difflib
 import pandas as pd
 from Levenshtein import distance as lev_distance
+import whisper
+# Load Whisper model once (choose "small" or "medium" for better results)
+model = whisper.load_model("small")
 def play_text(text):
     """Synthesize Hindi speech for ``text`` and return the path to an MP3 file.

     Args:
         text: The Hindi text to read aloud.

     Returns:
         Path of a temporary ``.mp3`` file holding the synthesized speech, or
         ``None`` when ``text`` is empty or only whitespace. The file is created
         with ``delete=False``, so it stays on disk after this function returns.
     """
     # Nothing to speak: skip synthesis instead of handing gTTS an empty string
     # (NOTE(review): gTTS is expected to reject empty text - confirm).
     if not text or not text.strip():
         return None
     tts = gTTS(text=text, lang='hi', slow=False)
     # Reserve a unique path, then close our handle before gTTS writes to it;
     # leaving the handle open leaks a file descriptor on every call.
     with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_file:
         mp3_path = temp_file.name
     tts.save(mp3_path)
     return mp3_path
 def get_error_type(asr_word, correct_word):
                 errors.append((transcribed_words[k], "", "Extra word"))
     return errors
def calculate_accuracy(expected, transcribed):
    """Return the percentage of expected words matched by the transcription.

    Both strings are split on whitespace and aligned with
    ``difflib.SequenceMatcher``; every expected word that falls inside a
    matching run counts as correct.

    Args:
        expected: The reference text.
        transcribed: The text produced by speech recognition.

    Returns:
        The share of expected words that were matched, as a percentage
        rounded to two decimals, or ``0`` when ``expected`` has no words.
    """
    reference = expected.strip().split()
    hypothesis = transcribed.strip().split()
    aligner = difflib.SequenceMatcher(None, hypothesis, reference)
    # Each matching block covers `size` consecutive words present in both
    # sequences; the terminating sentinel block has size 0.
    matched = sum(block.size for block in aligner.get_matching_blocks())
    if not reference:
        return 0
    return round(matched / len(reference) * 100, 2)
 def transcribe_audio(audio_path, original_text):
     """Transcribe a recording with Whisper and score it against the text.

     Args:
         audio_path: Path of the audio file to transcribe.
         original_text: The reference text the speaker was asked to read.

     Returns:
         A ``(summary, errors)`` tuple. ``summary`` is a dict holding the
         transcription, the speaking speed in words per second and the reading
         accuracy in percent, or a single ``"error"`` entry when something went
         wrong. ``errors`` is a DataFrame built from
         ``compare_hindi_sentences`` (empty on failure).
     """
     error_columns = ["बिगड़ा हुआ शब्द", "संभावित सही शब्द", "गलती का प्रकार"]
     # No recording supplied: report it clearly instead of letting the
     # transcription call fail with a library-specific message.
     if not audio_path:
         return (
             {"error": "No audio provided. Please record or upload audio first."},
             pd.DataFrame(columns=error_columns),
         )
     try:
         # Use Whisper for transcription
         result = model.transcribe(audio_path, language='hi')
         transcription = result['text'].strip()
         # Error analysis
         errors = compare_hindi_sentences(original_text, transcription)
         df_errors = pd.DataFrame(errors, columns=error_columns)
         # Speaking speed: the end time of the last segment is used as the
         # spoken duration; fall back to 1.0 s when no segments are reported.
         transcribed_words = transcription.split()
         segments = result.get('segments')
         duration = segments[-1]['end'] if segments else 1.0
         speed = round(len(transcribed_words) / duration, 2) if duration > 0 else 0
         # Accuracy
         accuracy = calculate_accuracy(original_text, transcription)
         result_dict = {
             "📝 Transcribed Text": transcription,
             "⏱️ Speaking Speed (words/sec)": speed,
             "✅ Reading Accuracy (%)": accuracy,
         }
         return result_dict, df_errors
     except Exception as e:
         # UI boundary: surface the failure as data rather than crashing.
         return {"error": str(e)}, pd.DataFrame(columns=error_columns)
 with gr.Blocks() as app:
+    gr.Markdown("## 🗣️ Hindi Reading & Pronunciation Practice App (OpenAI Whisper)")
     with gr.Row():
         input_text = gr.Textbox(label="Paste Hindi Text Here", placeholder="यहाँ हिंदी टेक्स्ट लिखें...")
         play_button = gr.Button("🔊 Listen to Text")