Spaces:
Runtime error
Runtime error
File size: 7,638 Bytes
46e19ef edfaf92 d558c26 85d956d 0d0b31f 3b3baf1 0d0b31f 83632fe f975491 7515a2b 2afc1aa 7515a2b 746e430 a893e98 7515a2b 746e430 7515a2b 746e430 7515a2b d558c26 0f433ab a893e98 d558c26 7515a2b 859be22 f634650 859be22 f634650 859be22 f634650 859be22 f634650 37a3491 3b3baf1 61c9f90 a893e98 61c9f90 7515a2b d558c26 3e9568e d0031d1 a893e98 61c9f90 d0031d1 a893e98 61c9f90 d558c26 a893e98 91a2ea1 0f433ab 40cb73b 3e9568e 37a3491 a893e98 37a3491 61c9f90 9586c71 91a2ea1 37a3491 7515a2b a225d4f 7515a2b 0d0b31f a893e98 d558c26 d0031d1 a225d4f 85d956d 290e8e0 79b4e39 3e9568e 290e8e0 2c8cf7f 85d956d d558c26 8ef9310 d558c26 8ef9310 d558c26 a225d4f 58f3405 d558c26 8ef9310 58f3405 0498d1c 8ef9310 257e787 23f5423 d558c26 85d956d 0d0b31f 85d956d 8a9dc3b d558c26 7515a2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import os
import requests
import speech_recognition as sr
import difflib
import gradio as gr
from gtts import gTTS
import io
from pydub import AudioSegment
import time
import eng_to_ipa as ipa
# Ensure the directory for generated pronunciation clips exists.
# exist_ok=True makes this idempotent and race-free (the original
# exists-then-create check could race with another process).
os.makedirs('audio', exist_ok=True)
# Helper: upload a local file to the remote Space so it can be streamed back in an <audio> tag
def upfilepath(local_filename):
    """Upload a local file to the Space's Gradio upload endpoint.

    Args:
        local_filename: Path of the local file to upload.

    Returns:
        The server-side path of the uploaded file on success, ``None`` when
        the server answers with a non-200 status, or a human-readable error
        string on timeout / any other failure. Callers should treat any
        non-path string as a failure.
    """
    ts = time.time()  # unique-ish upload_id per request
    upload_url = f"https://mr2along-speech-recognize.hf.space/gradio_api/upload?upload_id={ts}"
    try:
        # Context manager closes the handle on every path (the original
        # leaked the file descriptor, and raised on a missing file).
        with open(local_filename, 'rb') as fh:
            response = requests.post(upload_url, files={'files': fh}, timeout=30)
        if response.status_code == 200:
            result = response.json()
            # The endpoint returns a JSON list of stored paths; we upload one file.
            extracted_path = result[0]
            return extracted_path
        return None
    except requests.exceptions.Timeout:
        return "Request timed out. Please try again."
    except Exception as e:
        return f"An error occurred: {e}"
# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    """Transcribe an audio file with Google Speech Recognition.

    Non-WAV inputs are converted to WAV via pydub first.

    Args:
        audio: Path to the recorded audio file (or None).

    Returns:
        The transcription string on success, otherwise a human-readable
        error message string (the function never raises to the caller).
    """
    if audio is None:
        return "No audio file provided."
    recognizer = sr.Recognizer()
    if not os.path.isfile(audio):
        return "Audio file not found."
    # Split on the real extension. The original used
    # audio.replace(audio_format, 'wav'), which rewrites the FIRST occurrence
    # of the extension text anywhere in the path (e.g. "mp3s/clip.mp3" ->
    # "wavs/clip.mp3"), producing a wrong output path.
    base, ext = os.path.splitext(audio)
    if ext.lower() != '.wav':
        try:
            audio_segment = AudioSegment.from_file(audio)
            wav_path = base + '.wav'
            audio_segment.export(wav_path, format='wav')
            audio = wav_path
        except Exception as e:
            return f"Error converting audio: {e}"
    audio_file = sr.AudioFile(audio)
    with audio_file as source:
        audio_data = recognizer.record(source)
    try:
        transcription = recognizer.recognize_google(audio_data)
        return transcription
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio."
    except sr.RequestError as e:
        return f"Error with Google Speech Recognition service: {e}"
# Function to get IPA transcription
def ipa_transcription(sentence):
    """Convert an English sentence to IPA notation.

    Returns the IPA string, or an error-message string when conversion fails.
    """
    try:
        return ipa.convert(sentence)
    except Exception as e:
        return f"Error during IPA transcription: {e}"
# Step 2: Create pronunciation audio for incorrect words (locally)
def create_pronunciation_audio(word):
    """Generate a gTTS pronunciation MP3 for *word* under audio/.

    Returns the local file path on success; on failure, an error-message
    string is returned instead of a path.
    """
    audio_file_path = f"audio/{word}.mp3"
    try:
        gTTS(word).save(audio_file_path)
        # Local path only — uploading to the Space happens in compare_texts.
        return audio_file_path
    except Exception as e:
        return f"Failed to create pronunciation audio: {e}"
# Step 3: Compare the transcribed text with the input paragraph
def compare_texts(reference_text, transcribed_text):
    """Compare a reference paragraph with a transcription and build a report.

    Args:
        reference_text: The paragraph the speaker was supposed to read.
        transcribed_text: What the speech recognizer heard.

    Returns:
        ``[html_output, word_score_list]`` where html_output is a styled
        HTML summary and word_score_list holds one
        ``{"quality_score": int, "word": str}`` dict per reference word
        (100 exact match, 80 close match, 0 wrong/missing).
    """
    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()
    incorrect_words_audios = []  # (word, local audio path) for mispronounced words
    word_score_list = []  # per-word quality scores

    # Character-level similarity of the full strings, as a 0-100 percentage.
    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(sm.ratio() * 100, 2)

    # Overall fidelity banner.
    html_output = "<strong>Fidelity Class:</strong> "
    if similarity_score >= 85:
        html_output += "<strong>GOOD (>=85%)</strong><br>"
    elif similarity_score >= 70:
        html_output += "<strong>ACCEPTABLE (70% - 85%)</strong><br>"
    elif similarity_score >= 50:
        html_output += "<strong>NEEDS IMPROVEMENT (50% - 70%)</strong><br>"
    else:
        html_output += "<strong>POOR (<50%)</strong><br>"
    html_output += f"<strong>Quality Score:</strong> {similarity_score}%<br>"
    html_output += f"<strong>Transcribed Text:</strong> {transcribed_text}<br>"
    html_output += f"<strong>IPA Transcription:</strong> {ipa_transcription(reference_text)}<br>"
    html_output += "<strong>Word Score List:</strong><br>"

    # Colour each reference word by how well it matched positionally.
    # NOTE: the original wrapped this body in try/except IndexError, but every
    # index is guarded by `i < len(transcribed_words)`, so the handler was
    # unreachable and has been removed.
    for i, word in enumerate(reference_words):
        if i < len(transcribed_words) and word.lower() == transcribed_words[i].lower():
            word_score_list.append({"quality_score": 100, "word": word})
            html_output += f'<span style="color: green;">{word}</span> '  # exact match
        elif i < len(transcribed_words) and difflib.get_close_matches(
                word.lower(), [transcribed_words[i].lower()]):
            # Case-folded fuzzy match; the original compared case-sensitively
            # here, inconsistent with the exact-match branch above.
            word_score_list.append({"quality_score": 80, "word": word})
            html_output += f'<span style="color: yellow;">{word}</span> '  # close match
        else:
            word_score_list.append({"quality_score": 0, "word": word})
            html_output += f'<span style="color: red;">{word}</span> '  # wrong/missing
            # Generate a pronunciation clip the learner can play back.
            audio_file_path = create_pronunciation_audio(word)
            incorrect_words_audios.append((word, audio_file_path))

    # Embed playable audio (uploaded to the Space) for every incorrect word.
    if incorrect_words_audios:
        html_output += "<br><strong>Pronunciation for Incorrect Words:</strong><br>"
        for word, audio in incorrect_words_audios:
            suggestion = difflib.get_close_matches(word, reference_words, n=1)
            suggestion_text = f" (Did you mean: <em>{suggestion[0]}</em>?)" if suggestion else ""
            up_audio = upfilepath(audio)  # upload so the <audio> tag can stream it
            audio_src = f"https://mr2along-speech-recognize.hf.space/gradio_api/file={up_audio}"
            html_output += f'{word}: '
            html_output += f'<audio controls><source src="{audio_src}" type="audio/mpeg">Your browser does not support the audio tag.</audio>{suggestion_text}<br>'

    return [html_output, word_score_list]
# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Render *paragraph* to speech and return the saved MP3's file path.

    Returns None for empty input so the Gradio audio output stays blank.
    """
    if not paragraph:
        return None
    audio_file_path = "audio/paragraph.mp3"
    gTTS(paragraph).save(audio_file_path)
    return audio_file_path
# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Gradio handler: transcribe the recording and compare it to *paragraph*.

    Args:
        paragraph: Reference text the user was asked to read.
        audio: Filepath of the recorded audio from gr.Audio.

    Returns:
        The HTML comparison report (a single value). compare_texts also
        produces a word-score list, but this interface declares exactly one
        "html" output; returning the original two-element list made Gradio
        fail with an outputs/values mismatch at runtime.
    """
    transcribed_text = transcribe_audio(audio)
    html_report, _word_scores = compare_texts(paragraph, transcribed_text)
    return html_report
# Gradio Interface using the updated API
# Tab 1: record speech, transcribe it, and grade it against a reference paragraph.
# The single "html" output renders the report produced via compare_texts.
interface = gr.Interface(
fn=gradio_function,
inputs=[
gr.Textbox(lines=5, label="Input Paragraph"),
gr.Audio(type="filepath", label="Record Audio")
],
outputs=["html"],
title="Speech Recognition Comparison",
description="Input a paragraph, record your audio, and compare the transcription to the original text."
)
# Tab 2: read an arbitrary paragraph aloud with gTTS so users can hear a model pronunciation.
tts_interface = gr.Interface(
fn=text_to_speech,
inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
outputs=gr.Audio(label="Text-to-Speech Output"),
title="Text-to-Speech",
description="This tool will read your input paragraph aloud."
)
# Combine both tools into a tabbed app; launch() blocks and serves the UI
# (on Hugging Face Spaces the platform supplies host/port defaults).
demo = gr.TabbedInterface([interface, tts_interface], ["Speech Recognition", "Text-to-Speech"])
demo.launch()