speech_recognize

Runtime error

App Files Files Community

speech_recognize / app.py

mr2along

Update app.py

a225d4f verified 16 days ago

raw

history blame

7.7 kB

	import os
	import requests
	import speech_recognition as sr
	import difflib
	import gradio as gr
	from gtts import gTTS
	import io
	from pydub import AudioSegment
	import time
	import pronouncing
	import epitran

	# Make sure the directory for generated audio exists. exist_ok=True avoids
	# the check-then-create race of a separate os.path.exists() test.
	os.makedirs('audio', exist_ok=True)

	# Initialize the epitran object for English. `epi` is always bound, so a
	# failed initialization leaves None behind instead of an undefined name.
	try:
	    epi = epitran.Epitran('eng-Latn')
	except Exception as e:
	    print(f"Error initializing Epitran: {e}")
	    epi = None

	# Step 1: Transcribe the audio file
	def transcribe_audio(audio):
	    """Transcribe an audio file to text with Google Speech Recognition.

	    Input that does not end in ".wav" (case-insensitive) is first converted
	    to a WAV file stored next to the original.

	    Args:
	        audio: Path to the audio file, or None when nothing was provided.

	    Returns:
	        The transcription on success; otherwise a human-readable message
	        describing why the file could not be found, converted or
	        recognized.
	    """
	    if audio is None:
	        return "No audio file provided."

	    # Check if the file exists
	    if not os.path.isfile(audio):
	        return "Audio file not found."

	    # Swap only the real extension. A plain str.replace() on the extension
	    # text also rewrites matching directory names ("/tmp/mp3/a.mp3") and
	    # mangles paths that have no extension at all.
	    root, extension = os.path.splitext(audio)
	    if extension.lower() != '.wav':
	        wav_path = root + '.wav'
	        try:
	            AudioSegment.from_file(audio).export(wav_path, format='wav')
	        except Exception as e:
	            return f"Error converting audio: {e}"
	        audio = wav_path

	    # Created only after the cheap validation above has passed.
	    recognizer = sr.Recognizer()
	    with sr.AudioFile(audio) as source:
	        audio_data = recognizer.record(source)

	    try:
	        return recognizer.recognize_google(audio_data)
	    except sr.UnknownValueError:
	        return "Google Speech Recognition could not understand the audio"
	    except sr.RequestError as e:
	        return f"Error with Google Speech Recognition service: {e}"

	# Step 2: Create pronunciation audio for incorrect words (locally)
	def create_pronunciation_audio(word):
	    """Synthesize `word` with gTTS and save it as an MP3 under audio/.

	    Args:
	        word: The word to pronounce.

	    Returns:
	        The local path of the saved MP3, or a message starting with
	        "Failed to create pronunciation audio:" if anything raised.
	    """
	    try:
	        # The word comes from user-typed text, so only filename-safe
	        # characters go into the path; otherwise "and/or" or "../x" would
	        # point outside audio/. The spoken text is still the original word.
	        safe_name = ''.join(
	            ch if ch.isalnum() or ch in '-_' else '_' for ch in word
	        ) or '_'
	        audio_file_path = f"audio/{safe_name}.mp3"
	        gTTS(word).save(audio_file_path)
	        return audio_file_path  # Return the local path instead of uploading
	    except Exception as e:
	        return f"Failed to create pronunciation audio: {e}"

	# Function for phonetic respelling
	def phonetic_respelling(sentence):
	    """Return a phoneme-based respelling of `sentence`.

	    Each whitespace-separated word is looked up with
	    pronouncing.phones_for_word(); a word with a pronunciation is replaced
	    by the first one listed, any other word is kept exactly as typed.

	    Args:
	        sentence: The text to respell.

	    Returns:
	        The respelled words joined by single spaces.
	    """
	    edge_punctuation = '.,;:!?"\'()[]{}'
	    respelled = []

	    for word in sentence.split():
	        # Look up the word itself. pronouncing.search() matches a regular
	        # expression against phoneme strings, so feeding it a spelled word
	        # rarely finds the intended entry and can fail on punctuation.
	        lookup_key = word.strip(edge_punctuation).lower()
	        phones = pronouncing.phones_for_word(lookup_key) if lookup_key else []
	        respelled.append(phones[0] if phones else word)

	    respelling = ' '.join(respelled)

	    # Drop stress/length marks and map a few IPA vowels to plain letters
	    # (sample conversions), in a single pass.
	    cleanup = str.maketrans({
	        'ˈ': '', 'ˌ': '', 'ː': '',
	        'ɑ': 'a', 'ə': 'uh', 'ɪ': 'i', 'ʊ': 'u',
	    })
	    return respelling.translate(cleanup)

	# Function for IPA transcription
	def ipa_transcription(sentence):
	    """Transliterate `sentence` with the module-level Epitran object.

	    Any exception is printed and replaced by a fixed failure message, so
	    this function always returns a string.
	    """
	    try:
	        ipa_text = epi.transliterate(sentence)
	    except Exception as err:
	        print(f"Error during IPA transcription: {err}")
	        return "IPA transcription failed."
	    return ipa_text

	# Step 3: Compare the transcribed text with the input paragraph
	def compare_texts(reference_text, transcribed_text):
	    """Build an HTML report comparing the reference text with a transcription.

	    The report contains a fidelity class and score (character-level
	    SequenceMatcher ratio of the two strings), both texts, a phonetic
	    respelling and IPA transcription of the reference, a color-coded word
	    list (words are compared position by position), and audio players for
	    words judged incorrect.

	    Args:
	        reference_text: The text the speaker was supposed to read.
	        transcribed_text: The text produced by speech recognition.

	    Returns:
	        A one-element list holding the HTML string.
	    """
	    # Local imports keep this block self-contained.
	    import html
	    from urllib.parse import quote

	    reference_words = reference_text.split()
	    transcribed_words = transcribed_text.split()
	    incorrect_words_audios = []  # Store audio paths for incorrect words

	    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
	    similarity_score = round(sm.ratio() * 100, 2)

	    # Fidelity labels use HTML entities so "<50%" is never read as a tag.
	    if similarity_score >= 85:
	        fidelity = "GOOD (&gt;=85%)"
	    elif similarity_score >= 70:
	        fidelity = "ACCEPTABLE (70% - 85%)"
	    elif similarity_score >= 50:
	        fidelity = "NEEDS IMPROVEMENT (50% - 70%)"
	    else:
	        fidelity = "POOR (&lt;50%)"

	    # Both texts are user-controlled, so everything derived from them is
	    # escaped before it is placed in the HTML.
	    parts = [
	        f"<strong>Fidelity Class:</strong> <strong>{fidelity}</strong><br>",
	        f"<strong>Quality Score:</strong> {similarity_score}%<br>",
	        f"<strong>Transcribed Text:</strong> {html.escape(transcribed_text)}<br>",
	        f"<strong>Input Sentence:</strong> {html.escape(reference_text)}<br>",
	        f"<strong>Phonetic Respelling:</strong> {html.escape(phonetic_respelling(reference_text))}<br>",
	        f"<strong>IPA Transcription:</strong> {html.escape(ipa_transcription(reference_text))}<br>",
	        "<strong>Word Score List:</strong><br>",
	    ]

	    # Generate colored word score list
	    for i, word in enumerate(reference_words):
	        shown = html.escape(word)
	        spoken = transcribed_words[i] if i < len(transcribed_words) else None
	        if spoken is None:
	            # Word in reference that was not transcribed
	            parts.append(f'<span style="color: red;">{shown}</span> ')
	        elif word.lower() == spoken.lower():
	            # Correct words in green
	            parts.append(f'<span style="color: green;">{shown}</span> ')
	        elif difflib.get_close_matches(word, [spoken]):
	            # Close matches in yellow
	            parts.append(f'<span style="color: yellow;">{shown}</span> ')
	        else:
	            # Incorrect words in red, with pronunciation audio
	            parts.append(f'<span style="color: red;">{shown}</span> ')
	            audio_file_path = create_pronunciation_audio(word)
	            incorrect_words_audios.append((word, audio_file_path))

	    # Provide audio for incorrect words
	    if incorrect_words_audios:
	        parts.append("<br><strong>Pronunciation for Incorrect Words:</strong><br>")
	        for word, audio in incorrect_words_audios:
	            suggestion = difflib.get_close_matches(word, reference_words, n=1)
	            suggestion_text = (
	                f" (Did you mean: <em>{html.escape(suggestion[0])}</em>?)"
	                if suggestion else ""
	            )
	            shown = html.escape(word)
	            if not os.path.isfile(audio):
	                # create_pronunciation_audio returns an error message, not
	                # a path, when synthesis fails; never use that as a URL.
	                parts.append(f'{shown}: (audio unavailable){suggestion_text}<br>')
	                continue
	            # The local path is served directly: the previous code passed it
	            # through `upfilepath`, which is not defined in this file and
	            # raised NameError.
	            # NOTE(review): assumes the app serves files under audio/ at
	            # this route - confirm against the deployed Gradio version.
	            audio_src = f"https://mr2along-speech-recognize.hf.space/gradio_api/file={quote(audio)}"
	            parts.append(f'{shown}: ')
	            parts.append(
	                f'<audio controls><source src="{audio_src}" type="audio/mpeg">'
	                f'Your browser does not support the audio tag.</audio>{suggestion_text}<br>'
	            )

	    html_output = ''.join(parts)
	    return [html_output]

	# Step 4: Text-to-Speech Function
	def text_to_speech(paragraph):
	    """Read `paragraph` aloud with gTTS and return the saved MP3 path.

	    Returns None when no text is given; otherwise the audio is written to
	    "audio/paragraph.mp3" and that path is returned.
	    """
	    if paragraph:
	        speech = gTTS(paragraph)
	        output_path = "audio/paragraph.mp3"  # Fixed output location
	        speech.save(output_path)
	        return output_path
	    return None  # Nothing to read

	# Gradio Interface Function
	def gradio_function(paragraph, audio):
	    """Transcribe `audio` and return its comparison against `paragraph`.

	    The result is whatever compare_texts() returns for the paragraph and
	    the transcription produced by transcribe_audio().
	    """
	    return compare_texts(paragraph, transcribe_audio(audio))

	# Gradio Interface using the updated API
	# Tab 1: the user supplies a reference paragraph and an audio recording;
	# gradio_function receives the paragraph text and the audio as a file path
	# (type="filepath") and its result is rendered by an HTML output.
	interface = gr.Interface(
	fn=gradio_function,
	inputs=[
	gr.Textbox(lines=5, label="Input Paragraph"),
	gr.Audio(type="filepath", label="Record Audio")
	],
	outputs=["html"],
	title="Speech Recognition Comparison",
	description="Input a paragraph, record your audio, and compare the transcription to the original text."
	)

	# Gradio Interface for Text-to-Speech
	# Tab 2: text_to_speech returns a path to an MP3 file (or None for empty
	# input), which is handed to the Audio output component.
	tts_interface = gr.Interface(
	fn=text_to_speech,
	inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
	outputs=gr.Audio(label="Text-to-Speech Output"),
	title="Text-to-Speech",
	description="This tool will read your input paragraph aloud."
	)

	# Combine both interfaces into one
	# The second list gives the tab titles, in the same order as the interfaces.
	demo = gr.TabbedInterface([interface, tts_interface], ["Speech Recognition", "Text-to-Speech"])

	# Launch Gradio app
	# Runs at import time: there is no `if __name__ == "__main__"` guard.
	demo.launch()