import difflib
import html
import io
import os
import time

import gradio as gr
import requests
import speech_recognition as sr
from gtts import gTTS
from pydub import AudioSegment
# Create the directory that holds generated audio files. exist_ok makes the
# call a no-op when the directory is already there.
os.makedirs('audio', exist_ok=True)

# Path prefix inserted into the public URLs built by
# create_pronunciation_audio(). It starts out empty so the module can be
# imported; a bare `filePath` expression here would raise NameError.
filePath = ""
def upfilepath():
    """Upload a sample MP3 to the Space and report the directory it landed in.

    The sample file is downloaded, written to ``temp_audio_file.mp3`` in the
    current working directory, and posted to the Space's upload endpoint.

    Returns:
        The directory component of the first path in the endpoint's JSON
        response, or ``None`` when either the download or the upload does
        not answer with HTTP 200.
    """
    # Audio file fetched from the internet and used as the upload payload.
    file_url = "https://st.ielts-fighter.com/src/ielts-fighter/2019/09/09/i%20ng%E1%BA%AFn.mp3"
    # Endpoint that receives the uploaded audio file.
    upload_url = "https://mr2along-speech-recognize.hf.space/gradio_api/upload?upload_id=yw08d344te"
    local_filename = "temp_audio_file.mp3"
    # Without a timeout a stalled server would block the caller indefinitely.
    request_timeout = 30

    download = requests.get(file_url, timeout=request_timeout)
    if download.status_code != 200:
        print(f"Lỗi: {download.status_code}")
        return None

    # Keep a temporary copy of the downloaded audio on disk.
    with open(local_filename, 'wb') as f:
        f.write(download.content)

    # The context manager closes the handle once the POST has completed;
    # previously the file object was opened inline and never closed.
    with open(local_filename, 'rb') as payload:
        upload = requests.post(
            upload_url,
            files={'files': payload},
            timeout=request_timeout,
        )

    if upload.status_code != 200:
        print(f"Lỗi: {upload.status_code}")
        print(upload.text)  # Show the server's error message
        return None

    print("Upload thành công!")
    result = upload.json()
    extracted_path = os.path.dirname(result[0])
    print(extracted_path)
    return extracted_path
# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    """Transcribe an audio file with the Google Web Speech API.

    Args:
        audio: Path to the audio file, or ``None``/empty when nothing was
            uploaded. Files whose extension is not ``.wav`` are first
            converted to WAV next to the original file.

    Returns:
        The transcription on success; otherwise a human-readable message
        describing why no transcription could be produced.
    """
    if not audio:
        return "No audio file provided."  # Nothing was uploaded

    recognizer = sr.Recognizer()

    # splitext only looks at the final extension, so a directory name that
    # happens to contain the extension text (e.g. ".../mp3/clip.mp3") is left
    # untouched -- str.replace() on the whole path rewrote those as well.
    base_path, extension = os.path.splitext(audio)

    if extension.lower() != '.wav':
        try:
            audio_segment = AudioSegment.from_file(audio)
            wav_path = base_path + '.wav'
            audio_segment.export(wav_path, format='wav')
            audio = wav_path  # Continue with the converted file
        except Exception as e:
            # Conversion failures are reported to the UI as text rather
            # than raised.
            return f"Error converting audio: {e}"

    # Load the whole file into the recognizer's audio representation.
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Error with Google Speech Recognition service: {e}"
# Step 2: Create pronunciation audio for incorrect words
def create_pronunciation_audio(word):
    """Synthesize *word* with gTTS, save it under ``audio/`` and return its URL.

    Args:
        word: The word to pronounce.

    Returns:
        A URL string made of the Space's file endpoint, the module-level
        ``filePath`` prefix and the relative path of the saved MP3.
    """
    # Wait 5 seconds before each synthesis.
    # NOTE(review): presumably throttles calls to the TTS service -- confirm.
    time.sleep(5)
    tts = gTTS(word)
    main_url = "https://mr2along-speech-recognize.hf.space/gradio_api/file="

    # The word comes from user-entered text. Keep only characters that are
    # safe in a file name so that input such as "and/or" or "../x" cannot
    # point outside audio/ or at a directory that does not exist.
    safe_name = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in word
    )
    audio_file_path = f"audio/{safe_name}.mp3"
    tts.save(audio_file_path)

    audio_url = f"{main_url}{filePath}/{audio_file_path}"
    print(f"audio/{word}: {audio_url}")
    return audio_url
# Step 3: Compare the transcribed text with the input paragraph
def compare_texts(reference_text, transcribed_text):
    """Build an HTML report comparing a reference paragraph to a transcription.

    The report contains a CORRECT/INCORRECT verdict (similarity above 50),
    the similarity score, the transcription, and every reference word
    coloured by how it matched the word at the same position in the
    transcription: green for a case-insensitive match, yellow when a close
    match exists anywhere in the transcription, red otherwise. Each distinct
    red word that has a counterpart position in the transcription gets a
    pronunciation audio player.

    Args:
        reference_text: The paragraph the speaker was supposed to read.
        transcribed_text: The text recognized from the recording.

    Returns:
        The report as an HTML string. All user-supplied text is escaped.
    """
    reference_text = reference_text or ""
    transcribed_text = transcribed_text or ""
    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()

    # Character-level similarity of the two raw strings, as a percentage.
    matcher = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(matcher.ratio() * 100, 2)
    fidelity = 'CORRECT' if similarity_score > 50 else 'INCORRECT'

    parts = [
        f"Fidelity Class: {fidelity} ",
        f"Quality Score: {similarity_score} ",
        f"Transcribed Text: {html.escape(transcribed_text)} ",
        "Word Score List: ",
    ]

    # Incorrect word -> pronunciation URL. A dict preserves first-seen order
    # and lets a repeated word reuse its audio instead of triggering another
    # slow synthesis call.
    pronunciation_urls = {}
    for position, word in enumerate(reference_words):
        if position >= len(transcribed_words):
            # Reference word with no counterpart in the transcription:
            # shown as incorrect, no audio is generated for it.
            colour = "red"
        elif word.lower() == transcribed_words[position].lower():
            colour = "green"
        elif difflib.get_close_matches(word, transcribed_words):
            colour = "yellow"
        else:
            colour = "red"
            if word not in pronunciation_urls:
                pronunciation_urls[word] = create_pronunciation_audio(word)
        parts.append(
            f'<span style="color: {colour};">{html.escape(word)}</span> '
        )

    # Provide audio for incorrect words
    if pronunciation_urls:
        parts.append(" Pronunciation for Incorrect Words: ")
        for word, audio_url in pronunciation_urls.items():
            # NOTE(review): `word` is itself an element of reference_words,
            # so the best close match is the word itself -- confirm whether
            # a different candidate list was intended.
            suggestion = difflib.get_close_matches(word, reference_words, n=1)
            suggestion_text = (
                f" (Did you mean: {html.escape(suggestion[0])}?)"
                if suggestion
                else ""
            )
            parts.append(f'{html.escape(word)}: ')
            parts.append(
                f'<audio controls src="{html.escape(audio_url)}"></audio>'
            )
            parts.append(f'{suggestion_text} ')

    return "".join(parts)
# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Read *paragraph* aloud with gTTS and return the saved MP3's path.

    Args:
        paragraph: Text to synthesize.

    Returns:
        ``"audio/paragraph.mp3"`` after the audio has been written, or
        ``None`` when there is no text to speak (``None``, empty, or
        whitespace only).
    """
    # Whitespace-only input is truthy but leaves the synthesizer with
    # nothing to say, so it is treated the same as empty input.
    if not paragraph or not paragraph.strip():
        return None

    tts = gTTS(paragraph)
    audio_file_path = "audio/paragraph.mp3"
    tts.save(audio_file_path)
    return audio_file_path
# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Transcribe the recording and compare it with the reference paragraph.

    Args:
        paragraph: The reference text entered by the user.
        audio: Path of the recorded/uploaded audio file, or ``None``.

    Returns:
        The HTML comparison report produced by ``compare_texts``.
    """
    # create_pronunciation_audio() reads the module-level filePath. Without
    # this declaration the assignment below would only bind a local name and
    # the URL builder would never see the uploaded directory.
    global filePath

    uploaded_dir = upfilepath()
    if uploaded_dir is not None:
        # Keep the previous value when the upload did not yield a path.
        filePath = uploaded_dir

    # Transcribe the audio
    transcribed_text = transcribe_audio(audio)
    # Compare the original paragraph with the transcribed text
    return compare_texts(paragraph, transcribed_text)
# Tab 1: paragraph + recorded audio in, HTML comparison report out.
interface = gr.Interface(
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
    fn=gradio_function,
    inputs=[
        gr.Textbox(label="Input Paragraph", lines=5),
        gr.Audio(label="Record Audio", type="filepath"),
    ],
    outputs="html",
)

# Tab 2: paragraph in, synthesized speech out.
tts_interface = gr.Interface(
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
    fn=text_to_speech,
    inputs=gr.Textbox(label="Input Paragraph to Read Aloud", lines=5),
    outputs=gr.Audio(label="Text-to-Speech Output"),
)

# Present both tools as tabs of a single app.
demo = gr.TabbedInterface(
    [interface, tts_interface],
    ["Speech Recognition", "Text-to-Speech"],
)

# Start the Gradio server.
demo.launch()