speech_recognize

Runtime error

File size: 7,638 Bytes

46e19ef
edfaf92
d558c26
 
 
85d956d
0d0b31f
3b3baf1
0d0b31f
83632fe
f975491
7515a2b
 
 
2afc1aa
7515a2b
746e430
 
 
 
 
 
a893e98
7515a2b
746e430
 
 
 
 
 
7515a2b
746e430
 
 
 
 
7515a2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d558c26
 
 
0f433ab
a893e98
d558c26
 
 
 
7515a2b
859be22
 
f634650
859be22
f634650
859be22
f634650
859be22
f634650
37a3491
3b3baf1
61c9f90
a893e98
61c9f90
 
7515a2b
d558c26
3e9568e
d0031d1
 
a893e98
61c9f90
d0031d1
a893e98
61c9f90
d558c26
a893e98
 
91a2ea1
0f433ab
40cb73b
3e9568e
37a3491
a893e98
37a3491
61c9f90
9586c71
 
91a2ea1
37a3491
7515a2b
 
 
 
a225d4f
7515a2b
0d0b31f
a893e98
 
d558c26
d0031d1
a225d4f
85d956d
290e8e0
 
79b4e39
3e9568e
290e8e0
 
2c8cf7f
85d956d
d558c26
8ef9310
d558c26
8ef9310
d558c26
 
 
 
 
a225d4f
58f3405
d558c26
 
8ef9310
58f3405
0498d1c
8ef9310
257e787
23f5423
 
d558c26
 
85d956d
 
 
 
0d0b31f
85d956d
 
 
 
 
 
8a9dc3b
d558c26
7515a2b

import os
import requests
import speech_recognition as sr
import difflib
import gradio as gr
from gtts import gTTS
import io
from pydub import AudioSegment
import time
import eng_to_ipa as ipa

# Ensure the directory for generated audio exists. exist_ok=True makes this a
# single call and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs('audio', exist_ok=True)

# Helper: upload a local file to the hosted Space's upload endpoint
def upfilepath(local_filename):
    """Upload a local file to the Space's ``gradio_api/upload`` endpoint.

    Args:
        local_filename: Path of the file to send.

    Returns:
        The first element of the JSON response when the server answers with
        HTTP 200, ``None`` for any other status code, or a human-readable
        error string when the request times out or any other exception is
        raised while posting or decoding the response.

    Raises:
        OSError: If ``local_filename`` cannot be opened; the file is opened
            before the request's error handling begins.
    """
    # The current timestamp is used as the upload_id query parameter.
    ts = time.time()
    upload_url = f"https://mr2along-speech-recognize.hf.space/gradio_api/upload?upload_id={ts}"

    # Context manager guarantees the handle is closed on every return path
    # (the handle used to stay open after the upload finished or failed).
    with open(local_filename, 'rb') as file_handle:
        try:
            response = requests.post(upload_url, files={'files': file_handle}, timeout=30)

            if response.status_code == 200:
                result = response.json()
                extracted_path = result[0]
                return extracted_path
            else:
                return None

        except requests.exceptions.Timeout:
            return "Request timed out. Please try again."
        except Exception as e:
            return f"An error occurred: {e}"

# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    """Transcribe an audio file with ``speech_recognition``'s ``recognize_google``.

    Files whose extension is not ``.wav`` are first converted to WAV with
    pydub; the converted file is written next to the original.

    Args:
        audio: Path to the audio file, or ``None``.

    Returns:
        The transcription on success. On failure a descriptive message is
        returned instead of raising: missing input, missing file, conversion
        error, unintelligible audio, or a recognition-service error.
    """
    if audio is None:
        return "No audio file provided."

    if not os.path.isfile(audio):
        return "Audio file not found."

    # Split off only the real extension. Replacing the extension text anywhere
    # in the path would also rewrite matching directory names
    # (e.g. "/tmp/mp3s/a.mp3"), and extensionless paths have no "." to split on.
    root, extension = os.path.splitext(audio)

    if extension.lower() != '.wav':
        try:
            audio_segment = AudioSegment.from_file(audio)
            wav_path = f"{root}.wav"
            audio_segment.export(wav_path, format='wav')
            audio = wav_path
        except Exception as e:
            return f"Error converting audio: {e}"

    # Created only after validation so invalid input returns early.
    recognizer = sr.Recognizer()

    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio."
    except sr.RequestError as e:
        return f"Error with Google Speech Recognition service: {e}"

# Function to get IPA transcription
def ipa_transcription(sentence):
    """Convert ``sentence`` to IPA using ``eng_to_ipa``.

    Returns the converted text, or an error message string if the
    conversion raises.
    """
    try:
        converted = ipa.convert(sentence)
    except Exception as err:
        return f"Error during IPA transcription: {err}"
    else:
        return converted

# Step 2: Create pronunciation audio for incorrect words (locally)
def create_pronunciation_audio(word):
    """Synthesize ``word`` with gTTS and save it as an MP3 under ``audio/``.

    Args:
        word: The word to pronounce. The spoken text is the word as given;
            only the file name derived from it is sanitized.

    Returns:
        The local path of the saved MP3, or an error message string if
        synthesis or saving fails.
    """
    try:
        # The word comes from user-supplied text: keep only filename-safe
        # characters so "/" or ".." cannot break or escape the audio directory.
        safe_name = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in word
        ) or "word"
        tts = gTTS(word)
        audio_file_path = f"audio/{safe_name}.mp3"
        tts.save(audio_file_path)
        return audio_file_path  # Return the local path instead of uploading
    except Exception as e:
        return f"Failed to create pronunciation audio: {e}"

# Step 3: Compare the transcribed text with the input paragraph
def compare_texts(reference_text, transcribed_text):
    """Score a transcription against the reference text and build an HTML report.

    The overall score is the character-level ``difflib.SequenceMatcher`` ratio
    of the two texts, as a percentage rounded to two decimals. Each reference
    word is then compared with the transcribed word at the same position:
    a case-insensitive match scores 100, a ``difflib`` close match scores 80,
    and anything else (including a missing word) scores 0. For every
    0-scored word a pronunciation clip is generated and uploaded.

    Args:
        reference_text: The paragraph the speaker was asked to read.
            ``None`` is treated as an empty string.
        transcribed_text: The recognized text. ``None`` is treated as an
            empty string.

    Returns:
        A two-element list ``[html_output, word_score_list]`` where
        ``word_score_list`` holds one ``{"quality_score", "word"}`` dict per
        reference word.
    """
    from html import escape  # local import keeps this block self-contained

    reference_text = reference_text or ""
    transcribed_text = transcribed_text or ""
    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()
    incorrect_words_audios = []  # (word, local audio path or error message)
    word_score_list = []  # One score entry per reference word

    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(sm.ratio() * 100, 2)

    if similarity_score >= 85:
        fidelity = "GOOD (&gt;=85%)"
    elif similarity_score >= 70:
        fidelity = "ACCEPTABLE (70% - 85%)"
    elif similarity_score >= 50:
        fidelity = "NEEDS IMPROVEMENT (50% - 70%)"
    else:
        fidelity = "POOR (&lt;50%)"

    # Collect fragments and join once at the end. All text that originates
    # from the user or the recognizer is HTML-escaped before insertion.
    parts = [
        "<strong>Fidelity Class:</strong> ",
        f"<strong>{fidelity}</strong><br>",
        f"<strong>Quality Score:</strong> {similarity_score}%<br>",
        f"<strong>Transcribed Text:</strong> {escape(transcribed_text)}<br>",
        f"<strong>IPA Transcription:</strong> {escape(str(ipa_transcription(reference_text)))}<br>",
        "<strong>Word Score List:</strong><br>",
    ]

    # Generate colored word score list (position-by-position comparison)
    for i, word in enumerate(reference_words):
        heard = transcribed_words[i] if i < len(transcribed_words) else None
        if heard is not None and word.lower() == heard.lower():
            score, color = 100, "green"  # Correct words in green
        elif heard is not None and difflib.get_close_matches(word, [heard]):
            score, color = 80, "yellow"  # Close matches in yellow
        else:
            score, color = 0, "red"  # Incorrect or missing words in red
            incorrect_words_audios.append((word, create_pronunciation_audio(word)))
        word_score_list.append({"quality_score": score, "word": word})
        parts.append(f'<span style="color: {color};">{escape(word)}</span> ')

    # Provide audio for incorrect words
    if incorrect_words_audios:
        parts.append("<br><strong>Pronunciation for Incorrect Words:</strong><br>")
        for word, audio in incorrect_words_audios:
            # NOTE(review): `word` is itself taken from reference_words, so
            # this lookup matches the word against its own list — confirm
            # whether a different candidate list was intended.
            suggestion = difflib.get_close_matches(word, reference_words, n=1)
            suggestion_text = (
                f" (Did you mean: <em>{escape(suggestion[0])}</em>?)" if suggestion else ""
            )
            parts.append(f"{escape(word)}: ")

            # create_pronunciation_audio returns an error message instead of a
            # path on failure; only upload when a real file exists.
            up_audio = upfilepath(audio) if os.path.isfile(audio) else None
            # upfilepath reports failures as None or as one of two messages.
            upload_ok = isinstance(up_audio, str) and not up_audio.startswith(
                ("Request timed out", "An error occurred")
            )
            if upload_ok:
                audio_src = f"https://mr2along-speech-recognize.hf.space/gradio_api/file={up_audio}"
                parts.append(
                    f'<audio controls><source src="{escape(audio_src, quote=True)}" type="audio/mpeg">'
                    f"Your browser does not support the audio tag.</audio>{suggestion_text}<br>"
                )
            else:
                parts.append(f"(audio unavailable){suggestion_text}<br>")

    html_output = "".join(parts)

    # Return structured data
    return [html_output, word_score_list]


# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Read ``paragraph`` aloud with gTTS and save it to ``audio/paragraph.mp3``.

    Args:
        paragraph: Text to synthesize.

    Returns:
        The path of the saved MP3, or ``None`` when no speakable text is
        provided (``None``, empty, or whitespace only).
    """
    # Whitespace-only input is truthy but leaves gTTS nothing to speak,
    # so treat it the same as empty input.
    if not paragraph or not paragraph.strip():
        return None

    tts = gTTS(paragraph)
    audio_file_path = "audio/paragraph.mp3"  # Save the audio to a file
    tts.save(audio_file_path)
    return audio_file_path  # Return the file path

# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Transcribe ``audio`` and compare the result with ``paragraph``.

    Returns whatever ``compare_texts`` produces for the reference paragraph
    and the transcription.
    """
    return compare_texts(paragraph, transcribe_audio(audio))
    
# Speech-recognition tab: a reference paragraph plus a recording, rendered as HTML
interface = gr.Interface(
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
    fn=gradio_function,
    inputs=[
        gr.Textbox(label="Input Paragraph", lines=5),
        gr.Audio(label="Record Audio", type="filepath"),
    ],
    outputs=["html"],
)

# Text-to-speech tab: a paragraph in, synthesized audio out
tts_interface = gr.Interface(
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
    fn=text_to_speech,
    inputs=gr.Textbox(label="Input Paragraph to Read Aloud", lines=5),
    outputs=gr.Audio(label="Text-to-Speech Output"),
)

# Combine both interfaces into one tabbed app
demo = gr.TabbedInterface(
    [interface, tts_interface],
    ["Speech Recognition", "Text-to-Speech"],
)

# Launch Gradio app
demo.launch()