speech_recognize

Runtime error

File size: 5,550 Bytes

006f012
46e19ef
edfaf92
d558c26
 
 
85d956d
0d0b31f
3b3baf1
0d0b31f
83632fe
f975491
006f012
 
 
 
 
 
 
 
 
2afc1aa
006f012
746e430
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
006f012
d558c26
 
 
0f433ab
d558c26
 
 
 
006f012
859be22
 
f634650
859be22
f634650
859be22
f634650
859be22
f634650
37a3491
3b3baf1
61c9f90
006f012
61c9f90
 
006f012
d558c26
3e9568e
d558c26
61c9f90
37a3491
61c9f90
d558c26
91a2ea1
 
 
0f433ab
40cb73b
3e9568e
37a3491
 
61c9f90
9586c71
 
91a2ea1
37a3491
006f012
 
a225d4f
006f012
0d0b31f
257e787
d558c26
006f012
a225d4f
85d956d
290e8e0
 
79b4e39
3e9568e
290e8e0
 
2c8cf7f
85d956d
d558c26
8ef9310
d558c26
8ef9310
d558c26
 
 
 
 
a225d4f
58f3405
d558c26
 
8ef9310
58f3405
0498d1c
8ef9310
257e787
23f5423
 
d558c26
 
85d956d
 
 
 
0d0b31f
85d956d
 
 
 
 
 
8a9dc3b
d558c26
f975491

# Import required libraries
import os
import requests
import speech_recognition as sr
import difflib
import gradio as gr
from gtts import gTTS
import io
from pydub import AudioSegment
import time
import eng_to_ipa as ipa

# Function to create pronunciation audio
def create_pronunciation_audio(word):
    """Synthesize *word* with gTTS and save it as an MP3 under ``audio/``.

    Args:
        word: The word to pronounce.

    Returns:
        The local path of the saved MP3 on success. On any failure a
        human-readable error message string is returned instead (contract
        kept for backward compatibility), so callers should check that the
        returned path actually exists before using it.
    """
    try:
        # Make sure the output directory exists before saving into it.
        os.makedirs("audio", exist_ok=True)
        # The word comes from free text; neutralise path separators so it
        # cannot point outside (or into a missing subfolder of) ``audio/``.
        safe_name = word.replace("/", "_").replace("\\", "_")
        tts = gTTS(word)
        audio_file_path = f"audio/{safe_name}.mp3"
        tts.save(audio_file_path)
        return audio_file_path  # Local path; uploading is the caller's job
    except Exception as e:
        return f"Failed to create pronunciation audio: {e}"

# Function to upload audio files to the server
def upfilepath(local_filename):
    """Upload a local file to the Space's Gradio upload endpoint.

    Args:
        local_filename: Path of the file to upload.

    Returns:
        The first element of the JSON response when the server answers with
        HTTP 200, ``None`` for any other status code, or an error message
        string when the request times out or any other exception occurs
        while sending the request or reading the response.

    Raises:
        OSError: If ``local_filename`` cannot be opened (raised before any
            request is attempted).
    """
    ts = time.time()
    upload_url = f"https://mr2along-speech-recognize.hf.space/gradio_api/upload?upload_id={ts}"

    # The context manager closes the handle on every exit path; previously
    # the file object was opened and never closed.
    with open(local_filename, 'rb') as file_handle:
        try:
            response = requests.post(
                upload_url,
                files={'files': file_handle},
                timeout=30,  # seconds
            )
            if response.status_code == 200:
                result = response.json()
                return result[0]
            return None
        except requests.exceptions.Timeout:
            return "Request timed out. Please try again."
        except Exception as e:
            return f"An error occurred: {e}"

# Compare the reference text with the transcription and build an HTML report
def compare_texts(reference_text, transcribed_text):
    """Compare a reference text with its transcription and build an HTML report.

    The report contains a fidelity class and quality score derived from
    ``difflib.SequenceMatcher`` over the two full strings, the transcribed
    text, an IPA line for the reference text, a colour-coded list of the
    reference words (compared position by position with the transcribed
    words) and, for words marked incorrect, an audio player with the
    generated pronunciation.

    Args:
        reference_text: The text the speaker was supposed to read.
        transcribed_text: The text recognised from the recording.

    Returns:
        A one-element list holding the HTML report string.
    """
    # Local import keeps this block self-contained.
    from html import escape

    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()
    incorrect_words_audios = []  # (word, local audio path) for incorrect words

    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(sm.ratio() * 100, 2)

    if similarity_score >= 85:
        fidelity = "GOOD (>=85%)"
    elif similarity_score >= 70:
        fidelity = "ACCEPTABLE (70% - 85%)"
    elif similarity_score >= 50:
        fidelity = "NEEDS IMPROVEMENT (50% - 70%)"
    else:
        fidelity = "POOR (<50%)"

    # NOTE(review): ipa_transcription is not defined in the visible part of
    # this file -- confirm it exists, otherwise this call raises NameError.
    ipa_text = f"{ipa_transcription(reference_text)}"

    # All user/recogniser supplied text is escaped before it is embedded in
    # the markup so it cannot inject tags into the rendered report.
    parts = [
        "<strong>Fidelity Class:</strong> ",
        f"<strong>{escape(fidelity)}</strong><br>",
        f"<strong>Quality Score:</strong> {similarity_score}%<br>",
        f"<strong>Transcribed Text:</strong> {escape(transcribed_text)}<br>",
        f"<strong>IPA Transcription:</strong> {escape(ipa_text)}<br>",
        "<strong>Word Score List:</strong><br>",
    ]

    # Colour-code every reference word against the word at the same position
    # in the transcription.
    for i, word in enumerate(reference_words):
        safe_word = escape(word)
        if i >= len(transcribed_words):
            # Word in reference that was not transcribed
            parts.append(f'<span style="color: red;">{safe_word}</span> ')
            continue

        spoken = transcribed_words[i]
        if word.lower() == spoken.lower():
            # Correct words in green
            parts.append(f'<span style="color: green;">{safe_word}</span> ')
        elif difflib.get_close_matches(word, [spoken]):
            # Close matches in yellow
            parts.append(f'<span style="color: yellow;">{safe_word}</span> ')
        else:
            # Incorrect words in red, with a pronunciation sample
            parts.append(f'<span style="color: red;">{safe_word}</span> ')
            audio_file_path = create_pronunciation_audio(word)
            incorrect_words_audios.append((word, audio_file_path))

    # Provide audio for incorrect words
    if incorrect_words_audios:
        parts.append("<br><strong>Pronunciation for Incorrect Words:</strong><br>")
        for word, audio in incorrect_words_audios:
            parts.append(f'{escape(word)}: ')

            # create_pronunciation_audio returns an error message instead of
            # a path when synthesis fails; only upload files that exist.
            up_audio = upfilepath(audio) if os.path.isfile(audio) else None
            # NOTE(review): upfilepath also returns error-message strings on
            # request failures; those cannot be told apart from a path here.
            if not up_audio:
                parts.append("(audio unavailable)<br>")
                continue

            audio_src = f"https://mr2along-speech-recognize.hf.space/gradio_api/file={up_audio}"
            parts.append(
                f'<audio controls><source src="{escape(audio_src)}" type="audio/mpeg">'
                'Your browser does not support the audio tag.</audio><br>'
            )

    return ["".join(parts)]


# Text-to-Speech Function
def text_to_speech(paragraph):
    """Synthesize *paragraph* with gTTS and save it as ``audio/paragraph.mp3``.

    Args:
        paragraph: The text to read aloud.

    Returns:
        The path of the saved MP3, or ``None`` when no text is provided.
    """
    if not paragraph:
        return None  # Handle the case when no text is provided

    # Make sure the output directory exists before saving into it.
    os.makedirs("audio", exist_ok=True)

    tts = gTTS(paragraph)
    audio_file_path = "audio/paragraph.mp3"  # Save the audio to a file
    tts.save(audio_file_path)
    return audio_file_path  # Return the file path

# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Gradio callback: transcribe *audio* and compare it with *paragraph*.

    Args:
        paragraph: The reference text entered by the user.
        audio: The recorded audio handed over by the Gradio audio input.

    Returns:
        Whatever ``compare_texts`` returns for the paragraph and the
        transcription of the recording.
    """
    # Transcription feeds straight into the comparison; its result is the
    # callback's output.
    return compare_texts(paragraph, transcribe_audio(audio))
    
# Gradio Interface using the updated API
interface = gr.Interface(
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
    fn=gradio_function,
    inputs=[
        gr.Textbox(label="Input Paragraph", lines=5),
        gr.Audio(label="Record Audio", type="filepath"),
    ],
    outputs=["html"],
)

# Second tab: read a paragraph aloud via text-to-speech
tts_interface = gr.Interface(
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
    fn=text_to_speech,
    inputs=gr.Textbox(label="Input Paragraph to Read Aloud", lines=5),
    outputs=gr.Audio(label="Text-to-Speech Output"),
)

# Expose both tools as tabs of a single app
demo = gr.TabbedInterface(
    [interface, tts_interface],
    ["Speech Recognition", "Text-to-Speech"],
)

# Start the Gradio server
demo.launch()