import os
import requests
import speech_recognition as sr
import difflib
import gradio as gr
from gtts import gTTS
import io
from pydub import AudioSegment
import time
import pronouncing
import epitran

# Make sure the directory that holds generated audio files exists.
# exist_ok=True creates it only when missing, without the race window of a
# separate exists() check followed by makedirs().
os.makedirs('audio', exist_ok=True)

# Initialize the epitran object for English.
# `epi` is bound to None first so the name always exists: if construction
# fails, later code sees None instead of hitting a NameError.
epi = None
try:
    epi = epitran.Epitran('eng-Latn')
except Exception as e:
    print(f"Error initializing Epitran: {e}")

# Step 1: Transcribe the audio file
def transcribe_audio(audio):
    """Transcribe an audio file to text using Google Speech Recognition.

    Input whose extension is not ``.wav`` is first converted with pydub and
    written as a ``.wav`` file next to the original; that file is then read.

    Args:
        audio: Path to the audio file, or None when nothing was provided.

    Returns:
        The transcription on success. Problems are reported as a
        human-readable message string rather than raised.
    """
    if audio is None:
        return "No audio file provided."

    # Check if the file exists
    if not os.path.isfile(audio):
        return "Audio file not found."

    # splitext isolates the real extension, so only the suffix is swapped.
    # A plain str.replace() would also rewrite matching text in directory
    # names or the file stem, and misbehaves on paths without an extension.
    base_path, extension = os.path.splitext(audio)

    if extension.lower() != '.wav':
        try:
            audio_segment = AudioSegment.from_file(audio)
            wav_path = base_path + '.wav'
            audio_segment.export(wav_path, format='wav')
            audio = wav_path
        except Exception as e:
            return f"Error converting audio: {e}"

    # Created only after validation so the early exits stay cheap.
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Google Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Error with Google Speech Recognition service: {e}"

# Step 2: Create pronunciation audio for incorrect words (locally)
def create_pronunciation_audio(word):
    """Synthesize *word* with gTTS and save it as an MP3 under ``audio/``.

    Args:
        word: The word to pronounce. It originates from user-supplied text.

    Returns:
        The local path of the saved MP3 on success, or a message starting
        with "Failed to create pronunciation audio:" on any error.
    """
    import re  # local import keeps this block self-contained

    try:
        # The word becomes part of a filename, so anything other than word
        # characters, apostrophes and hyphens is replaced. This keeps path
        # separators and ".." from escaping the audio/ directory.
        safe_name = re.sub(r"[^\w'-]", "_", word) or "_"
        tts = gTTS(word)
        audio_file_path = f"audio/{safe_name}.mp3"
        tts.save(audio_file_path)
        return audio_file_path  # Return the local path instead of uploading
    except Exception as e:
        return f"Failed to create pronunciation audio: {e}"

# Function for phonetic respelling
def phonetic_respelling(sentence):
    """Return a space-joined phonetic rendering of *sentence*.

    Each whitespace-separated word is looked up in the pronouncing
    dictionary and replaced by its first listed pronunciation. Words with
    no entry are kept unchanged.

    Args:
        sentence: The text to respell.

    Returns:
        The respelled sentence as a single string.
    """
    import string  # local import keeps this block self-contained

    respelled = []
    for word in sentence.split():
        # Look the word itself up. pronouncing.search() expects a regular
        # expression over phoneme strings, so passing a raw word never
        # matched ordinary lowercase text and could raise re.error on
        # characters such as "(".
        lookup_key = word.strip(string.punctuation)
        phones = pronouncing.phones_for_word(lookup_key) if lookup_key else []
        respelled.append(phones[0] if phones else word)

    respelling = ' '.join(respelled)

    # Drop stress/length marks and map a few IPA vowels to plain letters in
    # a single pass; this only affects words that were kept unchanged.
    cleanup = str.maketrans({
        'ˈ': None,
        'ˌ': None,
        'ː': None,
        'ɑ': 'a',
        'ə': 'uh',
        'ɪ': 'i',
        'ʊ': 'u',
    })
    return respelling.translate(cleanup)

# Function for IPA transcription
def ipa_transcription(sentence):
    """Transliterate *sentence* with the module-level ``epi`` object.

    Any exception is printed and turned into a fixed fallback message, so
    this function does not raise.
    """
    try:
        ipa_text = epi.transliterate(sentence)
    except Exception as err:
        print(f"Error during IPA transcription: {err}")
        return "IPA transcription failed."
    return ipa_text

# Step 3: Compare the transcribed text with the input paragraph
def _audio_data_uri(audio_path):
    """Return the file at *audio_path* as a base64 MP3 data URI, or None if it cannot be read."""
    import base64  # local import keeps this block self-contained

    try:
        with open(audio_path, "rb") as audio_file:
            encoded = base64.b64encode(audio_file.read()).decode("ascii")
    except (OSError, ValueError):
        # Covers a missing file as well as an error message that was
        # returned in place of a path.
        return None
    return f"data:audio/mpeg;base64,{encoded}"


def compare_texts(reference_text, transcribed_text):
    """Build an HTML report comparing a transcription with its reference.

    The report contains a fidelity class and a similarity score (character
    level ``difflib.SequenceMatcher`` ratio, as a percentage), both texts,
    a phonetic respelling and IPA transcription of the reference, a
    colour-coded word list, and an audio player for each mismatched word.

    Words are compared position by position: green for a case-insensitive
    match, yellow for a close match, red otherwise. Reference words with no
    transcribed counterpart are red and get no audio.

    Args:
        reference_text: The text the speaker was supposed to read.
        transcribed_text: The text recognised from the recording.

    Returns:
        A one-element list holding the HTML string.
    """
    from html import escape  # local import keeps this block self-contained

    reference_words = reference_text.split()
    transcribed_words = transcribed_text.split()
    incorrect_words_audios = []  # (word, audio path) for each mismatched word

    sm = difflib.SequenceMatcher(None, reference_text, transcribed_text)
    similarity_score = round(sm.ratio() * 100, 2)

    if similarity_score >= 85:
        fidelity = "GOOD (>=85%)"
    elif similarity_score >= 70:
        fidelity = "ACCEPTABLE (70% - 85%)"
    elif similarity_score >= 50:
        fidelity = "NEEDS IMPROVEMENT (50% - 70%)"
    else:
        fidelity = "POOR (<50%)"

    # Both texts come from the user or the recogniser, so they are escaped
    # before being placed in markup. Fragments are joined once at the end.
    parts = [
        "<strong>Fidelity Class:</strong> ",
        f"<strong>{fidelity}</strong><br>",
        f"<strong>Quality Score:</strong> {similarity_score}%<br>",
        f"<strong>Transcribed Text:</strong> {escape(transcribed_text)}<br>",
        f"<strong>Input Sentence:</strong> {escape(reference_text)}<br>",
        f"<strong>Phonetic Respelling:</strong> {escape(phonetic_respelling(reference_text))}<br>",
        f"<strong>IPA Transcription:</strong> {escape(ipa_transcription(reference_text))}<br>",
        "<strong>Word Score List:</strong><br>",
    ]

    # Generate colored word score list
    heard_count = len(transcribed_words)
    for i, word in enumerate(reference_words):
        shown = escape(word)
        if i >= heard_count:
            # Word in reference that was not transcribed
            parts.append(f'<span style="color: red;">{shown}</span> ')
            continue

        heard = transcribed_words[i]
        if word.lower() == heard.lower():
            parts.append(f'<span style="color: green;">{shown}</span> ')
        elif difflib.get_close_matches(word, [heard]):
            parts.append(f'<span style="color: yellow;">{shown}</span> ')
        else:
            parts.append(f'<span style="color: red;">{shown}</span> ')
            incorrect_words_audios.append((word, create_pronunciation_audio(word)))

    # Provide audio for incorrect words
    if incorrect_words_audios:
        parts.append("<br><strong>Pronunciation for Incorrect Words:</strong><br>")
        for word, audio in incorrect_words_audios:
            # NOTE(review): the word is matched against the reference list it
            # came from, so the suggestion is normally the word itself.
            suggestion = difflib.get_close_matches(word, reference_words, n=1)
            suggestion_text = f" (Did you mean: <em>{escape(suggestion[0])}</em>?)" if suggestion else ""
            parts.append(f'{escape(word)}: ')

            # The audio is embedded directly in the page. This replaces a
            # call to an undefined helper and a hard-coded host URL.
            audio_src = _audio_data_uri(audio)
            if audio_src is None:
                parts.append(f'(audio unavailable){suggestion_text}<br>')
            else:
                parts.append(
                    f'<audio controls><source src="{audio_src}" type="audio/mpeg">'
                    f'Your browser does not support the audio tag.</audio>{suggestion_text}<br>'
                )

    return ["".join(parts)]

# Step 4: Text-to-Speech Function
def text_to_speech(paragraph):
    """Synthesize *paragraph* with gTTS and save it to ``audio/paragraph.mp3``.

    Args:
        paragraph: The text to read aloud; may be None or empty.

    Returns:
        The path of the saved MP3, or None when there is no text to speak.
    """
    # Whitespace-only input is treated like empty input: there is nothing
    # to synthesize, so it is not handed to gTTS.
    if not paragraph or not paragraph.strip():
        return None

    tts = gTTS(paragraph)
    audio_file_path = "audio/paragraph.mp3"  # Save the audio to a file
    tts.save(audio_file_path)
    return audio_file_path  # Return the file path

# Gradio Interface Function
def gradio_function(paragraph, audio):
    """Transcribe *audio* and return its comparison against *paragraph*.

    The recording is passed to transcribe_audio, and the resulting text is
    compared with the reference paragraph by compare_texts, whose result is
    returned unchanged.
    """
    return compare_texts(paragraph, transcribe_audio(audio))
    
# Speech-recognition tab: a reference paragraph and a recording go in, and
# the result of gradio_function is shown through an "html" output.
interface = gr.Interface(
    title="Speech Recognition Comparison",
    description="Input a paragraph, record your audio, and compare the transcription to the original text.",
    fn=gradio_function,
    inputs=[
        gr.Textbox(lines=5, label="Input Paragraph"),
        gr.Audio(type="filepath", label="Record Audio"),
    ],
    outputs=["html"],
)

# Text-to-speech tab: the entered paragraph is passed to text_to_speech and
# the value it returns is given to an audio output component.
tts_interface = gr.Interface(
    title="Text-to-Speech",
    description="This tool will read your input paragraph aloud.",
    fn=text_to_speech,
    inputs=gr.Textbox(lines=5, label="Input Paragraph to Read Aloud"),
    outputs=gr.Audio(label="Text-to-Speech Output"),
)

# Put the two interfaces side by side as named tabs of a single app.
demo = gr.TabbedInterface(
    [interface, tts_interface],
    ["Speech Recognition", "Text-to-Speech"],
)

# Start the Gradio app.
demo.launch()