import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import gradio as gr

# Speech-to-text setup: load the checkpoint and its matching processor,
# both looked up through the same identifier.
model_id = "sanket003/whisper-darpg"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
    low_cpu_mem_usage=False,
    torch_dtype=torch.float32,
)
processor = AutoProcessor.from_pretrained(model_id)

# Wrap the model in an automatic-speech-recognition pipeline that reuses the
# processor's tokenizer and feature extractor. Generation is pinned to
# English, and the pipeline is asked to return timestamps with the text.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    return_timestamps=True,
    generate_kwargs={"language": "english"},
    torch_dtype=torch.float32,
)

# Define the Gradio interface function
def transcribe_audio(audio, file):
    """Transcribe whichever audio input was supplied and return its text.

    The microphone recording takes precedence; the uploaded file is used
    only when no recording is present.

    Args:
        audio: Microphone input handed over by the interface (falsy when
            nothing was recorded).
        file: Uploaded-file input handed over by the interface (falsy when
            nothing was uploaded).

    Returns:
        The ``"text"`` entry of the pipeline result, or the fixed message
        ``"No input provided."`` when both inputs are falsy.
    """
    chosen_input = audio or file
    if not chosen_input:
        # Nothing to transcribe: answer directly without touching the model.
        return "No input provided."
    return pipe(chosen_input)["text"]

# Gradio interface: two alternative inputs (microphone recording or uploaded
# file) feeding `transcribe_audio`, with the transcript shown in a textbox.
#
# NOTE: this targets the Gradio 4+ component API. `gr.Audio` takes a list via
# `sources=`; the older singular `source=` keyword is rejected by 4.x, while
# `gr.File(type="filepath")` is a 4.x-only value, so mixing the two fails on
# every release. With `type="filepath"` on both components the callback
# receives plain path strings.
# NOTE(review): confirm the deployment pins gradio>=4.
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(
            sources=["microphone"],
            type="filepath",
            label="Record from Microphone",
        ),
        gr.File(type="filepath", label="Upload Audio File"),
    ],
    outputs=["textbox"],
    title="Transforming Speech into Text",
    description="Choose either microphone input or upload an audio file.",
)

# Start the web app only when this file is executed as a script, not when it
# is imported. Both launch flags are enabled: `share` and `debug`.
if __name__ == "__main__":
    iface.launch(
        debug=True,
        share=True,
    )