import gradio as gr
from transformers import pipeline
from gtts import gTTS
import tempfile

# Initialize the speech-to-text transcriber
transcriber = pipeline(
    "automatic-speech-recognition",
    model="jonatasgrosman/wav2vec2-large-xlsr-53-english",
)

# Initialize the question-answering model
qa_model = pipeline("question-answering", model="AVISHKAARAM/avishkaarak-ekta-hindi")


def answer_question(context, question=None, audio=None):
    try:
        # If audio is provided, transcribe it to get the question text
        if audio:
            question_text = transcriber(audio)["text"]
        else:
            question_text = question

        # Require a question from either the text box or the audio input
        if not question_text:
            return "Please provide a question as text or audio.", None

        # Generate an answer to the question from the given context
        qa_result = qa_model(question=question_text, context=context)
        answer = qa_result["answer"]

        # Convert the answer to speech and save it to a temporary MP3 file
        tts = gTTS(text=answer, lang="en")
        audio_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
        tts.save(audio_path)

        return answer, audio_path
    except Exception as e:
        return str(e), None


# Define the Gradio interface
context_input = gr.Textbox(label="Context", lines=3)
question_input = gr.Textbox(label="Question")
audio_input = gr.Audio(type="filepath", label="Question (Audio Input)")
output_text = gr.Textbox(label="Answer")
output_audio = gr.Audio(label="Answer (Audio Output)")

interface = gr.Interface(
    fn=answer_question,
    inputs=[context_input, question_input, audio_input],
    outputs=[output_text, output_audio],
    title="Multimodal Question Answering",
    description="Provide a context and either a text question or an audio question to get an answer.",
    examples=[
        ["The capital of France is Paris.", "What is the capital of France?", None],
        ["OpenAI is famous for developing GPT-3.", "What is OpenAI known for?", None],
    ],
)

# Launch the Gradio app
if __name__ == "__main__":
    interface.launch()
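
# A minimal usage sketch: answer_question can also be called directly for a
# quick smoke test without launching the UI. The context/question strings are
# taken from the interface examples above and are illustrative only; uncomment
# to run after the models have loaded.
#
#     answer, audio_path = answer_question(
#         context="The capital of France is Paris.",
#         question="What is the capital of France?",
#     )
#     print(answer, audio_path)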