Spaces:

shukdevdatta123
/

GPT-4.5-Multimodal-Chatbot

Running

App Files Files Community

shukdevdatta123 committed on Mar 15

Commit

f2955b8

verified ·

1 Parent(s): a56f35d

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -24

app.py CHANGED Viewed

@@ -104,16 +104,19 @@ def pdf_chat(pdf_file, text_query, temperature, top_p, max_output_tokens):
         return f"Error processing the PDF: {str(e)}"
 # Function to transcribe audio to text using OpenAI Whisper API
-def transcribe_audio(audio_filepath, openai_api_key):
     if not openai_api_key:
         return "Error: No API key provided."
     openai.api_key = openai_api_key
     try:
-        # Open the audio file and transcribe using OpenAI's Whisper model
-        with open(audio_filepath, "rb") as audio_file:
-            audio_file_transcription = openai.Audio.transcribe(file=audio_file, model="whisper-1")
         return audio_file_transcription.text
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
@@ -170,10 +173,7 @@ with gr.Blocks() as demo:
             pdf_button = gr.Button("Ask")
         with gr.Tab("Voice Chat"):
-            # Record Audio Component for Voice Chat
-            audio_record = gr.Audio(label="Record your Voice", type="filepath", show_label=True)
-            # Upload Audio File Component
-            audio_upload = gr.File(label="Or Upload an Audio File", type="file", file_types=["audio/wav", "audio/mp3"])
             audio_query = gr.Textbox(label="Ask about the transcription")
             audio_output = gr.Textbox(label="Response", interactive=False)
             audio_button = gr.Button("Ask")
@@ -188,27 +188,18 @@ with gr.Blocks() as demo:
     image_button.click(image_chat, [image_upload, image_text_query, temperature, top_p, max_output_tokens], image_output)
     pdf_button.click(pdf_chat, [pdf_upload, pdf_text_query, temperature, top_p, max_output_tokens], pdf_output)
-    # For Voice Chat (record or upload audio and process query)
-    def process_audio(audio, query, temperature, top_p, max_output_tokens):
-        # Check if audio is recorded or uploaded
-        if audio is None:
-            return "Please either record or upload an audio file."
-        # Process the audio (either from recording or upload)
-        transcription = transcribe_audio(audio.name, api_key)
-        if transcription.startswith("Error"):
-            return transcription  # Return transcription error
-        return query_openai(
-            [{"role": "user", "content": [{"type": "text", "text": transcription}, {"type": "text", "text": query}]}],
             temperature, top_p, max_output_tokens
-        )
-    audio_button.click(process_audio, [audio_record, audio_upload, audio_query, temperature, top_p, max_output_tokens], audio_output)
     # Fix: Clear button resets all necessary fields correctly
     clear_button.click(
         clear_chat,
-        outputs=[
             image_url, image_query, image_url_output,
             text_query, text_output,
             image_text_query, image_output,

         return f"Error processing the PDF: {str(e)}"
 # Function to transcribe audio to text using OpenAI Whisper API
+def transcribe_audio(audio_binary, openai_api_key):
     if not openai_api_key:
         return "Error: No API key provided."
     openai.api_key = openai_api_key
     try:
+        # Use the correct transcription API call
+        audio_file_obj = io.BytesIO(audio_binary)
+        audio_file_obj.name = 'audio.wav'  # Set a name for the file object (as OpenAI expects it)
+        # Transcribe the audio to text using OpenAI's whisper model
+        audio_file_transcription = openai.Audio.transcribe(file=audio_file_obj, model="whisper-1")
         return audio_file_transcription.text
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
             pdf_button = gr.Button("Ask")
         with gr.Tab("Voice Chat"):
+            audio_upload = gr.File(label="Upload an Audio File", type="binary")
             audio_query = gr.Textbox(label="Ask about the transcription")
             audio_output = gr.Textbox(label="Response", interactive=False)
             audio_button = gr.Button("Ask")
     image_button.click(image_chat, [image_upload, image_text_query, temperature, top_p, max_output_tokens], image_output)
     pdf_button.click(pdf_chat, [pdf_upload, pdf_text_query, temperature, top_p, max_output_tokens], pdf_output)
+    # For Voice Chat
+    audio_button.click(
+        lambda audio_binary, query, temperature, top_p, max_output_tokens: query_openai(
+            [{"role": "user", "content": [{"type": "text", "text": transcribe_audio(audio_binary, api_key)}, {"type": "text", "text": query}]}],
             temperature, top_p, max_output_tokens
+        ), [audio_upload, audio_query, temperature, top_p, max_output_tokens], audio_output
+    )
     # Fix: Clear button resets all necessary fields correctly
     clear_button.click(
         clear_chat,
+        outputs=[
             image_url, image_query, image_url_output,
             text_query, text_output,
             image_text_query, image_output,