Spaces:

shukdevdatta123
/

GPT-4.5-Multimodal-Chatbot

Running

App Files Files Community

shukdevdatta123 committed on Mar 22

Commit

cc3538d

verified ·

1 Parent(s): 9f74220

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -24

app.py CHANGED Viewed

@@ -44,7 +44,7 @@ def image_url_chat(image_url, text_query, temperature, top_p, max_output_tokens)
     messages = [
         {"role": "user", "content": [
-            {"type": "image_url", "image_url": {"url": image_url}},  # Corrected format
             {"type": "text", "text": text_query}
         ]},
     ]
@@ -71,7 +71,7 @@ def image_chat(image_file, text_query, temperature, top_p, max_output_tokens):
     messages = [
         {"role": "user", "content": [
-            {"type": "image_url", "image_url": {"url": image_data}},  # Fixed format
             {"type": "text", "text": text_query}
         ]},
     ]
@@ -121,9 +121,59 @@ def transcribe_audio(audio_binary, openai_api_key):
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
-# Function to clear the chat (Fix: Returns the correct number of outputs)
 def clear_chat():
-    return "", "", "", "", "", "", "", None, "", None, "", 1.0, 1.0, 2048
 # Gradio UI Layout
 with gr.Blocks() as demo:
@@ -155,10 +205,10 @@ with gr.Blocks() as demo:
         #clear_chat_button:hover {
             background: linear-gradient(135deg, #c53030 0%, #e53e3e 100%); /* Slightly darker red gradient on hover */
         }
-        #ask_button {
             background: linear-gradient(135deg, #fbd38d 0%, #f6e05e 100%); /* Yellow gradient */
         }
-        #ask_button:hover {
             background: linear-gradient(135deg, #ecc94b 0%, #fbd38d 100%); /* Slightly darker yellow gradient on hover */
         }
     </style>
@@ -173,40 +223,54 @@ with gr.Blocks() as demo:
     with gr.Row():
         temperature = gr.Slider(0, 2, value=1.0, step=0.1, label="Temperature")
         top_p = gr.Slider(0, 1, value=1.0, step=0.1, label="Top-P")
-        max_output_tokens = gr.Slider(0, 16384, value=2048, step=512, label="Max Output Tokens")  # Changed default to 2048
     with gr.Tabs():
         with gr.Tab("Image URL Chat"):
             image_url = gr.Textbox(label="Enter Image URL")
             image_query = gr.Textbox(label="Ask about the Image")
             image_url_output = gr.Textbox(label="Response", interactive=False)
-            image_url_button = gr.Button("Ask",elem_id="ask_button")
         with gr.Tab("Text Chat"):
             text_query = gr.Textbox(label="Enter your query")
             text_output = gr.Textbox(label="Response", interactive=False)
-            text_button = gr.Button("Ask",elem_id="ask_button")
         with gr.Tab("Image Chat"):
             image_upload = gr.File(label="Upload an Image", type="filepath")
             image_text_query = gr.Textbox(label="Ask about the uploaded image")
             image_output = gr.Textbox(label="Response", interactive=False)
-            image_button = gr.Button("Ask",elem_id="ask_button")
         with gr.Tab("PDF Chat"):
             pdf_upload = gr.File(label="Upload a PDF", type="filepath")
             pdf_text_query = gr.Textbox(label="Ask about the uploaded PDF")
             pdf_output = gr.Textbox(label="Response", interactive=False)
-            pdf_button = gr.Button("Ask",elem_id="ask_button")
         with gr.Tab("Voice Chat"):
-            audio_upload = gr.File(label="Upload an Audio File", type="binary")
-            audio_query = gr.Textbox(label="Ask about the transcription")
-            audio_output = gr.Textbox(label="Response", interactive=False)
-            audio_button = gr.Button("Ask",elem_id="ask_button")
     # Clear chat button
-    clear_button = gr.Button("Clear Chat",elem_id="clear_chat_button")
     # Button Click Actions
     api_key_button.click(set_api_key, inputs=[api_key_input], outputs=[api_key_output])
@@ -215,22 +279,42 @@ with gr.Blocks() as demo:
     image_button.click(image_chat, [image_upload, image_text_query, temperature, top_p, max_output_tokens], image_output)
     pdf_button.click(pdf_chat, [pdf_upload, pdf_text_query, temperature, top_p, max_output_tokens], pdf_output)
-    # For Voice Chat
-    audio_button.click(
-        lambda audio_binary, query, temperature, top_p, max_output_tokens: query_openai(
-            [{"role": "user", "content": [{"type": "text", "text": transcribe_audio(audio_binary, api_key)}, {"type": "text", "text": query}]}],
-            temperature, top_p, max_output_tokens
-        ), [audio_upload, audio_query, temperature, top_p, max_output_tokens], audio_output
     )
-    # Fix: Clear button resets all necessary fields correctly
     clear_button.click(
         clear_chat,
         outputs=[
             image_url, image_query, image_url_output,
             text_query, text_output,
             image_text_query, image_output,
-            pdf_upload, pdf_text_query, pdf_output,
             temperature, top_p, max_output_tokens
         ]
     )

     messages = [
         {"role": "user", "content": [
+            {"type": "image_url", "image_url": {"url": image_url}},
             {"type": "text", "text": text_query}
         ]},
     ]
     messages = [
         {"role": "user", "content": [
+            {"type": "image_url", "image_url": {"url": image_data}},
             {"type": "text", "text": text_query}
         ]},
     ]
     except Exception as e:
         return f"Error transcribing audio: {str(e)}"
+# Function to handle uploaded audio transcription
+def process_uploaded_audio(audio_binary):
+    if not audio_binary:
+        return "Please upload an audio file first."
+    if not api_key:
+        return "Please enter your OpenAI API key first."
+    try:
+        transcription = transcribe_audio(audio_binary, api_key)
+        return transcription
+    except Exception as e:
+        return f"Error transcribing audio: {str(e)}"
+# Function to handle recorded audio transcription
+def process_recorded_audio(audio_path):
+    if not audio_path:
+        return "No audio recorded."
+    if not api_key:
+        return "Please enter your OpenAI API key first."
+    try:
+        with open(audio_path, "rb") as audio_file:
+            audio_binary = audio_file.read()
+        transcription = transcribe_audio(audio_binary, api_key)
+        return transcription
+    except Exception as e:
+        return f"Error transcribing recorded audio: {str(e)}"
+# Function to process the voice chat queries
+def process_voice_query(transcription, query, temperature, top_p, max_output_tokens):
+    if not transcription or transcription.startswith("Error") or transcription.startswith("Please"):
+        return "Please ensure audio is transcribed successfully first."
+    if not query:
+        # If no specific query is provided, use the transcription as the query
+        messages = [{"role": "user", "content": [{"type": "text", "text": transcription}]}]
+    else:
+        # If a query about the transcription is provided
+        messages = [
+            {"role": "user", "content": [
+                {"type": "text", "text": f"Transcription: {transcription}"},
+                {"type": "text", "text": f"Query: {query}"}
+            ]}
+        ]
+    return query_openai(messages, temperature, top_p, max_output_tokens)
+# Function to clear the chat
 def clear_chat():
+    return "", "", "", "", "", "", "", None, "", None, "", None, "", "", "", 1.0, 1.0, 2048
 # Gradio UI Layout
 with gr.Blocks() as demo:
         #clear_chat_button:hover {
             background: linear-gradient(135deg, #c53030 0%, #e53e3e 100%); /* Slightly darker red gradient on hover */
         }
+        #ask_button, #transcribe_button {
             background: linear-gradient(135deg, #fbd38d 0%, #f6e05e 100%); /* Yellow gradient */
         }
+        #ask_button:hover, #transcribe_button:hover {
             background: linear-gradient(135deg, #ecc94b 0%, #fbd38d 100%); /* Slightly darker yellow gradient on hover */
         }
     </style>
     with gr.Row():
         temperature = gr.Slider(0, 2, value=1.0, step=0.1, label="Temperature")
         top_p = gr.Slider(0, 1, value=1.0, step=0.1, label="Top-P")
+        max_output_tokens = gr.Slider(0, 16384, value=2048, step=512, label="Max Output Tokens")
     with gr.Tabs():
         with gr.Tab("Image URL Chat"):
             image_url = gr.Textbox(label="Enter Image URL")
             image_query = gr.Textbox(label="Ask about the Image")
             image_url_output = gr.Textbox(label="Response", interactive=False)
+            image_url_button = gr.Button("Ask", elem_id="ask_button")
         with gr.Tab("Text Chat"):
             text_query = gr.Textbox(label="Enter your query")
             text_output = gr.Textbox(label="Response", interactive=False)
+            text_button = gr.Button("Ask", elem_id="ask_button")
         with gr.Tab("Image Chat"):
             image_upload = gr.File(label="Upload an Image", type="filepath")
             image_text_query = gr.Textbox(label="Ask about the uploaded image")
             image_output = gr.Textbox(label="Response", interactive=False)
+            image_button = gr.Button("Ask", elem_id="ask_button")
         with gr.Tab("PDF Chat"):
             pdf_upload = gr.File(label="Upload a PDF", type="filepath")
             pdf_text_query = gr.Textbox(label="Ask about the uploaded PDF")
             pdf_output = gr.Textbox(label="Response", interactive=False)
+            pdf_button = gr.Button("Ask", elem_id="ask_button")
         with gr.Tab("Voice Chat"):
+            with gr.Tabs():
+                with gr.Tab("Upload Audio"):
+                    # Upload audio section
+                    audio_upload = gr.File(label="Upload an Audio File", type="binary")
+                    upload_transcribe_button = gr.Button("Transcribe Audio", elem_id="transcribe_button")
+                    upload_transcription = gr.Textbox(label="Transcription", interactive=False)
+                    upload_audio_query = gr.Textbox(label="Ask about the transcription (optional)")
+                    upload_audio_output = gr.Textbox(label="Response", interactive=False)
+                    upload_audio_button = gr.Button("Ask", elem_id="ask_button")
+                with gr.Tab("Record Audio"):
+                    # Record audio section
+                    audio_recorder = gr.Audio(source="microphone", type="filepath", label="Record your voice")
+                    record_transcribe_button = gr.Button("Transcribe Recording", elem_id="transcribe_button")
+                    record_transcription = gr.Textbox(label="Transcription", interactive=False)
+                    record_audio_query = gr.Textbox(label="Ask about the transcription (optional)")
+                    record_audio_output = gr.Textbox(label="Response", interactive=False)
+                    record_audio_button = gr.Button("Ask", elem_id="ask_button")
     # Clear chat button
+    clear_button = gr.Button("Clear Chat", elem_id="clear_chat_button")
     # Button Click Actions
     api_key_button.click(set_api_key, inputs=[api_key_input], outputs=[api_key_output])
     image_button.click(image_chat, [image_upload, image_text_query, temperature, top_p, max_output_tokens], image_output)
     pdf_button.click(pdf_chat, [pdf_upload, pdf_text_query, temperature, top_p, max_output_tokens], pdf_output)
+    # Voice Chat - Upload Audio tab actions
+    upload_transcribe_button.click(
+        process_uploaded_audio,
+        inputs=[audio_upload],
+        outputs=[upload_transcription]
+    )
+    upload_audio_button.click(
+        process_voice_query,
+        inputs=[upload_transcription, upload_audio_query, temperature, top_p, max_output_tokens],
+        outputs=[upload_audio_output]
+    )
+    # Voice Chat - Record Audio tab actions
+    record_transcribe_button.click(
+        process_recorded_audio,
+        inputs=[audio_recorder],
+        outputs=[record_transcription]
+    )
+    record_audio_button.click(
+        process_voice_query,
+        inputs=[record_transcription, record_audio_query, temperature, top_p, max_output_tokens],
+        outputs=[record_audio_output]
     )
+    # Clear button resets all necessary fields
     clear_button.click(
         clear_chat,
         outputs=[
             image_url, image_query, image_url_output,
             text_query, text_output,
             image_text_query, image_output,
+            pdf_upload, pdf_text_query, pdf_output,
+            audio_upload, upload_transcription, upload_audio_query, upload_audio_output,
+            audio_recorder, record_transcription, record_audio_query, record_audio_output,
             temperature, top_p, max_output_tokens
         ]
     )