shukdevdatta123 committed on
Commit b39aa5d · verified · 1 Parent(s): 8faefa1

Update app.py: migrate from the legacy module-level `openai` calls to the v1 `OpenAI` client, map the UI model choices ("o1", "o3-mini") to concrete API models, send image inputs as multimodal text+image chat messages, rework `update_input_type` to return plain booleans for quiz mode, and replace the hidden quiz-mode Checkbox with a `gr.State`.

Files changed (1): app.py (+559, -521)
app.py CHANGED
@@ -1,522 +1,560 @@
- import gradio as gr
- import openai
- import base64
- from PIL import Image
- import io
- import os
- import tempfile
- import fitz  # PyMuPDF for PDF handling
-
- # Function to extract text from PDF files
- def extract_text_from_pdf(pdf_file):
-     try:
-         text = ""
-         pdf_document = fitz.open(pdf_file)
-
-         for page_num in range(len(pdf_document)):
-             page = pdf_document[page_num]
-             text += page.get_text()
-
-         pdf_document.close()
-         return text
-     except Exception as e:
-         return f"Error extracting text from PDF: {str(e)}"
-
- # Function to generate MCQ quiz from PDF content
- def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
-     if not openai_api_key:
-         return "Error: No API key provided."
-
-     openai.api_key = openai_api_key
-
-     # Limit content length to avoid token limits
-     limited_content = pdf_content[:8000] if len(pdf_content) > 8000 else pdf_content
-
-     prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
- For each question:
- 1. Create a clear question based on key concepts in the document
- 2. Provide 4 possible answers (A, B, C, D)
- 3. Indicate the correct answer
- 4. Briefly explain why the answer is correct
-
- Format the output clearly with each question numbered and separated.
-
- Document content:
- {limited_content}
- """
-
-     try:
-         messages = [
-             {"role": "user", "content": prompt}
-         ]
-
-         response = openai.ChatCompletion.create(
-             model=model_choice,
-             messages=messages
-         )
-
-         return response.choices[0].message.content
-     except Exception as e:
-         return f"Error generating quiz: {str(e)}"
-
- # Function to send the request to OpenAI API with an image, text or PDF input
- def generate_response(input_text, image, pdf_content, openai_api_key, reasoning_effort="medium", model_choice="o1"):
-     if not openai_api_key:
-         return "Error: No API key provided."
-
-     openai.api_key = openai_api_key
-
-     # Process the input depending on whether it's text, image, or a PDF-related query
-     if pdf_content and input_text:
-         # For PDF queries, we combine the PDF content with the user's question
-         prompt = f"Based on the following document content, please answer this question: '{input_text}'\n\nDocument content:\n{pdf_content}"
-         input_content = prompt
-     elif image:
-         # Convert the image to base64 string
-         image_info = get_base64_string_from_image(image)
-         input_content = f"data:image/png;base64,{image_info}"
-     else:
-         # Plain text input
-         input_content = input_text
-
-     # Prepare the messages for OpenAI API
-     if model_choice == "o1":
-         if image and not pdf_content:
-             messages = [
-                 {"role": "user", "content": [{"type": "image_url", "image_url": {"url": input_content}}]}
-             ]
-         else:
-             messages = [
-                 {"role": "user", "content": input_content}
-             ]
-     elif model_choice == "o3-mini":
-         messages = [
-             {"role": "user", "content": input_content}
-         ]
-
-     try:
-         # Call OpenAI API with the selected model
-         response = openai.ChatCompletion.create(
-             model=model_choice,
-             messages=messages,
-             max_completion_tokens=2000
-         )
-
-         return response.choices[0].message.content
-     except Exception as e:
-         return f"Error calling OpenAI API: {str(e)}"
-
- # Function to convert an uploaded image to a base64 string
- def get_base64_string_from_image(pil_image):
-     # Convert PIL Image to bytes
-     buffered = io.BytesIO()
-     pil_image.save(buffered, format="PNG")
-     img_bytes = buffered.getvalue()
-     base64_str = base64.b64encode(img_bytes).decode("utf-8")
-     return base64_str
-
- # Function to transcribe audio to text using OpenAI Whisper API
- def transcribe_audio(audio, openai_api_key):
-     if not openai_api_key:
-         return "Error: No API key provided."
-
-     openai.api_key = openai_api_key
-
-     try:
-         # Open the audio file and pass it as a file object
-         with open(audio, 'rb') as audio_file:
-             audio_file_content = audio_file.read()
-
-         # Use the correct transcription API call
-         audio_file_obj = io.BytesIO(audio_file_content)
-         audio_file_obj.name = 'audio.wav'  # Set a name for the file object (as OpenAI expects it)
-
-         # Transcribe the audio to text using OpenAI's whisper model
-         audio_file_transcription = openai.Audio.transcribe(file=audio_file_obj, model="whisper-1")
-         return audio_file_transcription.text
-     except Exception as e:
-         return f"Error transcribing audio: {str(e)}"
-
- # The function that will be used by Gradio interface
- def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, history):
-     if history is None:
-         history = []
-
-     # If there's audio, transcribe it to text
-     if audio:
-         input_text = transcribe_audio(audio, openai_api_key)
-
-     # If a new PDF is uploaded, extract its text
-     new_pdf_content = pdf_content
-     if pdf_file is not None:
-         new_pdf_content = extract_text_from_pdf(pdf_file)
-
-     # Check if we're in PDF quiz mode
-     if pdf_quiz_mode:
-         if new_pdf_content:
-             # Generate MCQ quiz questions
-             quiz_response = generate_mcq_quiz(new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice)
-             history.append((f"👤: [Uploaded PDF for Quiz - {int(num_quiz_questions)} questions]", f"🤖: {quiz_response}"))
-         else:
-             history.append(("👤: [Attempted to generate quiz without PDF]", "🤖: Please upload a PDF file to generate quiz questions."))
-     else:
-         # Regular chat mode - generate the response
-         response = generate_response(input_text, image, new_pdf_content, openai_api_key, reasoning_effort, model_choice)
-
-         # Append the response to the history
-         if input_text:
-             history.append((f"👤: {input_text}", f"🤖: {response}"))
-         elif image is not None:
-             history.append((f"👤: [Uploaded image]", f"🤖: {response}"))
-         elif pdf_file is not None:
-             history.append((f"👤: [Uploaded PDF]", f"🤖: {response}"))
-         else:
-             history.append((f"👤: [No input provided]", f"🤖: Please provide some input (text, image, or PDF) for me to respond to."))
-
-     return "", None, None, None, new_pdf_content, history
-
- # Function to clear the chat history and PDF content
- def clear_history():
-     return "", None, None, None, "", []
-
- # Function to process a newly uploaded PDF
- def process_pdf(pdf_file):
-     if pdf_file is None:
-         return ""
-     return extract_text_from_pdf(pdf_file)
-
- # Function to update visible components based on input type selection
- def update_input_type(choice):
-     if choice == "Text":
-         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=False)
-     elif choice == "Image":
-         return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(value=False)
-     elif choice == "Voice":
-         return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(value=False)
-     elif choice == "PDF":
-         return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(value=False)
-     elif choice == "PDF(QUIZ)":
-         return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True), gr.update(value=True)
-
- # Custom CSS styles with animations and button colors
- custom_css = """
- /* General body styles */
- .gradio-container {
-     font-family: 'Arial', sans-serif;
-     background-color: #f0f4f8; /* Lighter blue-gray background */
-     color: #2d3748;;
- }
- /* Header styles */
- .gradio-header {
-     background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
-     color: white;
-     padding: 20px;
-     text-align: center;
-     border-radius: 8px;
-     box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
-     animation: fadeIn 1s ease-out;
- }
- .gradio-header h1 {
-     font-size: 2.5rem;
- }
- .gradio-header h3 {
-     font-size: 1.2rem;
-     margin-top: 10px;
- }
- /* Chatbot container styles */
- .gradio-chatbot {
-     background-color: #fff;
-     border-radius: 10px;
-     padding: 20px;
-     box-shadow: 0 6px 18px rgba(0, 0, 0, 0.1);
-     border-left: 4px solid #4a00e0; /* Accent border */
- }
- /* Input field styles */
- .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file, .gradio-slider {
-     border-radius: 8px;
-     border: 2px solid #e2e8f0;
-     background-color: #f8fafc;
- }
- .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus, .gradio-file:focus, .gradio-slider:focus {
-     border-color: #8e2de2;
-     box-shadow: 0 0 0 3px rgba(142, 45, 226, 0.2);
- }
- /* Button styles */
- /* Send Button: Sky Blue */
- #submit-btn {
-     background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
-     color: white;
-     border: none;
-     border-radius: 8px;
-     padding: 10px 19px;
-     font-size: 1.1rem;
-     cursor: pointer;
-     transition: all 0.3s ease;
-     margin-left: auto;
-     margin-right: auto;
-     display: block;
-     margin-top: 10px;
- }
- #submit-btn:hover {
-     background: linear-gradient(135deg, #5b10f1 0%, #9f3ef3 100%); /* Slightly lighter */
-     box-shadow: 0 6px 8px rgba(74, 0, 224, 0.4);
- }
- #submit-btn:active {
-     transform: scale(0.95);
- }
- #clear-history {
-     background: linear-gradient(135deg, #e53e3e 0%, #f56565 100%); /* Red gradient */
-     color: white;
-     border: none;
-     border-radius: 8px;
-     padding: 10px 13px;
-     font-size: 1.1rem;
-     cursor: pointer;
-     transition: all 0.3s ease;
-     margin-top: 10px;
- }
- #clear-history:hover {
-     background: linear-gradient(135deg, #c53030 0%, #e53e3e 100%); /* Slightly darker red gradient on hover */
-     box-shadow: 0 6px 8px rgba(229, 62, 62, 0.4);
- }
- #clear-history:active {
-     transform: scale(0.95);
- }
- /* Input type selector buttons */
- #input-type-group {
-     display: flex;
-     justify-content: center;
-     gap: 10px;
-     margin-bottom: 20px;
- }
- .input-type-btn {
-     background-color: #718096; /* Slate gray */
-     color: white;
-     border: none;
-     border-radius: 8px;
-     padding: 10px 15px;
-     font-size: 1rem;
-     cursor: pointer;
-     transition: all 0.3s ease;
- }
- .input-type-btn.selected {
-     background-color: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
- }
- .input-type-btn:hover {
-     background-color: #4a5568; /* Darker slate */
- }
- /* Chat history styles */
- .gradio-chatbot .message {
-     margin-bottom: 10px;
- }
- .gradio-chatbot .user {
-     background-color: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
-     color: white;
-     padding: 10px;
-     border-radius: 12px;
-     max-width: 70%;
-     animation: slideInUser 0.5s ease-out;
- }
- .gradio-chatbot .assistant {
-     background-color: #f0f4f8; /* Light blue-gray */
-     color: #2d3748;
-     padding: 10px;
-     border-radius: 12px;
-     max-width: 70%;
-     margin-left: auto;
-     animation: slideInAssistant 0.5s ease-out;
- }
- /* Animation keyframes */
- @keyframes fadeIn {
-     0% { opacity: 0; }
-     100% { opacity: 1; }
- }
- @keyframes slideInUser {
-     0% { transform: translateX(-100%); }
-     100% { transform: translateX(0); }
- }
- @keyframes slideInAssistant {
-     0% { transform: translateX(100%); }
-     100% { transform: translateX(0); }
- }
- /* Mobile responsiveness */
- @media (max-width: 768px) {
-     .gradio-header h1 {
-         font-size: 1.8rem;
-     }
-     .gradio-header h3 {
-         font-size: 1rem;
-     }
-     .gradio-chatbot {
-         max-height: 400px;
-     }
-     .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file, .gradio-slider {
-         width: 100%;
-     }
-     #submit-btn, #clear-history {
-         width: 100%;
-         margin-left: 0;
-     }
- }
- """
-
- # Gradio interface setup
- def create_interface():
-     with gr.Blocks(css=custom_css) as demo:
-         gr.Markdown("""
-         <div class="gradio-header">
-             <h1>Multimodal Chatbot (Text + Image + Voice + PDF + Quiz)</h1>
-             <h3>Interact with a chatbot using text, image, voice, or PDF inputs</h3>
-         </div>
-         """)
-
-         # Add a description with an expandable accordion
-         with gr.Accordion("Click to expand for details", open=False):
-             gr.Markdown("""
-             ### Description:
-             This is a multimodal chatbot that can handle text, image, voice, PDF inputs, and generate quizzes from PDFs.
-             - You can ask questions or provide text, and the assistant will respond.
-             - You can upload an image, and the assistant will process it and answer questions about the image.
-             - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
-             - PDF support: Upload a PDF and ask questions about its content.
-             - PDF Quiz: Upload a PDF and specify how many MCQ questions you want generated based on the content.
-             - Enter your OpenAI API key to start interacting with the model.
-             - You can use the 'Clear History' button to remove the conversation history.
-             - "o1" is for image, voice, PDF and text chat and "o3-mini" is for text, PDF and voice chat only.
-             ### Reasoning Effort:
-             The reasoning effort controls how complex or detailed the assistant's answers should be.
-             - **Low**: Provides quick, concise answers with minimal reasoning or details.
-             - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
-             - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
-             """)
-
-         # Store PDF content as a state variable
-         pdf_content = gr.State("")
-
-         with gr.Row():
-             openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
-
-         # Input type selector
-         with gr.Row():
-             input_type = gr.Radio(
-                 ["Text", "Image", "Voice", "PDF", "PDF(QUIZ)"],
-                 label="Choose Input Type",
-                 value="Text"
-             )
-
-         # Create the input components (initially text is visible, others are hidden)
-         with gr.Row():
-             # Text input
-             input_text = gr.Textbox(
-                 label="Enter Text Question",
-                 placeholder="Ask a question or provide text",
-                 lines=2,
-                 visible=True
-             )
-
-             # Image input
-             image_input = gr.Image(
-                 label="Upload an Image",
-                 type="pil",
-                 visible=False
-             )
-
-             # Audio input
-             audio_input = gr.Audio(
-                 label="Upload or Record Audio",
-                 type="filepath",
-                 visible=False
-             )
-
-             # PDF input
-             pdf_input = gr.File(
-                 label="Upload your PDF",
-                 file_types=[".pdf"],
-                 visible=False
-             )
-
-             # Quiz specific components
-             quiz_questions_slider = gr.Slider(
-                 minimum=1,
-                 maximum=20,
-                 value=5,
-                 step=1,
-                 label="Number of Quiz Questions",
-                 visible=False
-             )
-
-             # Hidden state for quiz mode
-             quiz_mode = gr.Checkbox(
-                 label="Quiz Mode",
-                 visible=False,
-                 value=False
-             )
-
-         with gr.Row():
-             reasoning_effort = gr.Dropdown(
-                 label="Reasoning Effort",
-                 choices=["low", "medium", "high"],
-                 value="medium"
-             )
-             model_choice = gr.Dropdown(
-                 label="Select Model",
-                 choices=["o1", "o3-mini"],
-                 value="o1"  # Default to 'o1' for image-related tasks
-             )
-             submit_btn = gr.Button("Ask!", elem_id="submit-btn")
-             clear_btn = gr.Button("Clear History", elem_id="clear-history")
-
-         chat_history = gr.Chatbot()
-
-         # Connect the input type selector to the update function
-         input_type.change(
-             fn=update_input_type,
-             inputs=[input_type],
-             outputs=[input_text, image_input, audio_input, pdf_input, quiz_questions_slider, quiz_mode]
-         )
-
-         # Process PDF when uploaded
-         pdf_input.change(
-             fn=process_pdf,
-             inputs=[pdf_input],
-             outputs=[pdf_content]
-         )
-
-         # Button interactions
-         submit_btn.click(
-             fn=chatbot,
-             inputs=[
-                 input_text,
-                 image_input,
-                 audio_input,
-                 pdf_input,
-                 openai_api_key,
-                 reasoning_effort,
-                 model_choice,
-                 pdf_content,
-                 quiz_questions_slider,
-                 quiz_mode,
-                 chat_history
-             ],
-             outputs=[
-                 input_text,
-                 image_input,
-                 audio_input,
-                 pdf_input,
-                 pdf_content,
-                 chat_history
-             ]
-         )
-
-         clear_btn.click(
-             fn=clear_history,
-             inputs=[],
-             outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
-         )
-
-     return demo
-
- # Run the interface
- if __name__ == "__main__":
-     demo = create_interface()
      demo.launch()
 
+ import gradio as gr
+ from openai import OpenAI
+ import base64
+ from PIL import Image
+ import io
+ import os
+ import tempfile
+ import fitz  # PyMuPDF for PDF handling
+
+ # Function to extract text from PDF files
+ def extract_text_from_pdf(pdf_file):
+     try:
+         text = ""
+         pdf_document = fitz.open(pdf_file)
+
+         for page_num in range(len(pdf_document)):
+             page = pdf_document[page_num]
+             text += page.get_text()
+
+         pdf_document.close()
+         return text
+     except Exception as e:
+         return f"Error extracting text from PDF: {str(e)}"
+
+ # Function to generate MCQ quiz from PDF content
+ def generate_mcq_quiz(pdf_content, num_questions, openai_api_key, model_choice):
+     if not openai_api_key:
+         return "Error: No API key provided."
+
+     client = OpenAI(api_key=openai_api_key)
+
+     # Limit content length to avoid token limits
+     limited_content = pdf_content[:8000] if len(pdf_content) > 8000 else pdf_content
+
+     prompt = f"""Based on the following document content, generate {num_questions} multiple-choice quiz questions.
+ For each question:
+ 1. Create a clear question based on key concepts in the document
+ 2. Provide 4 possible answers (A, B, C, D)
+ 3. Indicate the correct answer
+ 4. Briefly explain why the answer is correct
+
+ Format the output clearly with each question numbered and separated.
+
+ Document content:
+ {limited_content}
+ """
+
+     try:
+         messages = [
+             {"role": "user", "content": prompt}
+         ]
+
+         # Use appropriate model based on choice
+         model_name = "gpt-4" if model_choice == "o1" else "gpt-3.5-turbo"
+
+         response = client.chat.completions.create(
+             model=model_name,
+             messages=messages,
+             max_tokens=2000
+         )
+
+         return response.choices[0].message.content
+     except Exception as e:
+         return f"Error generating quiz: {str(e)}"
+
+ # Function to send the request to OpenAI API with an image, text or PDF input
+ def generate_response(input_text, image, pdf_content, openai_api_key, reasoning_effort="medium", model_choice="o1"):
+     if not openai_api_key:
+         return "Error: No API key provided."
+
+     client = OpenAI(api_key=openai_api_key)
+
+     # Process the input depending on whether it's text, image, or a PDF-related query
+     if pdf_content and input_text:
+         # For PDF queries, we combine the PDF content with the user's question
+         prompt = f"Based on the following document content, please answer this question: '{input_text}'\n\nDocument content:\n{pdf_content}"
+         messages = [{"role": "user", "content": prompt}]
+     elif image:
+         # Convert the image to base64 string
+         image_base64 = get_base64_string_from_image(image)
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "text", "text": input_text or "Please describe this image."},
+                     {
+                         "type": "image_url",
+                         "image_url": {
+                             "url": f"data:image/png;base64,{image_base64}"
+                         }
+                     }
+                 ]
+             }
+         ]
+     else:
+         # Plain text input
+         messages = [{"role": "user", "content": input_text}]
+
+     try:
+         # Use appropriate model based on choice
+         if model_choice == "o1" and image:
+             model_name = "gpt-4-vision-preview"
+         elif model_choice == "o1":
+             model_name = "gpt-4"
+         else:
+             model_name = "gpt-3.5-turbo"
+
+         # Call OpenAI API with the selected model
+         response = client.chat.completions.create(
+             model=model_name,
+             messages=messages,
+             max_tokens=2000
+         )
+
+         return response.choices[0].message.content
+     except Exception as e:
+         return f"Error calling OpenAI API: {str(e)}"
+
+ # Function to convert an uploaded image to a base64 string
+ def get_base64_string_from_image(pil_image):
+     # Convert PIL Image to bytes
+     buffered = io.BytesIO()
+     pil_image.save(buffered, format="PNG")
+     img_bytes = buffered.getvalue()
+     base64_str = base64.b64encode(img_bytes).decode("utf-8")
+     return base64_str
+
+ # Function to transcribe audio to text using OpenAI Whisper API
+ def transcribe_audio(audio, openai_api_key):
+     if not openai_api_key:
+         return "Error: No API key provided."
+
+     client = OpenAI(api_key=openai_api_key)
+
+     try:
+         # Open the audio file and pass it as a file object
+         with open(audio, 'rb') as audio_file:
+             # Transcribe the audio to text using OpenAI's whisper model
+             transcript = client.audio.transcriptions.create(
+                 model="whisper-1",
+                 file=audio_file
+             )
+             return transcript.text
+     except Exception as e:
+         return f"Error transcribing audio: {str(e)}"
+
+ # The function that will be used by Gradio interface
+ def chatbot(input_text, image, audio, pdf_file, openai_api_key, reasoning_effort, model_choice, pdf_content, num_quiz_questions, pdf_quiz_mode, history):
+     if history is None:
+         history = []
+
+     # If there's audio, transcribe it to text
+     if audio:
+         input_text = transcribe_audio(audio, openai_api_key)
+
+     # If a new PDF is uploaded, extract its text
+     new_pdf_content = pdf_content
+     if pdf_file is not None:
+         new_pdf_content = extract_text_from_pdf(pdf_file)
+
+     # Check if we're in PDF quiz mode
+     if pdf_quiz_mode:
+         if new_pdf_content:
+             # Generate MCQ quiz questions
+             quiz_response = generate_mcq_quiz(new_pdf_content, int(num_quiz_questions), openai_api_key, model_choice)
+             history.append((f"👤: [Uploaded PDF for Quiz - {int(num_quiz_questions)} questions]", f"🤖: {quiz_response}"))
+         else:
+             history.append(("👤: [Attempted to generate quiz without PDF]", "🤖: Please upload a PDF file to generate quiz questions."))
+     else:
+         # Regular chat mode - generate the response
+         response = generate_response(input_text, image, new_pdf_content, openai_api_key, reasoning_effort, model_choice)
+
+         # Append the response to the history
+         if input_text:
+             history.append((f"👤: {input_text}", f"🤖: {response}"))
+         elif image is not None:
+             history.append((f"👤: [Uploaded image]", f"🤖: {response}"))
+         elif pdf_file is not None:
+             history.append((f"👤: [Uploaded PDF]", f"🤖: {response}"))
+         else:
+             history.append((f"👤: [No input provided]", f"🤖: Please provide some input (text, image, or PDF) for me to respond to."))
+
+     return "", None, None, None, new_pdf_content, history
+
+ # Function to clear the chat history and PDF content
+ def clear_history():
+     return "", None, None, None, "", []
+
+ # Function to process a newly uploaded PDF
+ def process_pdf(pdf_file):
+     if pdf_file is None:
+         return ""
+     return extract_text_from_pdf(pdf_file)
+
+ # Function to update visible components based on input type selection
+ def update_input_type(choice):
+     if choice == "Text":
+         return (
+             gr.update(visible=True),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             False
+         )
+     elif choice == "Image":
+         return (
+             gr.update(visible=True),
+             gr.update(visible=True),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             False
+         )
+     elif choice == "Voice":
+         return (
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=True),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             False
+         )
+     elif choice == "PDF":
+         return (
+             gr.update(visible=True),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=True),
+             gr.update(visible=False),
+             False
+         )
+     elif choice == "PDF(QUIZ)":
+         return (
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=False),
+             gr.update(visible=True),
+             gr.update(visible=True),
+             True
+         )
+
+ # Custom CSS styles with animations and button colors
+ custom_css = """
+ /* General body styles */
+ .gradio-container {
+     font-family: 'Arial', sans-serif;
+     background-color: #f0f4f8; /* Lighter blue-gray background */
+     color: #2d3748;;
+ }
+ /* Header styles */
+ .gradio-header {
+     background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
+     color: white;
+     padding: 20px;
+     text-align: center;
+     border-radius: 8px;
+     box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
+     animation: fadeIn 1s ease-out;
+ }
+ .gradio-header h1 {
+     font-size: 2.5rem;
+ }
+ .gradio-header h3 {
+     font-size: 1.2rem;
+     margin-top: 10px;
+ }
+ /* Chatbot container styles */
+ .gradio-chatbot {
+     background-color: #fff;
+     border-radius: 10px;
+     padding: 20px;
+     box-shadow: 0 6px 18px rgba(0, 0, 0, 0.1);
+     border-left: 4px solid #4a00e0; /* Accent border */
+ }
+ /* Input field styles */
+ .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file, .gradio-slider {
+     border-radius: 8px;
+     border: 2px solid #e2e8f0;
+     background-color: #f8fafc;
+ }
+ .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-image:focus, .gradio-audio:focus, .gradio-file:focus, .gradio-slider:focus {
+     border-color: #8e2de2;
+     box-shadow: 0 0 0 3px rgba(142, 45, 226, 0.2);
+ }
+ /* Button styles */
+ /* Send Button: Sky Blue */
+ #submit-btn {
+     background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
+     color: white;
+     border: none;
+     border-radius: 8px;
+     padding: 10px 19px;
+     font-size: 1.1rem;
+     cursor: pointer;
+     transition: all 0.3s ease;
+     margin-left: auto;
+     margin-right: auto;
+     display: block;
+     margin-top: 10px;
+ }
+ #submit-btn:hover {
+     background: linear-gradient(135deg, #5b10f1 0%, #9f3ef3 100%); /* Slightly lighter */
+     box-shadow: 0 6px 8px rgba(74, 0, 224, 0.4);
+ }
+ #submit-btn:active {
+     transform: scale(0.95);
+ }
+ #clear-history {
+     background: linear-gradient(135deg, #e53e3e 0%, #f56565 100%); /* Red gradient */
+     color: white;
+     border: none;
+     border-radius: 8px;
+     padding: 10px 13px;
+     font-size: 1.1rem;
+     cursor: pointer;
+     transition: all 0.3s ease;
+     margin-top: 10px;
+ }
+ #clear-history:hover {
+     background: linear-gradient(135deg, #c53030 0%, #e53e3e 100%); /* Slightly darker red gradient on hover */
+     box-shadow: 0 6px 8px rgba(229, 62, 62, 0.4);
+ }
+ #clear-history:active {
+     transform: scale(0.95);
+ }
+ /* Input type selector buttons */
+ #input-type-group {
+     display: flex;
+     justify-content: center;
+     gap: 10px;
+     margin-bottom: 20px;
+ }
+ .input-type-btn {
+     background-color: #718096; /* Slate gray */
+     color: white;
+     border: none;
+     border-radius: 8px;
+     padding: 10px 15px;
+     font-size: 1rem;
+     cursor: pointer;
+     transition: all 0.3s ease;
+ }
+ .input-type-btn.selected {
+     background-color: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
+ }
+ .input-type-btn:hover {
+     background-color: #4a5568; /* Darker slate */
+ }
+ /* Chat history styles */
+ .gradio-chatbot .message {
+     margin-bottom: 10px;
+ }
+ .gradio-chatbot .user {
+     background-color: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
+     color: white;
+     padding: 10px;
+     border-radius: 12px;
+     max-width: 70%;
+     animation: slideInUser 0.5s ease-out;
+ }
+ .gradio-chatbot .assistant {
+     background-color: #f0f4f8; /* Light blue-gray */
+     color: #2d3748;
+     padding: 10px;
+     border-radius: 12px;
+     max-width: 70%;
+     margin-left: auto;
+     animation: slideInAssistant 0.5s ease-out;
+ }
+ /* Animation keyframes */
+ @keyframes fadeIn {
+     0% { opacity: 0; }
+     100% { opacity: 1; }
+ }
+ @keyframes slideInUser {
+     0% { transform: translateX(-100%); }
+     100% { transform: translateX(0); }
+ }
+ @keyframes slideInAssistant {
+     0% { transform: translateX(100%); }
+     100% { transform: translateX(0); }
+ }
+ /* Mobile responsiveness */
+ @media (max-width: 768px) {
+     .gradio-header h1 {
+         font-size: 1.8rem;
+     }
+     .gradio-header h3 {
+         font-size: 1rem;
+     }
+     .gradio-chatbot {
+         max-height: 400px;
+     }
+     .gradio-textbox, .gradio-dropdown, .gradio-image, .gradio-audio, .gradio-file, .gradio-slider {
+         width: 100%;
+     }
+     #submit-btn, #clear-history {
+         width: 100%;
+         margin-left: 0;
+     }
+ }
+ """
+
+ # Gradio interface setup
+ def create_interface():
+     with gr.Blocks(css=custom_css) as demo:
+         gr.Markdown("""
+         <div class="gradio-header">
+             <h1>Multimodal Chatbot (Text + Image + Voice + PDF + Quiz)</h1>
+             <h3>Interact with a chatbot using text, image, voice, or PDF inputs</h3>
+         </div>
+         """)
+
+         # Add a description with an expandable accordion
+         with gr.Accordion("Click to expand for details", open=False):
+             gr.Markdown("""
+             ### Description:
+             This is a multimodal chatbot that can handle text, image, voice, PDF inputs, and generate quizzes from PDFs.
+             - You can ask questions or provide text, and the assistant will respond.
+             - You can upload an image, and the assistant will process it and answer questions about the image.
+             - Voice input is supported: You can upload or record an audio file, and it will be transcribed to text and sent to the assistant.
+             - PDF support: Upload a PDF and ask questions about its content.
+             - PDF Quiz: Upload a PDF and specify how many MCQ questions you want generated based on the content.
+             - Enter your OpenAI API key to start interacting with the model.
+             - You can use the 'Clear History' button to remove the conversation history.
+             - "o1" is for image, voice, PDF and text chat and "o3-mini" is for text, PDF and voice chat only.
+             ### Reasoning Effort:
+             The reasoning effort controls how complex or detailed the assistant's answers should be.
+             - **Low**: Provides quick, concise answers with minimal reasoning or details.
+             - **Medium**: Offers a balanced response with a reasonable level of detail and thought.
+             - **High**: Produces more detailed, analytical, or thoughtful responses, requiring deeper reasoning.
+             """)
+
+         # Store PDF content as a state variable
+         pdf_content = gr.State("")
+
+         with gr.Row():
+             openai_api_key = gr.Textbox(label="Enter OpenAI API Key", type="password", placeholder="sk-...", interactive=True)
+
+         # Input type selector
+         with gr.Row():
+             input_type = gr.Radio(
+                 ["Text", "Image", "Voice", "PDF", "PDF(QUIZ)"],
+                 label="Choose Input Type",
+                 value="Text"
+             )
+
+         # Create the input components (initially text is visible, others are hidden)
+         with gr.Row():
+             # Text input
+             input_text = gr.Textbox(
+                 label="Enter Text Question",
+                 placeholder="Ask a question or provide text",
+                 lines=2,
+                 visible=True
+             )
+
+             # Image input
+             image_input = gr.Image(
+                 label="Upload an Image",
+                 type="pil",
+                 visible=False
+             )
+
+             # Audio input
+             audio_input = gr.Audio(
+                 label="Upload or Record Audio",
+                 type="filepath",
+                 visible=False
+             )
+
+             # PDF input
+             pdf_input = gr.File(
+                 label="Upload your PDF",
+                 file_types=[".pdf"],
+                 visible=False
+             )
+
+             # Quiz specific components
+             quiz_questions_slider = gr.Slider(
+                 minimum=1,
+                 maximum=20,
+                 value=5,
+                 step=1,
+                 label="Number of Quiz Questions",
+                 visible=False
+             )
+
+             # State variable for quiz mode (not visible)
+             quiz_mode = gr.State(False)
+
+         with gr.Row():
+             reasoning_effort = gr.Dropdown(
+                 label="Reasoning Effort",
+                 choices=["low", "medium", "high"],
+                 value="medium"
+             )
+             model_choice = gr.Dropdown(
+                 label="Select Model",
+                 choices=["o1", "o3-mini"],
+                 value="o1"  # Default to 'o1' for image-related tasks
+             )
+             submit_btn = gr.Button("Ask!", elem_id="submit-btn")
+             clear_btn = gr.Button("Clear History", elem_id="clear-history")
+
+         chat_history = gr.Chatbot()
+
+         # Connect the input type selector to the update function
+         input_type.change(
+             fn=update_input_type,
+             inputs=[input_type],
+             outputs=[input_text, image_input, audio_input, pdf_input, quiz_questions_slider, quiz_mode]
+         )
+
+         # Process PDF when uploaded
+         pdf_input.change(
+             fn=process_pdf,
+             inputs=[pdf_input],
+             outputs=[pdf_content]
+         )
+
+         # Button interactions
+         submit_btn.click(
+             fn=chatbot,
+             inputs=[
+                 input_text,
+                 image_input,
+                 audio_input,
+                 pdf_input,
+                 openai_api_key,
+                 reasoning_effort,
+                 model_choice,
+                 pdf_content,
+                 quiz_questions_slider,
+                 quiz_mode,
+                 chat_history
+             ],
+             outputs=[
+                 input_text,
+                 image_input,
+                 audio_input,
+                 pdf_input,
+                 pdf_content,
+                 chat_history
+             ]
+         )
+
+         clear_btn.click(
+             fn=clear_history,
+             inputs=[],
+             outputs=[input_text, image_input, audio_input, pdf_input, pdf_content, chat_history]
+         )
+
+     return demo
+
+ # Run the interface
+ if __name__ == "__main__":
+     demo = create_interface()
      demo.launch()
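
For reference when reviewing the diff: the central change is the move from the legacy module-level `openai` interface (`openai.ChatCompletion.create`, `openai.Audio.transcribe`) to the openai-python v1 client. Below is a minimal standalone sketch of the new calling pattern used above; the API key, prompt, and audio path are placeholders for illustration, not part of the commit.

from openai import OpenAI

# Placeholder key; the app itself reads the key from a Gradio textbox at runtime.
client = OpenAI(api_key="sk-...")

# v1 chat-completion call, the pattern generate_mcq_quiz and generate_response now use.
# The commit maps the UI choice "o1" to "gpt-4" and "o3-mini" to "gpt-3.5-turbo".
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Say hello."}],  # placeholder prompt
    max_tokens=2000,
)
print(response.choices[0].message.content)

# v1 Whisper transcription, the pattern transcribe_audio now uses.
with open("audio.wav", "rb") as audio_file:  # placeholder path
    transcript = client.audio.transcriptions.create(model="whisper-1", file=audio_file)
print(transcript.text)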