Omarrran committed on
Commit
8c65552
·
verified ·
1 Parent(s): 26cbdf6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -282
app.py CHANGED
@@ -9,15 +9,15 @@ import logging
9
  from langchain.document_loaders import OnlinePDFLoader # for loading the pdf
10
  from langchain.embeddings import HuggingFaceEmbeddings # open source embedding model
11
  from langchain.text_splitter import CharacterTextSplitter
12
- from langchain.vectorstores import Chroma # for the vectorization part
13
- from langchain.chains import RetrievalQA # for conversing with ChatGPT
14
- from langchain.chat_models import ChatOpenAI # the LLM model we'll use (ChatGPT)
15
- from langchain_core.prompts import PromptTemplate # updated import per warning
16
 
17
  # Setup basic logging
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
- log_messages = "" # Global log collector
21
 
22
  def update_log(message):
23
  global log_messages
@@ -27,8 +27,8 @@ def update_log(message):
27
  def ocr_converter(input_file):
28
  image_pdf = input_file.name
29
  try:
30
- # Specify output_type="pdf" to bypass Ghostscript issues.
31
- ocrmypdf.ocr(image_pdf, image_pdf, redo_ocr=True, language="eng", output_type="pdf")
32
  update_log(f"OCR conversion successful for {image_pdf}")
33
  except Exception as e:
34
  error_msg = f"OCR conversion failed for {image_pdf}. Error: {str(e)}"
@@ -40,50 +40,31 @@ def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
40
  try:
41
  if open_ai_key is not None:
42
  os.environ['OPENAI_API_KEY'] = open_ai_key
43
- # Perform OCR conversion; errors here will be logged.
44
  pdf_doc = ocr_converter(pdf_doc)
45
- # Load the PDF file
46
  loader = OnlinePDFLoader(pdf_doc)
47
  pages = loader.load_and_split()
48
  update_log(f"Loaded {len(pages)} pages from {pdf_doc}")
49
 
50
- # Use HuggingFaceEmbeddings (open source) for generating embeddings.
51
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
52
  pages_to_be_loaded = []
53
-
54
  if relevant_pages:
55
- page_numbers = relevant_pages.split(",")
56
- for page_number in page_numbers:
57
- if page_number.strip().isdigit():
58
- pageIndex = int(page_number.strip()) - 1
59
  if 0 <= pageIndex < len(pages):
60
  pages_to_be_loaded.append(pages[pageIndex])
61
-
62
  if not pages_to_be_loaded:
63
  pages_to_be_loaded = pages.copy()
64
  update_log("No specific pages selected; using entire PDF.")
65
 
66
- # Create a vector store using Chroma with the embeddings.
67
  vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)
68
 
69
- # Configure the prompt template for the QA chain.
70
  prompt_template = (
71
- """Use the following pieces of context to answer the question at the end. If you do not know the answer, just return N/A.
72
- If you encounter a date, return it in mm/dd/yyyy format. If there is a Preface section in the document, extract the chapter# and the short description from the Preface.
73
- Chapter numbers are listed to the left in Preface and always start with an alphabet, for example A1-1.
74
  {context}
75
  Question: {question}
76
- Return the answer. Provide the answer in the JSON format and extract the key from the question. Where applicable, break the answer into bullet points.
77
- When the sentences are long, try and break them into sub sections and include all the information and do not skip any information.
78
- If there is an exception to the answer, please do include it in a 'Note:' section. If there are no exceptions to the answer, please skip the 'Note:' section.
79
- Include a 'For additional details refer to' section when the document has more information to offer on the topic being questioned.
80
- If the document has a Preface or 'Table of Contents' section, extract the chapter# and a short description and include the info under the 'For additional details refer to' section.
81
- List only the chapters that contain information or skip this section altogether. Do not use page numbers as chapter numbers as they are different.
82
- If additional information is found in multiple pages within the same chapter, list the chapter only once.
83
- If chapter information cannot be extracted, include any other information that will help the user navigate to the relevant sections of the document.
84
- If the document does not contain a Preface or 'Table of Contents' section, please do not call that out."""
85
  )
86
-
87
  PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
88
  chain_type_kwargs = {"prompt": PROMPT}
89
 
@@ -124,120 +105,70 @@ def create_sqlite_table(connection):
124
  def load_master_questionset_into_sqlite(connection):
125
  create_sqlite_table(connection)
126
  cursor = connection.cursor()
127
- masterlist_for_DOT_count = cursor.execute(
128
  "SELECT COUNT(document_type) FROM questions WHERE document_type=? AND questionset_tag=?",
129
- ("DOT", "masterlist",)
130
  ).fetchone()[0]
131
- if masterlist_for_DOT_count == 0:
132
- update_log("Loading DOT masterlist into DB.")
133
- fieldListForDOT, queryListForDOT = create_field_and_question_list_for_DOT()
134
- fieldListForTransmittalSummary, queryListForTransmittalSummary = create_field_and_question_list_for_Transmittal_Summary()
135
- for i in range(len(queryListForDOT)):
136
  cursor.execute(
137
  "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
138
- ["DOT", "masterlist", fieldListForDOT[i], queryListForDOT[i]]
139
  )
140
- for i in range(len(queryListForTransmittalSummary)):
 
141
  cursor.execute(
142
  "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
143
- ["Transmittal Summary", "masterlist", fieldListForTransmittalSummary[i], queryListForTransmittalSummary[i]]
144
  )
145
  connection.commit()
146
  total_questions = cursor.execute("SELECT COUNT(document_type) FROM questions").fetchone()[0]
147
  update_log(f"Total questions in DB: {total_questions}")
148
 
149
- def create_field_and_question_list_for_DOT():
150
- queryList = [
151
- "what is the Loan Number?",
152
- "Who is the Borrower?",
153
- "what is the Case Number?",
154
- "what is the Mortgage Identification number?",
155
- "DOT signed date?",
156
- "Who is the Lender?",
157
- "what is the VA/FHA Number?",
158
- "Who is the Co-Borrower?",
159
- "What is the property type - single family, multi family?",
160
- "what is the Property Address?",
161
- "In what County is the property located?",
162
- "what is the Electronically recorded date"
163
- ]
164
- fieldList = [
165
- "Loan Number",
166
- "Borrower",
167
- "Case Number",
168
- "MIN Number",
169
- "Signed Date",
170
- "Lender",
171
- "VA/FHA Number",
172
- "Co-Borrower",
173
- "Property Type",
174
- "Property Address",
175
- "Property County",
176
- "Electronic Recording Date"
177
- ]
178
- return fieldList, queryList
179
 
180
- def create_field_and_question_list_for_Transmittal_Summary():
181
- queryList = [
182
- "Who is the Borrower?",
183
- "what is the Property Address?",
184
- "what is the Loan Term?",
185
- "What is the Base Income?",
186
- "what is the Borrower's SSN?",
187
- "Who is the Co-Borrower?",
188
- "What is the Original Loan Amount?",
189
- "What is the Initial P&I payment?",
190
- "What is the Co-Borrower's SSN?",
191
- "Number of units?",
192
- "Who is the Seller?",
193
- "Document signed date?"
194
- ]
195
- fieldList = [
196
- "Borrower",
197
- "Property Address",
198
- "Loan Term",
199
- "Base Income",
200
- "Borrower's SSN",
201
- "Co-Borrower",
202
- "Original Loan Amount",
203
- "Initial P&I payment",
204
- "Co-Borrower’s SSN",
205
- "Units#",
206
- "Seller",
207
- "Signed Date"
208
- ]
209
- return fieldList, queryList
210
 
211
  def retrieve_document_type_and_questionsettag_from_sqlite():
212
  connection = create_db_connection()
213
  load_master_questionset_into_sqlite(connection)
214
  cursor = connection.cursor()
215
  rows = cursor.execute("SELECT document_type, questionset_tag FROM questions ORDER BY document_type, UPPER(questionset_tag)").fetchall()
216
- list_for_dropdown = []
217
- for i in rows:
218
- concatenated_value = f"{i[0]}:{i[1]}"
219
- if concatenated_value not in list_for_dropdown:
220
- list_for_dropdown.append(concatenated_value)
221
- update_log(f"Found question set: {concatenated_value}")
222
  connection.close()
223
- return gr.Dropdown.update(choices=list_for_dropdown, value=list_for_dropdown[0])
224
 
225
  def retrieve_fields_and_questions(dropdownoption):
226
  splitwords = dropdownoption.split(":")
227
  connection = create_db_connection()
228
  cursor = connection.cursor()
229
- fields_and_questions = cursor.execute(
230
  "SELECT document_type, field, question FROM questions WHERE document_type=? AND questionset_tag=?",
231
  (splitwords[0], splitwords[1],)
232
  ).fetchall()
233
  connection.close()
234
- return pd.DataFrame(fields_and_questions, columns=["documentType", "field", "question"])
235
 
236
  def add_questionset(data, document_type, tag_for_questionset):
237
  connection = create_db_connection()
238
  create_sqlite_table(connection)
239
  cursor = connection.cursor()
240
- for index, row in data.iterrows():
241
  cursor.execute(
242
  "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
243
  [document_type, tag_for_questionset, row['field'], row['question']]
@@ -249,12 +180,12 @@ def load_csv_and_store_questionset_into_sqlite(csv_file, document_type, tag_for_
249
  if tag_for_questionset and document_type:
250
  data = pd.read_csv(csv_file.name)
251
  add_questionset(data, document_type, tag_for_questionset)
252
- responseString = f"Task Complete. Uploaded {data.shape[0]} fields and corresponding questions for {document_type}:{tag_for_questionset}"
253
- update_log(responseString)
254
- return responseString
255
  else:
256
- return "Please select the Document Type and provide a name for the Question Set"
257
-
258
  def answer_predefined_questions(document_type_and_questionset):
259
  splitwords = document_type_and_questionset.split(":")
260
  document_type = splitwords[0]
@@ -264,92 +195,95 @@ def answer_predefined_questions(document_type_and_questionset):
264
  cursor = connection.cursor()
265
  rows = cursor.execute(
266
  "SELECT field, question FROM questions WHERE document_type=? AND questionset_tag=?",
267
- (document_type, question_set,)
268
  ).fetchall()
269
  connection.close()
270
- for entry in rows:
271
- fields.append(entry[0])
272
- questions.append(entry[1])
273
- # Call pdf_qa.run only if pdf_qa is defined
274
  try:
275
- responses.append(pdf_qa.run(entry[1]))
276
  except Exception as e:
277
- error_str = f"Error in pdf_qa.run for question '{entry[1]}': {str(e)}"
278
- update_log(error_str)
279
- responses.append(error_str)
280
- return pd.DataFrame({"Field": fields, "Question to gpt-4": questions, "Response from gpt-4": responses})
281
 
282
  def summarize_contents():
283
- question = "Generate a short summary of the contents along with no more than 3 leading/example questions. Do not return the response in json format"
 
 
284
  try:
285
  response = pdf_qa.run(question)
286
  update_log("Summarization successful.")
287
  return response
288
  except Exception as e:
289
- error_str = f"Error in summarization: {str(e)}"
290
- update_log(error_str)
291
- return error_str
292
 
293
  def answer_query(query):
 
 
294
  try:
295
  response = pdf_qa.run(query)
296
  update_log(f"Query answered: {query}")
297
  return response
298
  except Exception as e:
299
- error_str = f"Error in answering query: {str(e)}"
300
- update_log(error_str)
301
- return error_str
302
 
303
  def get_log():
304
  return log_messages
305
 
306
- # Define CSS and title HTML
307
  css = """
308
- #col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
309
  """
310
 
311
  title = """
312
- <div style="text-align: center; max-width: 700px;">
313
  <h1>AskMoli - Chatbot for PDFs</h1>
314
- <p>Upload a .PDF and click "Upload PDF and generate embeddings". Wait for the status to show "Ready". Then either choose a pre-defined question set or ask your own question. The app uses GPT-4 with a custom prompt template.</p>
315
  </div>
316
  """
317
 
318
  # Build the Gradio interface
319
  with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
320
- with gr.Column(elem_id="col-container"):
321
  gr.HTML(title)
322
 
323
  with gr.Tab("Chatbot"):
324
  with gr.Column():
325
- open_ai_key = gr.Textbox(label="Your GPT-4 OpenAI API key", type="password")
326
  pdf_doc = gr.File(label="Load a PDF", file_types=['.pdf'], type='filepath')
327
- relevant_pages = gr.Textbox(label="*Optional - Comma separated page numbers (leave blank for entire PDF)")
328
 
329
  with gr.Row():
330
  status = gr.Textbox(label="Status", interactive=False)
331
- load_pdf_btn = gr.Button("Upload PDF and generate embeddings")
332
 
333
  with gr.Row():
334
  summary = gr.Textbox(label="Summary")
335
  summarize_pdf_btn = gr.Button("Summarize Contents")
336
 
337
  with gr.Row():
338
- input_query = gr.Textbox(label="Type your question")
339
  output_answer = gr.Textbox(label="Answer")
340
- submit_query_btn = gr.Button("Submit your question")
341
 
342
  with gr.Row():
343
  questionsets = gr.Dropdown(label="Pre-defined Question Sets", choices=[])
344
- load_questionsets_btn = gr.Button("Retrieve Question Sets")
345
- fields_and_questions = gr.Dataframe(label="Fields & Questions in the chosen set")
346
- load_fields_btn = gr.Button("Retrieve Questions for chosen set")
347
-
348
  with gr.Row():
349
- answers_df = gr.Dataframe(label="Answers to Pre-defined Question Set")
350
- answer_predefined_btn = gr.Button("Get answers for chosen question set")
351
 
352
- # Log window for error and info messages
353
  log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
354
 
355
  with gr.Tab("OCR Converter"):
@@ -361,138 +295,17 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
361
 
362
  with gr.Tab("Upload Question Set"):
363
  with gr.Column():
364
- document_types = [
365
- "Mortgage 1040 US Individual Tax Returns 8453 Elec Form",
366
- "Mortgage 1098",
367
- "Mortgage 1099",
368
- "Mortgage Abstract",
369
- "Mortgage ACH Authorization Form",
370
- "Mortgage Advance Fee Agreement",
371
- "Mortgage Affidavit",
372
- "Mortgage Affidavit of Suspense Funds",
373
- "Mortgage Agreement Documents",
374
- "Mortgage Sales Contract",
375
- "Mortgage Loan Estimate",
376
- "Mortgage Alimony Or Child Support",
377
- "Mortgage Amended Proof Of Claim",
378
- "Mortgage Amortization Schedule",
379
- "Mortgage Flood Insurance",
380
- "Mortgage Appraisal Report",
381
- "Mortgage Appraisal Disclosure",
382
- "Mortgage ARM Letter",
383
- "Mortgage Arms Length Affidavit",
384
- "Mortgage Assignment-Recorded",
385
- "Mortgage Assignment-Unrecorded",
386
- "Mortgage Assignment of Rent or Lease",
387
- "Mortgage Automated Value Model",
388
- "Mortgage Award Letters",
389
- "Mortgage Bailee Letter",
390
- "Mortgage Balloon Disclosure",
391
- "Mortgage Bank Statement",
392
- "Mortgage Bankruptcy Documents",
393
- "Mortgage Bill of Sale",
394
- "Mortgage Billing Statement",
395
- "Mortgage Birth-Marriage-Death Certificate",
396
- "Mortgage Borrower Certification Authorization",
397
- "Mortgage Borrower Response Package",
398
- "Mortgage Brokers Price Opinion",
399
- "Mortgage Business Plan",
400
- "Mortgage Buydown Agreement",
401
- "Mortgage Bylaws Covenants Conditions Restrictions",
402
- "Mortgage Cash for Keys",
403
- "Mortgage Certificate of Redemption",
404
- "Mortgage Certificate of Sale",
405
- "Mortgage Certificate of Title",
406
- "Mortgage Certification of Amount Due Payoff Reinstatement",
407
- "Mortgage Checks-Regular or Cashiers",
408
- "Mortgage Closing Disclosure",
409
- "Mortgage Closing Protection Letter",
410
- "Mortgage Closing Other",
411
- "Mortgage Code Violations",
412
- "Mortgage Request for Release",
413
- "Mortgage Certificate of Liability Insurance",
414
- "Mortgage Commitment Letter",
415
- "Mortgage Complaint",
416
- "Mortgage Complaint Answer Counter Claim",
417
- "Mortgage Conditional Approval Letter",
418
- "Mortgage Conditional Commitment",
419
- "Mortgage Consent Order",
420
- "Mortgage Consolidated Mortgage CEMA",
421
- "Mortgage Conveyance Claims",
422
- "Mortgage Correction and Revision Agreement",
423
- "Mortgage Correspondence",
424
- "Mortgage Court Order Settlement Divorce Decree",
425
- "Mortgage Credit Report",
426
- "Mortgage Customer Signature Authorization",
427
- "Mortgage Debt Validation",
428
- "Mortgage Deed",
429
- "Mortgage Default Notices",
430
- "Mortgage Direct Debit Authorization Form",
431
- "Mortgage Disclosure Documents",
432
- "Mortgage Document Checklist",
433
- "Mortgage Document Correction and Fee Due Agreement",
434
- "Mortgage Dodd Frank Certification",
435
- "Mortgage Drivers License",
436
- "Mortgage Request for VOE",
437
- "Mortgage Environmental Indemnity Agreement",
438
- "Mortgage Equal Credit Opportunity Act Notice",
439
- "Mortgage Escrow Agreement",
440
- "Mortgage Escrow Analysis Trial Balance Worksheet",
441
- "Mortgage Instructions to Escrow Agent",
442
- "Mortgage Escrow Letters",
443
- "Mortgage Executed Deeds",
444
- "Mortgage Fair Lending Notice",
445
- "Mortgage Foreclosure Complaint",
446
- "Mortgage Foreclosure Judgement",
447
- "Mortgage Foreclosure Sale",
448
- "Mortgage FHA Neighborhood Watch",
449
- "Mortgage Truth-In-Lending Disclosure Statement",
450
- "Mortgage Financial Form",
451
- "Mortgage Financing Agreement",
452
- "Mortgage First Payment Letter",
453
- "Mortgage Forced Place Insurance Documents",
454
- "Mortgage Foreclosure Documents",
455
- "Mortgage Good Faith Estimate",
456
- "Mortgage Guaranty",
457
- "Mortgage HAMP Certifications",
458
- "Mortgage HOA-Condo Covenants and Dues",
459
- "Mortgage Exemption Hold Harmless Letter",
460
- "Mortgage Home Equity Signature Verification Card",
461
- "Mortgage Home Inspection",
462
- "Mortgage Property Liability Insurance",
463
- "Mortgage Homeowners Insurance Notice",
464
- "Mortgage HUD-1 Settlement Statement",
465
- "Mortgage Income Other",
466
- "Mortgage Indemnity Agreement",
467
- "Mortgage Informed Consumer Choice Disclosure Notice",
468
- "Mortgage Initial Escrow Account Disclosure Statement",
469
- "Mortgage Invoices",
470
- "Mortgage Land Lease or Land Trust",
471
- "Mortgage Land Title Adjustment",
472
- "Mortgage Last Will and Testament",
473
- "Mortgage Legal Description",
474
- "Mortgage Letters Of Administration",
475
- "Mortgage Letters of Testamentary",
476
- "Mortgage Listing Agreement",
477
- "Mortgage Litigation Guarantee",
478
- "Mortgage DIL Closing",
479
- "Mortgage Hardship Letter",
480
- "Mortgage Hardship Affidavit",
481
- "Mortgage Home Affordable Modification Agreement",
482
- "Mortgage Profit And Loss",
483
- "Mortgage Earnest Money Promissory Note",
484
- "Mortgage Rental Agreement",
485
- "Mortgage Repayment Plan",
486
- "Mortgage Short Sale Miscellaneous"
487
- ]
488
- document_type_for_questionset = gr.Dropdown(choices=document_types, label="Select Document Type")
489
- tag_for_questionset = gr.Textbox(label="Name for Question Set (e.g., rwikd-dot-basic-questionset-20230707)")
490
- csv_file = gr.File(label="Load CSV (2 columns: field, question)", file_types=['.csv'], type='filepath')
491
-
492
  with gr.Row():
493
  status_for_csv = gr.Textbox(label="Status", interactive=False)
494
  load_csv_btn = gr.Button("Upload CSV into DB")
495
 
 
 
 
496
  # Set up button actions
497
  load_pdf_btn.click(load_pdf_and_generate_embeddings, inputs=[pdf_doc, open_ai_key, relevant_pages], outputs=status)
498
  summarize_pdf_btn.click(summarize_contents, outputs=summary)
@@ -504,10 +317,6 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
504
 
505
  convert_btn.click(ocr_converter, inputs=image_pdf, outputs=ocr_pdf)
506
  load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
507
-
508
- # Button to refresh the log window
509
- refresh_log_btn = gr.Button("Refresh Log")
510
- refresh_log_btn.click(get_log, outputs=log_window)
511
 
512
  # Launch the Gradio app
513
  demo.launch(debug=True)
 
9
  from langchain.document_loaders import OnlinePDFLoader # for loading the pdf
10
  from langchain.embeddings import HuggingFaceEmbeddings # open source embedding model
11
  from langchain.text_splitter import CharacterTextSplitter
12
+ from langchain.vectorstores import Chroma # for vectorization
13
+ from langchain.chains import RetrievalQA # for QA chain
14
+ from langchain.chat_models import ChatOpenAI # ChatGPT model
15
+ from langchain_core.prompts import PromptTemplate # prompt template import
16
 
17
  # Setup basic logging
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
+ log_messages = "" # global log collector
21
 
22
  def update_log(message):
23
  global log_messages
 
27
  def ocr_converter(input_file):
28
  image_pdf = input_file.name
29
  try:
30
+ # Use force_ocr=True and output_type="pdf" to work around Ghostscript issues.
31
+ ocrmypdf.ocr(image_pdf, image_pdf, redo_ocr=True, force_ocr=True, language="eng", output_type="pdf")
32
  update_log(f"OCR conversion successful for {image_pdf}")
33
  except Exception as e:
34
  error_msg = f"OCR conversion failed for {image_pdf}. Error: {str(e)}"
 
40
  try:
41
  if open_ai_key is not None:
42
  os.environ['OPENAI_API_KEY'] = open_ai_key
 
43
  pdf_doc = ocr_converter(pdf_doc)
 
44
  loader = OnlinePDFLoader(pdf_doc)
45
  pages = loader.load_and_split()
46
  update_log(f"Loaded {len(pages)} pages from {pdf_doc}")
47
 
 
48
  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
49
  pages_to_be_loaded = []
 
50
  if relevant_pages:
51
+ for page in relevant_pages.split(","):
52
+ if page.strip().isdigit():
53
+ pageIndex = int(page.strip()) - 1
 
54
  if 0 <= pageIndex < len(pages):
55
  pages_to_be_loaded.append(pages[pageIndex])
 
56
  if not pages_to_be_loaded:
57
  pages_to_be_loaded = pages.copy()
58
  update_log("No specific pages selected; using entire PDF.")
59
 
 
60
  vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)
61
 
 
62
  prompt_template = (
63
+ """Use the following context to answer the question. If you do not know the answer, return N/A.
 
 
64
  {context}
65
  Question: {question}
66
+ Return the answer in JSON format."""
 
 
 
 
 
 
 
 
67
  )
 
68
  PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
69
  chain_type_kwargs = {"prompt": PROMPT}
70
 
 
105
  def load_master_questionset_into_sqlite(connection):
106
  create_sqlite_table(connection)
107
  cursor = connection.cursor()
108
+ masterlist_count = cursor.execute(
109
  "SELECT COUNT(document_type) FROM questions WHERE document_type=? AND questionset_tag=?",
110
+ ("DOC_A", "masterlist",)
111
  ).fetchone()[0]
112
+ if masterlist_count == 0:
113
+ update_log("Loading masterlist into DB.")
114
+ fields, queries = create_field_and_question_list_for_DOC_A()
115
+ for i in range(len(queries)):
 
116
  cursor.execute(
117
  "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
118
+ ["DOC_A", "masterlist", fields[i], queries[i]]
119
  )
120
+ fields2, queries2 = create_field_and_question_list_for_DOC_B()
121
+ for i in range(len(queries2)):
122
  cursor.execute(
123
  "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
124
+ ["DOC_B", "masterlist", fields2[i], queries2[i]]
125
  )
126
  connection.commit()
127
  total_questions = cursor.execute("SELECT COUNT(document_type) FROM questions").fetchone()[0]
128
  update_log(f"Total questions in DB: {total_questions}")
129
 
130
+ def create_field_and_question_list_for_DOC_A():
131
+ # Only two sample entries
132
+ fields = ["Loan Number", "Borrower"]
133
+ queries = ["What is the Loan Number?", "Who is the Borrower?"]
134
+ return fields, queries
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
+ def create_field_and_question_list_for_DOC_B():
137
+ # Only two sample entries
138
+ fields = ["Property Address", "Signed Date"]
139
+ queries = ["What is the Property Address?", "What is the Signed Date?"]
140
+ return fields, queries
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  def retrieve_document_type_and_questionsettag_from_sqlite():
143
  connection = create_db_connection()
144
  load_master_questionset_into_sqlite(connection)
145
  cursor = connection.cursor()
146
  rows = cursor.execute("SELECT document_type, questionset_tag FROM questions ORDER BY document_type, UPPER(questionset_tag)").fetchall()
147
+ choices = []
148
+ for row in rows:
149
+ value = f"{row[0]}:{row[1]}"
150
+ if value not in choices:
151
+ choices.append(value)
152
+ update_log(f"Found question set: {value}")
153
  connection.close()
154
+ return gr.Dropdown.update(choices=choices, value=choices[0] if choices else "")
155
 
156
  def retrieve_fields_and_questions(dropdownoption):
157
  splitwords = dropdownoption.split(":")
158
  connection = create_db_connection()
159
  cursor = connection.cursor()
160
+ rows = cursor.execute(
161
  "SELECT document_type, field, question FROM questions WHERE document_type=? AND questionset_tag=?",
162
  (splitwords[0], splitwords[1],)
163
  ).fetchall()
164
  connection.close()
165
+ return pd.DataFrame(rows, columns=["documentType", "field", "question"])
166
 
167
  def add_questionset(data, document_type, tag_for_questionset):
168
  connection = create_db_connection()
169
  create_sqlite_table(connection)
170
  cursor = connection.cursor()
171
+ for _, row in data.iterrows():
172
  cursor.execute(
173
  "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
174
  [document_type, tag_for_questionset, row['field'], row['question']]
 
180
  if tag_for_questionset and document_type:
181
  data = pd.read_csv(csv_file.name)
182
  add_questionset(data, document_type, tag_for_questionset)
183
+ response = f"Uploaded {data.shape[0]} fields and questions for {document_type}:{tag_for_questionset}"
184
+ update_log(response)
185
+ return response
186
  else:
187
+ return "Please select a Document Type and provide a name for the Question Set"
188
+
189
  def answer_predefined_questions(document_type_and_questionset):
190
  splitwords = document_type_and_questionset.split(":")
191
  document_type = splitwords[0]
 
195
  cursor = connection.cursor()
196
  rows = cursor.execute(
197
  "SELECT field, question FROM questions WHERE document_type=? AND questionset_tag=?",
198
+ (document_type, question_set)
199
  ).fetchall()
200
  connection.close()
201
+ for field, question in rows:
202
+ fields.append(field)
203
+ questions.append(question)
 
204
  try:
205
+ responses.append(pdf_qa.run(question))
206
  except Exception as e:
207
+ err = f"Error: {str(e)}"
208
+ update_log(err)
209
+ responses.append(err)
210
+ return pd.DataFrame({"Field": fields, "Question": questions, "Response": responses})
211
 
212
  def summarize_contents():
213
+ question = "Generate a short summary of the contents along with up to 3 example questions."
214
+ if 'pdf_qa' not in globals():
215
+ return "Error: PDF embeddings not generated. Load a PDF first."
216
  try:
217
  response = pdf_qa.run(question)
218
  update_log("Summarization successful.")
219
  return response
220
  except Exception as e:
221
+ err = f"Error in summarization: {str(e)}"
222
+ update_log(err)
223
+ return err
224
 
225
  def answer_query(query):
226
+ if 'pdf_qa' not in globals():
227
+ return "Error: PDF embeddings not generated. Load a PDF first."
228
  try:
229
  response = pdf_qa.run(query)
230
  update_log(f"Query answered: {query}")
231
  return response
232
  except Exception as e:
233
+ err = f"Error in answering query: {str(e)}"
234
+ update_log(err)
235
+ return err
236
 
237
  def get_log():
238
  return log_messages
239
 
240
+ # Define simple CSS and title HTML
241
  css = """
242
+ #col-container {max-width: 700px; margin: auto;}
243
  """
244
 
245
  title = """
246
+ <div style="text-align: center;">
247
  <h1>AskMoli - Chatbot for PDFs</h1>
248
+ <p>Upload a PDF and generate embeddings. Then ask questions or use a predefined set.</p>
249
  </div>
250
  """
251
 
252
  # Build the Gradio interface
253
  with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
254
+ with gr.Column(id="col-container"):
255
  gr.HTML(title)
256
 
257
  with gr.Tab("Chatbot"):
258
  with gr.Column():
259
+ open_ai_key = gr.Textbox(label="Your GPT-4 API Key", type="password")
260
  pdf_doc = gr.File(label="Load a PDF", file_types=['.pdf'], type='filepath')
261
+ relevant_pages = gr.Textbox(label="Optional: Comma separated page numbers")
262
 
263
  with gr.Row():
264
  status = gr.Textbox(label="Status", interactive=False)
265
+ load_pdf_btn = gr.Button("Upload PDF & Generate Embeddings")
266
 
267
  with gr.Row():
268
  summary = gr.Textbox(label="Summary")
269
  summarize_pdf_btn = gr.Button("Summarize Contents")
270
 
271
  with gr.Row():
272
+ input_query = gr.Textbox(label="Your Question")
273
  output_answer = gr.Textbox(label="Answer")
274
+ submit_query_btn = gr.Button("Submit Question")
275
 
276
  with gr.Row():
277
  questionsets = gr.Dropdown(label="Pre-defined Question Sets", choices=[])
278
+ load_questionsets_btn = gr.Button("Retrieve Sets")
279
+ fields_and_questions = gr.Dataframe(label="Fields & Questions")
280
+ load_fields_btn = gr.Button("Retrieve Questions")
281
+
282
  with gr.Row():
283
+ answers_df = gr.Dataframe(label="Pre-defined Answers")
284
+ answer_predefined_btn = gr.Button("Get Answers")
285
 
286
+ # Log window to display errors and info
287
  log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
288
 
289
  with gr.Tab("OCR Converter"):
 
295
 
296
  with gr.Tab("Upload Question Set"):
297
  with gr.Column():
298
+ # Now only two document types are available
299
+ document_type_for_questionset = gr.Dropdown(choices=["DOC_A", "DOC_B"], label="Select Document Type")
300
+ tag_for_questionset = gr.Textbox(label="Name for Question Set (e.g., basic-set)")
301
+ csv_file = gr.File(label="Load CSV (fields,question)", file_types=['.csv'], type='filepath')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
302
  with gr.Row():
303
  status_for_csv = gr.Textbox(label="Status", interactive=False)
304
  load_csv_btn = gr.Button("Upload CSV into DB")
305
 
306
+ refresh_log_btn = gr.Button("Refresh Log")
307
+ refresh_log_btn.click(get_log, outputs=log_window)
308
+
309
  # Set up button actions
310
  load_pdf_btn.click(load_pdf_and_generate_embeddings, inputs=[pdf_doc, open_ai_key, relevant_pages], outputs=status)
311
  summarize_pdf_btn.click(summarize_contents, outputs=summary)
 
317
 
318
  convert_btn.click(ocr_converter, inputs=image_pdf, outputs=ocr_pdf)
319
  load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
 
 
 
 
320
 
321
  # Launch the Gradio app
322
  demo.launch(debug=True)