Spaces:

Omarrran
/

ChromaDB_HNM

Sleeping

App Files Files Community

Omarrran committed on Mar 21

Commit

8c65552

verified ·

1 Parent(s): 26cbdf6

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -282

app.py CHANGED Viewed

@@ -9,15 +9,15 @@ import logging
 from langchain.document_loaders import OnlinePDFLoader  # for loading the pdf
 from langchain.embeddings import HuggingFaceEmbeddings  # open source embedding model
 from langchain.text_splitter import CharacterTextSplitter
-from langchain.vectorstores import Chroma  # for the vectorization part
-from langchain.chains import RetrievalQA  # for conversing with ChatGPT
-from langchain.chat_models import ChatOpenAI  # the LLM model we'll use (ChatGPT)
-from langchain_core.prompts import PromptTemplate  # updated import per warning
 # Setup basic logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-log_messages = ""  # Global log collector
 def update_log(message):
     global log_messages
@@ -27,8 +27,8 @@ def update_log(message):
 def ocr_converter(input_file):
     image_pdf = input_file.name
     try:
-        # Specify output_type="pdf" to bypass Ghostscript issues.
-        ocrmypdf.ocr(image_pdf, image_pdf, redo_ocr=True, language="eng", output_type="pdf")
         update_log(f"OCR conversion successful for {image_pdf}")
     except Exception as e:
         error_msg = f"OCR conversion failed for {image_pdf}. Error: {str(e)}"
@@ -40,50 +40,31 @@ def load_pdf_and_generate_embeddings(pdf_doc, open_ai_key, relevant_pages):
     try:
         if open_ai_key is not None:
             os.environ['OPENAI_API_KEY'] = open_ai_key
-        # Perform OCR conversion; errors here will be logged.
         pdf_doc = ocr_converter(pdf_doc)
-        # Load the PDF file
         loader = OnlinePDFLoader(pdf_doc)
         pages = loader.load_and_split()
         update_log(f"Loaded {len(pages)} pages from {pdf_doc}")
-        # Use HuggingFaceEmbeddings (open source) for generating embeddings.
         embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
         pages_to_be_loaded = []
         if relevant_pages:
-            page_numbers = relevant_pages.split(",")
-            for page_number in page_numbers:
-                if page_number.strip().isdigit():
-                    pageIndex = int(page_number.strip()) - 1
                     if 0 <= pageIndex < len(pages):
                         pages_to_be_loaded.append(pages[pageIndex])
         if not pages_to_be_loaded:
             pages_to_be_loaded = pages.copy()
             update_log("No specific pages selected; using entire PDF.")
-        # Create a vector store using Chroma with the embeddings.
         vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)
-        # Configure the prompt template for the QA chain.
         prompt_template = (
-            """Use the following pieces of context to answer the question at the end. If you do not know the answer, just return N/A.
-            If you encounter a date, return it in mm/dd/yyyy format. If there is a Preface section in the document, extract the chapter# and the short description from the Preface.
-            Chapter numbers are listed to the left in Preface and always start with an alphabet, for example A1-1.
             {context}
             Question: {question}
-            Return the answer. Provide the answer in the JSON format and extract the key from the question. Where applicable, break the answer into bullet points.
-            When the sentences are long, try and break them into sub sections and include all the information and do not skip any information.
-            If there is an exception to the answer, please do include it in a 'Note:' section. If there are no exceptions to the answer, please skip the 'Note:' section.
-            Include a 'For additional details refer to' section when the document has more information to offer on the topic being questioned.
-            If the document has a Preface or 'Table of Contents' section, extract the chapter# and a short description and include the info under the 'For additional details refer to' section.
-            List only the chapters that contain information or skip this section altogether. Do not use page numbers as chapter numbers as they are different.
-            If additional information is found in multiple pages within the same chapter, list the chapter only once.
-            If chapter information cannot be extracted, include any other information that will help the user navigate to the relevant sections of the document.
-            If the document does not contain a Preface or 'Table of Contents' section, please do not call that out."""
         )
         PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
         chain_type_kwargs = {"prompt": PROMPT}
@@ -124,120 +105,70 @@ def create_sqlite_table(connection):
 def load_master_questionset_into_sqlite(connection):
     create_sqlite_table(connection)
     cursor = connection.cursor()
-    masterlist_for_DOT_count = cursor.execute(
         "SELECT COUNT(document_type) FROM questions WHERE document_type=? AND questionset_tag=?",
-        ("DOT", "masterlist",)
     ).fetchone()[0]
-    if masterlist_for_DOT_count == 0:
-        update_log("Loading DOT masterlist into DB.")
-        fieldListForDOT, queryListForDOT = create_field_and_question_list_for_DOT()
-        fieldListForTransmittalSummary, queryListForTransmittalSummary = create_field_and_question_list_for_Transmittal_Summary()
-        for i in range(len(queryListForDOT)):
             cursor.execute(
                 "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
-                ["DOT", "masterlist", fieldListForDOT[i], queryListForDOT[i]]
             )
-        for i in range(len(queryListForTransmittalSummary)):
             cursor.execute(
                 "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
-                ["Transmittal Summary", "masterlist", fieldListForTransmittalSummary[i], queryListForTransmittalSummary[i]]
             )
         connection.commit()
     total_questions = cursor.execute("SELECT COUNT(document_type) FROM questions").fetchone()[0]
     update_log(f"Total questions in DB: {total_questions}")
-def create_field_and_question_list_for_DOT():
-    queryList = [
-        "what is the Loan Number?",
-        "Who is the Borrower?",
-        "what is the Case Number?",
-        "what is the Mortgage Identification number?",
-        "DOT signed date?",
-        "Who is the Lender?",
-        "what is the VA/FHA Number?",
-        "Who is the Co-Borrower?",
-        "What is the property type - single family, multi family?",
-        "what is the Property Address?",
-        "In what County is the property located?",
-        "what is the Electronically recorded date"
-    ]
-    fieldList = [
-        "Loan Number",
-        "Borrower",
-        "Case Number",
-        "MIN Number",
-        "Signed Date",
-        "Lender",
-        "VA/FHA Number",
-        "Co-Borrower",
-        "Property Type",
-        "Property Address",
-        "Property County",
-        "Electronic Recording Date"
-    ]
-    return fieldList, queryList
-def create_field_and_question_list_for_Transmittal_Summary():
-    queryList = [
-        "Who is the Borrower?",
-        "what is the Property Address?",
-        "what is the Loan Term?",
-        "What is the Base Income?",
-        "what is the Borrower's SSN?",
-        "Who is the Co-Borrower?",
-        "What is the Original Loan Amount?",
-        "What is the Initial P&I payment?",
-        "What is the Co-Borrower's SSN?",
-        "Number of units?",
-        "Who is the Seller?",
-        "Document signed date?"
-    ]
-    fieldList = [
-        "Borrower",
-        "Property Address",
-        "Loan Term",
-        "Base Income",
-        "Borrower's SSN",
-        "Co-Borrower",
-        "Original Loan Amount",
-        "Initial P&I payment",
-        "Co-Borrower’s SSN",
-        "Units#",
-        "Seller",
-        "Signed Date"
-    ]
-    return fieldList, queryList
 def retrieve_document_type_and_questionsettag_from_sqlite():
     connection = create_db_connection()
     load_master_questionset_into_sqlite(connection)
     cursor = connection.cursor()
     rows = cursor.execute("SELECT document_type, questionset_tag FROM questions ORDER BY document_type, UPPER(questionset_tag)").fetchall()
-    list_for_dropdown = []
-    for i in rows:
-        concatenated_value = f"{i[0]}:{i[1]}"
-        if concatenated_value not in list_for_dropdown:
-            list_for_dropdown.append(concatenated_value)
-            update_log(f"Found question set: {concatenated_value}")
     connection.close()
-    return gr.Dropdown.update(choices=list_for_dropdown, value=list_for_dropdown[0])
 def retrieve_fields_and_questions(dropdownoption):
     splitwords = dropdownoption.split(":")
     connection = create_db_connection()
     cursor = connection.cursor()
-    fields_and_questions = cursor.execute(
         "SELECT document_type, field, question FROM questions WHERE document_type=? AND questionset_tag=?",
         (splitwords[0], splitwords[1],)
     ).fetchall()
     connection.close()
-    return pd.DataFrame(fields_and_questions, columns=["documentType", "field", "question"])
 def add_questionset(data, document_type, tag_for_questionset):
     connection = create_db_connection()
     create_sqlite_table(connection)
     cursor = connection.cursor()
-    for index, row in data.iterrows():
         cursor.execute(
             "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
             [document_type, tag_for_questionset, row['field'], row['question']]
@@ -249,12 +180,12 @@ def load_csv_and_store_questionset_into_sqlite(csv_file, document_type, tag_for_
     if tag_for_questionset and document_type:
         data = pd.read_csv(csv_file.name)
         add_questionset(data, document_type, tag_for_questionset)
-        responseString = f"Task Complete. Uploaded {data.shape[0]} fields and corresponding questions for {document_type}:{tag_for_questionset}"
-        update_log(responseString)
-        return responseString
     else:
-        return "Please select the Document Type and provide a name for the Question Set"
 def answer_predefined_questions(document_type_and_questionset):
     splitwords = document_type_and_questionset.split(":")
     document_type = splitwords[0]
@@ -264,92 +195,95 @@ def answer_predefined_questions(document_type_and_questionset):
     cursor = connection.cursor()
     rows = cursor.execute(
         "SELECT field, question FROM questions WHERE document_type=? AND questionset_tag=?",
-        (document_type, question_set,)
     ).fetchall()
     connection.close()
-    for entry in rows:
-        fields.append(entry[0])
-        questions.append(entry[1])
-        # Call pdf_qa.run only if pdf_qa is defined
         try:
-            responses.append(pdf_qa.run(entry[1]))
         except Exception as e:
-            error_str = f"Error in pdf_qa.run for question '{entry[1]}': {str(e)}"
-            update_log(error_str)
-            responses.append(error_str)
-    return pd.DataFrame({"Field": fields, "Question to gpt-4": questions, "Response from gpt-4": responses})
 def summarize_contents():
-    question = "Generate a short summary of the contents along with no more than 3 leading/example questions. Do not return the response in json format"
     try:
         response = pdf_qa.run(question)
         update_log("Summarization successful.")
         return response
     except Exception as e:
-        error_str = f"Error in summarization: {str(e)}"
-        update_log(error_str)
-        return error_str
 def answer_query(query):
     try:
         response = pdf_qa.run(query)
         update_log(f"Query answered: {query}")
         return response
     except Exception as e:
-        error_str = f"Error in answering query: {str(e)}"
-        update_log(error_str)
-        return error_str
 def get_log():
     return log_messages
-# Define CSS and title HTML
 css = """
-#col-container {max-width: 700px; margin-left: auto; margin-right: auto;}
 """
 title = """
-<div style="text-align: center; max-width: 700px;">
     <h1>AskMoli - Chatbot for PDFs</h1>
-    <p>Upload a .PDF and click "Upload PDF and generate embeddings". Wait for the status to show "Ready". Then either choose a pre-defined question set or ask your own question. The app uses GPT-4 with a custom prompt template.</p>
 </div>
 """
 # Build the Gradio interface
 with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
-    with gr.Column(elem_id="col-container"):
         gr.HTML(title)
     with gr.Tab("Chatbot"):
         with gr.Column():
-            open_ai_key = gr.Textbox(label="Your GPT-4 OpenAI API key", type="password")
             pdf_doc = gr.File(label="Load a PDF", file_types=['.pdf'], type='filepath')
-            relevant_pages = gr.Textbox(label="*Optional - Comma separated page numbers (leave blank for entire PDF)")
             with gr.Row():
                 status = gr.Textbox(label="Status", interactive=False)
-                load_pdf_btn = gr.Button("Upload PDF and generate embeddings")
             with gr.Row():
                 summary = gr.Textbox(label="Summary")
                 summarize_pdf_btn = gr.Button("Summarize Contents")
             with gr.Row():
-                input_query = gr.Textbox(label="Type your question")
                 output_answer = gr.Textbox(label="Answer")
-                submit_query_btn = gr.Button("Submit your question")
             with gr.Row():
                 questionsets = gr.Dropdown(label="Pre-defined Question Sets", choices=[])
-                load_questionsets_btn = gr.Button("Retrieve Question Sets")
-                fields_and_questions = gr.Dataframe(label="Fields & Questions in the chosen set")
-                load_fields_btn = gr.Button("Retrieve Questions for chosen set")
             with gr.Row():
-                answers_df = gr.Dataframe(label="Answers to Pre-defined Question Set")
-                answer_predefined_btn = gr.Button("Get answers for chosen question set")
-            # Log window for error and info messages
             log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
     with gr.Tab("OCR Converter"):
@@ -361,138 +295,17 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
     with gr.Tab("Upload Question Set"):
         with gr.Column():
-            document_types = [
-                "Mortgage 1040 US Individual Tax Returns 8453 Elec Form",
-                "Mortgage 1098",
-                "Mortgage 1099",
-                "Mortgage Abstract",
-                "Mortgage ACH Authorization Form",
-                "Mortgage Advance Fee Agreement",
-                "Mortgage Affidavit",
-                "Mortgage Affidavit of Suspense Funds",
-                "Mortgage Agreement Documents",
-                "Mortgage Sales Contract",
-                "Mortgage Loan Estimate",
-                "Mortgage Alimony Or Child Support",
-                "Mortgage Amended Proof Of Claim",
-                "Mortgage Amortization Schedule",
-                "Mortgage Flood Insurance",
-                "Mortgage Appraisal Report",
-                "Mortgage Appraisal Disclosure",
-                "Mortgage ARM Letter",
-                "Mortgage Arms Length Affidavit",
-                "Mortgage Assignment-Recorded",
-                "Mortgage Assignment-Unrecorded",
-                "Mortgage Assignment of Rent or Lease",
-                "Mortgage Automated Value Model",
-                "Mortgage Award Letters",
-                "Mortgage Bailee Letter",
-                "Mortgage Balloon Disclosure",
-                "Mortgage Bank Statement",
-                "Mortgage Bankruptcy Documents",
-                "Mortgage Bill of Sale",
-                "Mortgage Billing Statement",
-                "Mortgage Birth-Marriage-Death Certificate",
-                "Mortgage Borrower Certification Authorization",
-                "Mortgage Borrower Response Package",
-                "Mortgage Brokers Price Opinion",
-                "Mortgage Business Plan",
-                "Mortgage Buydown Agreement",
-                "Mortgage Bylaws Covenants Conditions Restrictions",
-                "Mortgage Cash for Keys",
-                "Mortgage Certificate of Redemption",
-                "Mortgage Certificate of Sale",
-                "Mortgage Certificate of Title",
-                "Mortgage Certification of Amount Due Payoff Reinstatement",
-                "Mortgage Checks-Regular or Cashiers",
-                "Mortgage Closing Disclosure",
-                "Mortgage Closing Protection Letter",
-                "Mortgage Closing Other",
-                "Mortgage Code Violations",
-                "Mortgage Request for Release",
-                "Mortgage Certificate of Liability Insurance",
-                "Mortgage Commitment Letter",
-                "Mortgage Complaint",
-                "Mortgage Complaint Answer Counter Claim",
-                "Mortgage Conditional Approval Letter",
-                "Mortgage Conditional Commitment",
-                "Mortgage Consent Order",
-                "Mortgage Consolidated Mortgage CEMA",
-                "Mortgage Conveyance Claims",
-                "Mortgage Correction and Revision Agreement",
-                "Mortgage Correspondence",
-                "Mortgage Court Order Settlement Divorce Decree",
-                "Mortgage Credit Report",
-                "Mortgage Customer Signature Authorization",
-                "Mortgage Debt Validation",
-                "Mortgage Deed",
-                "Mortgage Default Notices",
-                "Mortgage Direct Debit Authorization Form",
-                "Mortgage Disclosure Documents",
-                "Mortgage Document Checklist",
-                "Mortgage Document Correction and Fee Due Agreement",
-                "Mortgage Dodd Frank Certification",
-                "Mortgage Drivers License",
-                "Mortgage Request for VOE",
-                "Mortgage Environmental Indemnity Agreement",
-                "Mortgage Equal Credit Opportunity Act Notice",
-                "Mortgage Escrow Agreement",
-                "Mortgage Escrow Analysis Trial Balance Worksheet",
-                "Mortgage Instructions to Escrow Agent",
-                "Mortgage Escrow Letters",
-                "Mortgage Executed Deeds",
-                "Mortgage Fair Lending Notice",
-                "Mortgage Foreclosure Complaint",
-                "Mortgage Foreclosure Judgement",
-                "Mortgage Foreclosure Sale",
-                "Mortgage FHA Neighborhood Watch",
-                "Mortgage Truth-In-Lending Disclosure Statement",
-                "Mortgage Financial Form",
-                "Mortgage Financing Agreement",
-                "Mortgage First Payment Letter",
-                "Mortgage Forced Place Insurance Documents",
-                "Mortgage Foreclosure Documents",
-                "Mortgage Good Faith Estimate",
-                "Mortgage Guaranty",
-                "Mortgage HAMP Certifications",
-                "Mortgage HOA-Condo Covenants and Dues",
-                "Mortgage Exemption Hold Harmless Letter",
-                "Mortgage Home Equity Signature Verification Card",
-                "Mortgage Home Inspection",
-                "Mortgage Property Liability Insurance",
-                "Mortgage Homeowners Insurance Notice",
-                "Mortgage HUD-1 Settlement Statement",
-                "Mortgage Income Other",
-                "Mortgage Indemnity Agreement",
-                "Mortgage Informed Consumer Choice Disclosure Notice",
-                "Mortgage Initial Escrow Account Disclosure Statement",
-                "Mortgage Invoices",
-                "Mortgage Land Lease or Land Trust",
-                "Mortgage Land Title Adjustment",
-                "Mortgage Last Will and Testament",
-                "Mortgage Legal Description",
-                "Mortgage Letters Of Administration",
-                "Mortgage Letters of Testamentary",
-                "Mortgage Listing Agreement",
-                "Mortgage Litigation Guarantee",
-                "Mortgage DIL Closing",
-                "Mortgage Hardship Letter",
-                "Mortgage Hardship Affidavit",
-                "Mortgage Home Affordable Modification Agreement",
-                "Mortgage Profit And Loss",
-                "Mortgage Earnest Money Promissory Note",
-                "Mortgage Rental Agreement",
-                "Mortgage Repayment Plan",
-                "Mortgage Short Sale Miscellaneous"
-            ]
-            document_type_for_questionset = gr.Dropdown(choices=document_types, label="Select Document Type")
-            tag_for_questionset = gr.Textbox(label="Name for Question Set (e.g., rwikd-dot-basic-questionset-20230707)")
-            csv_file = gr.File(label="Load CSV (2 columns: field, question)", file_types=['.csv'], type='filepath')
             with gr.Row():
                 status_for_csv = gr.Textbox(label="Status", interactive=False)
                 load_csv_btn = gr.Button("Upload CSV into DB")
     # Set up button actions
     load_pdf_btn.click(load_pdf_and_generate_embeddings, inputs=[pdf_doc, open_ai_key, relevant_pages], outputs=status)
     summarize_pdf_btn.click(summarize_contents, outputs=summary)
@@ -504,10 +317,6 @@ with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
     convert_btn.click(ocr_converter, inputs=image_pdf, outputs=ocr_pdf)
     load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
-    # Button to refresh the log window
-    refresh_log_btn = gr.Button("Refresh Log")
-    refresh_log_btn.click(get_log, outputs=log_window)
 # Launch the Gradio app
 demo.launch(debug=True)

 from langchain.document_loaders import OnlinePDFLoader  # for loading the pdf
 from langchain.embeddings import HuggingFaceEmbeddings  # open source embedding model
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import Chroma  # for vectorization
+from langchain.chains import RetrievalQA  # for QA chain
+from langchain.chat_models import ChatOpenAI  # ChatGPT model
+from langchain_core.prompts import PromptTemplate  # prompt template import
 # Setup basic logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+log_messages = ""  # global log collector
 def update_log(message):
     global log_messages
 def ocr_converter(input_file):
     image_pdf = input_file.name
     try:
+        # Use force_ocr=True and output_type="pdf" to work around Ghostscript issues.
+        ocrmypdf.ocr(image_pdf, image_pdf, redo_ocr=True, force_ocr=True, language="eng", output_type="pdf")
         update_log(f"OCR conversion successful for {image_pdf}")
     except Exception as e:
         error_msg = f"OCR conversion failed for {image_pdf}. Error: {str(e)}"
     try:
         if open_ai_key is not None:
             os.environ['OPENAI_API_KEY'] = open_ai_key
         pdf_doc = ocr_converter(pdf_doc)
         loader = OnlinePDFLoader(pdf_doc)
         pages = loader.load_and_split()
         update_log(f"Loaded {len(pages)} pages from {pdf_doc}")
         embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
         pages_to_be_loaded = []
         if relevant_pages:
+            for page in relevant_pages.split(","):
+                if page.strip().isdigit():
+                    pageIndex = int(page.strip()) - 1
                     if 0 <= pageIndex < len(pages):
                         pages_to_be_loaded.append(pages[pageIndex])
         if not pages_to_be_loaded:
             pages_to_be_loaded = pages.copy()
             update_log("No specific pages selected; using entire PDF.")
         vectordb = Chroma.from_documents(pages_to_be_loaded, embedding=embeddings)
         prompt_template = (
+            """Use the following context to answer the question. If you do not know the answer, return N/A.
             {context}
             Question: {question}
+            Return the answer in JSON format."""
         )
         PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
         chain_type_kwargs = {"prompt": PROMPT}
 def load_master_questionset_into_sqlite(connection):
     create_sqlite_table(connection)
     cursor = connection.cursor()
+    masterlist_count = cursor.execute(
         "SELECT COUNT(document_type) FROM questions WHERE document_type=? AND questionset_tag=?",
+        ("DOC_A", "masterlist",)
     ).fetchone()[0]
+    if masterlist_count == 0:
+        update_log("Loading masterlist into DB.")
+        fields, queries = create_field_and_question_list_for_DOC_A()
+        for i in range(len(queries)):
             cursor.execute(
                 "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
+                ["DOC_A", "masterlist", fields[i], queries[i]]
             )
+        fields2, queries2 = create_field_and_question_list_for_DOC_B()
+        for i in range(len(queries2)):
             cursor.execute(
                 "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
+                ["DOC_B", "masterlist", fields2[i], queries2[i]]
             )
         connection.commit()
     total_questions = cursor.execute("SELECT COUNT(document_type) FROM questions").fetchone()[0]
     update_log(f"Total questions in DB: {total_questions}")
+def create_field_and_question_list_for_DOC_A():
+    # Only two sample entries
+    fields = ["Loan Number", "Borrower"]
+    queries = ["What is the Loan Number?", "Who is the Borrower?"]
+    return fields, queries
+def create_field_and_question_list_for_DOC_B():
+    # Only two sample entries
+    fields = ["Property Address", "Signed Date"]
+    queries = ["What is the Property Address?", "What is the Signed Date?"]
+    return fields, queries
 def retrieve_document_type_and_questionsettag_from_sqlite():
     connection = create_db_connection()
     load_master_questionset_into_sqlite(connection)
     cursor = connection.cursor()
     rows = cursor.execute("SELECT document_type, questionset_tag FROM questions ORDER BY document_type, UPPER(questionset_tag)").fetchall()
+    choices = []
+    for row in rows:
+        value = f"{row[0]}:{row[1]}"
+        if value not in choices:
+            choices.append(value)
+            update_log(f"Found question set: {value}")
     connection.close()
+    return gr.Dropdown.update(choices=choices, value=choices[0] if choices else "")
 def retrieve_fields_and_questions(dropdownoption):
     splitwords = dropdownoption.split(":")
     connection = create_db_connection()
     cursor = connection.cursor()
+    rows = cursor.execute(
         "SELECT document_type, field, question FROM questions WHERE document_type=? AND questionset_tag=?",
         (splitwords[0], splitwords[1],)
     ).fetchall()
     connection.close()
+    return pd.DataFrame(rows, columns=["documentType", "field", "question"])
 def add_questionset(data, document_type, tag_for_questionset):
     connection = create_db_connection()
     create_sqlite_table(connection)
     cursor = connection.cursor()
+    for _, row in data.iterrows():
         cursor.execute(
             "INSERT INTO questions(document_type, questionset_tag, field, question) VALUES(?,?,?,?)",
             [document_type, tag_for_questionset, row['field'], row['question']]
     if tag_for_questionset and document_type:
         data = pd.read_csv(csv_file.name)
         add_questionset(data, document_type, tag_for_questionset)
+        response = f"Uploaded {data.shape[0]} fields and questions for {document_type}:{tag_for_questionset}"
+        update_log(response)
+        return response
     else:
+        return "Please select a Document Type and provide a name for the Question Set"
 def answer_predefined_questions(document_type_and_questionset):
     splitwords = document_type_and_questionset.split(":")
     document_type = splitwords[0]
     cursor = connection.cursor()
     rows = cursor.execute(
         "SELECT field, question FROM questions WHERE document_type=? AND questionset_tag=?",
+        (document_type, question_set)
     ).fetchall()
     connection.close()
+    for field, question in rows:
+        fields.append(field)
+        questions.append(question)
         try:
+            responses.append(pdf_qa.run(question))
         except Exception as e:
+            err = f"Error: {str(e)}"
+            update_log(err)
+            responses.append(err)
+    return pd.DataFrame({"Field": fields, "Question": questions, "Response": responses})
 def summarize_contents():
+    question = "Generate a short summary of the contents along with up to 3 example questions."
+    if 'pdf_qa' not in globals():
+        return "Error: PDF embeddings not generated. Load a PDF first."
     try:
         response = pdf_qa.run(question)
         update_log("Summarization successful.")
         return response
     except Exception as e:
+        err = f"Error in summarization: {str(e)}"
+        update_log(err)
+        return err
 def answer_query(query):
+    if 'pdf_qa' not in globals():
+        return "Error: PDF embeddings not generated. Load a PDF first."
     try:
         response = pdf_qa.run(query)
         update_log(f"Query answered: {query}")
         return response
     except Exception as e:
+        err = f"Error in answering query: {str(e)}"
+        update_log(err)
+        return err
 def get_log():
     return log_messages
+# Define simple CSS and title HTML
 css = """
+#col-container {max-width: 700px; margin: auto;}
 """
 title = """
+<div style="text-align: center;">
     <h1>AskMoli - Chatbot for PDFs</h1>
+    <p>Upload a PDF and generate embeddings. Then ask questions or use a predefined set.</p>
 </div>
 """
 # Build the Gradio interface
 with gr.Blocks(css=css, theme=gr.themes.Monochrome()) as demo:
+    with gr.Column(id="col-container"):
         gr.HTML(title)
     with gr.Tab("Chatbot"):
         with gr.Column():
+            open_ai_key = gr.Textbox(label="Your GPT-4 API Key", type="password")
             pdf_doc = gr.File(label="Load a PDF", file_types=['.pdf'], type='filepath')
+            relevant_pages = gr.Textbox(label="Optional: Comma separated page numbers")
             with gr.Row():
                 status = gr.Textbox(label="Status", interactive=False)
+                load_pdf_btn = gr.Button("Upload PDF & Generate Embeddings")
             with gr.Row():
                 summary = gr.Textbox(label="Summary")
                 summarize_pdf_btn = gr.Button("Summarize Contents")
             with gr.Row():
+                input_query = gr.Textbox(label="Your Question")
                 output_answer = gr.Textbox(label="Answer")
+                submit_query_btn = gr.Button("Submit Question")
             with gr.Row():
                 questionsets = gr.Dropdown(label="Pre-defined Question Sets", choices=[])
+                load_questionsets_btn = gr.Button("Retrieve Sets")
+                fields_and_questions = gr.Dataframe(label="Fields & Questions")
+                load_fields_btn = gr.Button("Retrieve Questions")
             with gr.Row():
+                answers_df = gr.Dataframe(label="Pre-defined Answers")
+                answer_predefined_btn = gr.Button("Get Answers")
+            # Log window to display errors and info
             log_window = gr.Textbox(label="Log Window", interactive=False, lines=10)
     with gr.Tab("OCR Converter"):
     with gr.Tab("Upload Question Set"):
         with gr.Column():
+            # Now only two document types are available
+            document_type_for_questionset = gr.Dropdown(choices=["DOC_A", "DOC_B"], label="Select Document Type")
+            tag_for_questionset = gr.Textbox(label="Name for Question Set (e.g., basic-set)")
+            csv_file = gr.File(label="Load CSV (fields,question)", file_types=['.csv'], type='filepath')
             with gr.Row():
                 status_for_csv = gr.Textbox(label="Status", interactive=False)
                 load_csv_btn = gr.Button("Upload CSV into DB")
+    refresh_log_btn = gr.Button("Refresh Log")
+    refresh_log_btn.click(get_log, outputs=log_window)
     # Set up button actions
     load_pdf_btn.click(load_pdf_and_generate_embeddings, inputs=[pdf_doc, open_ai_key, relevant_pages], outputs=status)
     summarize_pdf_btn.click(summarize_contents, outputs=summary)
     convert_btn.click(ocr_converter, inputs=image_pdf, outputs=ocr_pdf)
     load_csv_btn.click(load_csv_and_store_questionset_into_sqlite, inputs=[csv_file, document_type_for_questionset, tag_for_questionset], outputs=status_for_csv)
 # Launch the Gradio app
 demo.launch(debug=True)