seanpedrickcase committed on
Commit
0c818aa
·
1 Parent(s): a73f005

Returned to bge-base embeddings for now. Improved UI a little

Browse files
Files changed (3) hide show
  1. app.py +29 -29
  2. chatfuncs/ingest.py +27 -32
  3. chatfuncs/ingest_borough_plan.py +1 -1
app.py CHANGED
@@ -45,7 +45,7 @@ import chatfuncs.ingest as ing
45
  # Load preset embeddings, vectorstore, and model
46
  ###
47
 
48
- embeddings_name = "mixedbread-ai/mxbai-embed-xsmall-v1" #"BAAI/bge-base-en-v1.5"
49
 
50
  def load_embeddings(embeddings_name = embeddings_name):
51
 
@@ -185,7 +185,7 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
185
  # RUN UI
186
  ###
187
 
188
- app = gr.Blocks(theme = gr.themes.Base())#css=".gradio-container {background-color: black}")
189
 
190
  with app:
191
  ingest_text = gr.State()
@@ -243,9 +243,9 @@ with app:
243
  lines=1,
244
  )
245
  with gr.Row():
246
- submit = gr.Button(value="Send message", variant="secondary", scale = 1)
247
- clear = gr.Button(value="Clear chat", variant="secondary", scale=0)
248
- stop = gr.Button(value="Stop generating", variant="secondary", scale=0)
249
 
250
  examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
251
  #value = "What were the five pillars of the previous borough plan?",
@@ -296,45 +296,45 @@ with app:
296
  examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
297
 
298
  change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
299
- then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
300
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
301
- then(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
302
- then(lambda: None, None, chatbot, queue=False)
303
 
304
  # Load in a pdf
305
  load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
306
- then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
307
- then(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
308
- then(chatf.hide_block, outputs = [examples_set])
309
 
310
  # Load in a webpage
311
  load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
312
- then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
313
- then(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
314
- then(chatf.hide_block, outputs = [examples_set])
315
 
316
  # Load in a csv/excel file
317
  load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
318
- then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
319
- then(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
320
- then(chatf.hide_block, outputs = [examples_set])
321
 
322
  # Load in a webpage
323
 
324
  # Click/enter to send message action
325
  response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False, api_name="retrieval").\
326
- then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
327
- then(chatf.produce_streaming_answer_chatbot, inputs=[chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], outputs=chatbot)
328
- response_click.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
329
- then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
330
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False)
331
 
332
  response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False).\
333
- then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
334
- then(chatf.produce_streaming_answer_chatbot, [chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], chatbot)
335
- response_enter.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
336
- then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
337
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False)
338
 
339
  # Stop box
340
  stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
@@ -356,7 +356,7 @@ with app:
356
  access_callback.setup([session_hash_textbox], access_logs_data_folder)
357
 
358
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
359
- then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
360
 
361
  # Launch the Gradio app
362
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 
45
  # Load preset embeddings, vectorstore, and model
46
  ###
47
 
48
+ embeddings_name = "BAAI/bge-base-en-v1.5" #"mixedbread-ai/mxbai-embed-xsmall-v1"
49
 
50
  def load_embeddings(embeddings_name = embeddings_name):
51
 
 
185
  # RUN UI
186
  ###
187
 
188
+ app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)#css=".gradio-container {background-color: black}")
189
 
190
  with app:
191
  ingest_text = gr.State()
 
243
  lines=1,
244
  )
245
  with gr.Row():
246
+ submit = gr.Button(value="Send message", variant="primary", scale = 4)
247
+ clear = gr.Button(value="Clear chat", variant="secondary", scale=1)
248
+ stop = gr.Button(value="Stop generating", variant="secondary", scale=1)
249
 
250
  examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
251
  #value = "What were the five pillars of the previous borough plan?",
 
296
  examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
297
 
298
  change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
299
+ success(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
300
+ success(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
301
+ success(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
302
+ success(lambda: None, None, chatbot, queue=False)
303
 
304
  # Load in a pdf
305
  load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
306
+ success(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
307
+ success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
308
+ success(chatf.hide_block, outputs = [examples_set])
309
 
310
  # Load in a webpage
311
  load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
312
+ success(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
313
+ success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
314
+ success(chatf.hide_block, outputs = [examples_set])
315
 
316
  # Load in a csv/excel file
317
  load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
318
+ success(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
319
+ success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
320
+ success(chatf.hide_block, outputs = [examples_set])
321
 
322
  # Load in a webpage
323
 
324
  # Click/enter to send message action
325
  response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False, api_name="retrieval").\
326
+ success(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
327
+ success(chatf.produce_streaming_answer_chatbot, inputs=[chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], outputs=chatbot)
328
+ response_click.success(chatf.highlight_found_text, [chatbot, sources], [sources]).\
329
+ success(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
330
+ success(lambda: chatf.restore_interactivity(), None, [message], queue=False)
331
 
332
  response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False).\
333
+ success(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
334
+ success(chatf.produce_streaming_answer_chatbot, [chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], chatbot)
335
+ response_enter.success(chatf.highlight_found_text, [chatbot, sources], [sources]).\
336
+ success(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
337
+ success(lambda: chatf.restore_interactivity(), None, [message], queue=False)
338
 
339
  # Stop box
340
  stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
 
356
  access_callback.setup([session_hash_textbox], access_logs_data_folder)
357
 
358
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
359
+ success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
360
 
361
  # Launch the Gradio app
362
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
chatfuncs/ingest.py CHANGED
@@ -7,6 +7,7 @@ import requests
7
  import pandas as pd
8
  import dateutil.parser
9
  from typing import Type, List
 
10
 
11
  from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
12
  from langchain_community.vectorstores.faiss import FAISS
@@ -573,56 +574,50 @@ def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
573
 
574
  return embeddings_func
575
 
576
- def embed_faiss_save_to_zip(docs_out, save_to="output", model_name = "BAAI/bge-base-en-v1.5"):
577
-
578
  load_embeddings(model_name=model_name)
579
 
580
- #embeddings_fast = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
581
-
582
  print(f"> Total split documents: {len(docs_out)}")
583
 
584
  vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
585
 
586
- if not Path(save_to).exists():
587
- os.mkdir(save_to)
588
 
589
- if Path(save_to).exists():
590
- vectorstore.save_local(folder_path=save_to)
591
 
592
- print("> DONE")
593
  print(f"> Saved to: {save_to}")
594
 
595
- ### Save as zip, then remove faiss/pkl files to allow for upload to huggingface
 
 
596
 
597
- import shutil
 
598
 
599
- shutil.make_archive(save_to, 'zip', save_to)
 
 
 
 
600
 
601
- os.remove(save_to + "/index.faiss")
602
- os.remove(save_to + "/index.pkl")
603
 
604
- save_zip_out = save_to + "/" + save_to + '.zip'
 
 
605
 
606
- shutil.move(save_to + '.zip', save_zip_out)
 
607
 
608
- out_message = "Document processing complete"
 
 
 
609
 
610
- return out_message, vectorstore, save_zip_out
611
 
612
- def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
613
- print(f"> Total split documents: {len(docs_out)}")
614
-
615
- vectordb = Chroma.from_documents(documents=docs_out,
616
- embedding=embeddings,
617
- persist_directory=save_to)
618
-
619
- # persiste the db to disk
620
- vectordb.persist()
621
-
622
- print("> DONE")
623
- print(f"> Saved to: {save_to}")
624
-
625
- return vectordb
626
 
627
  def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
628
 
 
7
  import pandas as pd
8
  import dateutil.parser
9
  from typing import Type, List
10
+ import shutil
11
 
12
  from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
13
  from langchain_community.vectorstores.faiss import FAISS
 
574
 
575
  return embeddings_func
576
 
577
+ def embed_faiss_save_to_zip(docs_out, save_to="output", model_name="BAAI/bge-base-en-v1.5"):
 
578
  load_embeddings(model_name=model_name)
579
 
 
 
580
  print(f"> Total split documents: {len(docs_out)}")
581
 
582
  vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
583
 
584
+ save_to_path = Path(save_to)
585
+ save_to_path.mkdir(parents=True, exist_ok=True)
586
 
587
+ vectorstore.save_local(folder_path=str(save_to_path))
 
588
 
589
+ print("> FAISS index saved")
590
  print(f"> Saved to: {save_to}")
591
 
592
+ # Ensure files are written before archiving
593
+ index_faiss = save_to_path / "index.faiss"
594
+ index_pkl = save_to_path / "index.pkl"
595
 
596
+ if not index_faiss.exists() or not index_pkl.exists():
597
+ raise FileNotFoundError("Expected FAISS index files not found before zipping.")
598
 
599
+ # Flush file system writes by forcing a sync (works best on Unix)
600
+ try:
601
+ os.sync()
602
+ except AttributeError:
603
+ pass # os.sync() not available on Windows
604
 
605
+ # Create ZIP archive
606
+ final_zip_path = shutil.make_archive(str(save_to_path), 'zip', root_dir=str(save_to_path))
607
 
608
+ # Remove individual index files to avoid leaking large raw files
609
+ index_faiss.unlink(missing_ok=True)
610
+ index_pkl.unlink(missing_ok=True)
611
 
612
+ # Move ZIP inside the folder for easier reference
613
+ #final_zip_path = save_to_path.with_suffix('.zip')
614
 
615
+ print("> Archive complete")
616
+ print(f"> Final ZIP path: {final_zip_path}")
617
+
618
+ return "Document processing complete", vectorstore, final_zip_path
619
 
 
620
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
 
622
  def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
623
 
chatfuncs/ingest_borough_plan.py CHANGED
@@ -8,7 +8,7 @@ print(borough_plan_text)
8
  borough_plan_docs = ing.text_to_docs(borough_plan_text)
9
  print("Borough plan docs created")
10
 
11
- embedding_model = "BAAI/bge-base-en-v1.5"
12
 
13
  embeddings = ing.load_embeddings(model_name = embedding_model)
14
  ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
 
8
  borough_plan_docs = ing.text_to_docs(borough_plan_text)
9
  print("Borough plan docs created")
10
 
11
+ embedding_model = "BAAI/bge-base-en-v1.5" # "mixedbread-ai/mxbai-embed-xsmall-v1" #
12
 
13
  embeddings = ing.load_embeddings(model_name = embedding_model)
14
  ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)