seanpedrickcase committed on
Commit
0c818aa
·
1 Parent(s): a73f005

Returned to bge-base embeddings for now. Improved UI a little

Browse files
Files changed (3) hide show
  1. app.py +29 -29
  2. chatfuncs/ingest.py +27 -32
  3. chatfuncs/ingest_borough_plan.py +1 -1
app.py CHANGED
@@ -45,7 +45,7 @@ import chatfuncs.ingest as ing
45
  # Load preset embeddings, vectorstore, and model
46
  ###
47
 
48
- embeddings_name = "mixedbread-ai/mxbai-embed-xsmall-v1" #"BAAI/bge-base-en-v1.5"
49
 
50
  def load_embeddings(embeddings_name = embeddings_name):
51
 
@@ -185,7 +185,7 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
185
  # RUN UI
186
  ###
187
 
188
- app = gr.Blocks(theme = gr.themes.Base())#css=".gradio-container {background-color: black}")
189
 
190
  with app:
191
  ingest_text = gr.State()
@@ -243,9 +243,9 @@ with app:
243
  lines=1,
244
  )
245
  with gr.Row():
246
- submit = gr.Button(value="Send message", variant="secondary", scale = 1)
247
- clear = gr.Button(value="Clear chat", variant="secondary", scale=0)
248
- stop = gr.Button(value="Stop generating", variant="secondary", scale=0)
249
 
250
  examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
251
  #value = "What were the five pillars of the previous borough plan?",
@@ -296,45 +296,45 @@ with app:
296
  examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
297
 
298
  change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
299
- then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
300
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
301
- then(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
302
- then(lambda: None, None, chatbot, queue=False)
303
 
304
  # Load in a pdf
305
  load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
306
- then(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
307
- then(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
308
- then(chatf.hide_block, outputs = [examples_set])
309
 
310
  # Load in a webpage
311
  load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
312
- then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
313
- then(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
314
- then(chatf.hide_block, outputs = [examples_set])
315
 
316
  # Load in a csv/excel file
317
  load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
318
- then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
319
- then(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
320
- then(chatf.hide_block, outputs = [examples_set])
321
 
322
  # Load in a webpage
323
 
324
  # Click/enter to send message action
325
  response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False, api_name="retrieval").\
326
- then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
327
- then(chatf.produce_streaming_answer_chatbot, inputs=[chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], outputs=chatbot)
328
- response_click.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
329
- then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
330
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False)
331
 
332
  response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False).\
333
- then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
334
- then(chatf.produce_streaming_answer_chatbot, [chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], chatbot)
335
- response_enter.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
336
- then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
337
- then(lambda: chatf.restore_interactivity(), None, [message], queue=False)
338
 
339
  # Stop box
340
  stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
@@ -356,7 +356,7 @@ with app:
356
  access_callback.setup([session_hash_textbox], access_logs_data_folder)
357
 
358
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
359
- then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
360
 
361
  # Launch the Gradio app
362
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 
45
  # Load preset embeddings, vectorstore, and model
46
  ###
47
 
48
+ embeddings_name = "BAAI/bge-base-en-v1.5" #"mixedbread-ai/mxbai-embed-xsmall-v1"
49
 
50
  def load_embeddings(embeddings_name = embeddings_name):
51
 
 
185
  # RUN UI
186
  ###
187
 
188
+ app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)#css=".gradio-container {background-color: black}")
189
 
190
  with app:
191
  ingest_text = gr.State()
 
243
  lines=1,
244
  )
245
  with gr.Row():
246
+ submit = gr.Button(value="Send message", variant="primary", scale = 4)
247
+ clear = gr.Button(value="Clear chat", variant="secondary", scale=1)
248
+ stop = gr.Button(value="Stop generating", variant="secondary", scale=1)
249
 
250
  examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
251
  #value = "What were the five pillars of the previous borough plan?",
 
296
  examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
297
 
298
  change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
299
+ success(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
300
+ success(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
301
+ success(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
302
+ success(lambda: None, None, chatbot, queue=False)
303
 
304
  # Load in a pdf
305
  load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
306
+ success(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
307
+ success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
308
+ success(chatf.hide_block, outputs = [examples_set])
309
 
310
  # Load in a webpage
311
  load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
312
+ success(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
313
+ success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
314
+ success(chatf.hide_block, outputs = [examples_set])
315
 
316
  # Load in a csv/excel file
317
  load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
318
+ success(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
319
+ success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
320
+ success(chatf.hide_block, outputs = [examples_set])
321
 
322
  # Load in a webpage
323
 
324
  # Click/enter to send message action
325
  response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False, api_name="retrieval").\
326
+ success(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
327
+ success(chatf.produce_streaming_answer_chatbot, inputs=[chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], outputs=chatbot)
328
+ response_click.success(chatf.highlight_found_text, [chatbot, sources], [sources]).\
329
+ success(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
330
+ success(lambda: chatf.restore_interactivity(), None, [message], queue=False)
331
 
332
  response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False).\
333
+ success(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
334
+ success(chatf.produce_streaming_answer_chatbot, [chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], chatbot)
335
+ response_enter.success(chatf.highlight_found_text, [chatbot, sources], [sources]).\
336
+ success(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
337
+ success(lambda: chatf.restore_interactivity(), None, [message], queue=False)
338
 
339
  # Stop box
340
  stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
 
356
  access_callback.setup([session_hash_textbox], access_logs_data_folder)
357
 
358
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
359
+ success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
360
 
361
  # Launch the Gradio app
362
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
chatfuncs/ingest.py CHANGED
@@ -7,6 +7,7 @@ import requests
7
  import pandas as pd
8
  import dateutil.parser
9
  from typing import Type, List
 
10
 
11
  from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
12
  from langchain_community.vectorstores.faiss import FAISS
@@ -573,56 +574,50 @@ def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
573
 
574
  return embeddings_func
575
 
576
- def embed_faiss_save_to_zip(docs_out, save_to="output", model_name = "BAAI/bge-base-en-v1.5"):
577
-
578
  load_embeddings(model_name=model_name)
579
 
580
- #embeddings_fast = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
581
-
582
  print(f"> Total split documents: {len(docs_out)}")
583
 
584
  vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
585
 
586
- if not Path(save_to).exists():
587
- os.mkdir(save_to)
588
 
589
- if Path(save_to).exists():
590
- vectorstore.save_local(folder_path=save_to)
591
 
592
- print("> DONE")
593
  print(f"> Saved to: {save_to}")
594
 
595
- ### Save as zip, then remove faiss/pkl files to allow for upload to huggingface
 
 
596
 
597
- import shutil
 
598
 
599
- shutil.make_archive(save_to, 'zip', save_to)
 
 
 
 
600
 
601
- os.remove(save_to + "/index.faiss")
602
- os.remove(save_to + "/index.pkl")
603
 
604
- save_zip_out = save_to + "/" + save_to + '.zip'
 
 
605
 
606
- shutil.move(save_to + '.zip', save_zip_out)
 
607
 
608
- out_message = "Document processing complete"
 
 
 
609
 
610
- return out_message, vectorstore, save_zip_out
611
 
612
- def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
613
- print(f"> Total split documents: {len(docs_out)}")
614
-
615
- vectordb = Chroma.from_documents(documents=docs_out,
616
- embedding=embeddings,
617
- persist_directory=save_to)
618
-
619
- # persiste the db to disk
620
- vectordb.persist()
621
-
622
- print("> DONE")
623
- print(f"> Saved to: {save_to}")
624
-
625
- return vectordb
626
 
627
  def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
628
 
 
7
  import pandas as pd
8
  import dateutil.parser
9
  from typing import Type, List
10
+ import shutil
11
 
12
  from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
13
  from langchain_community.vectorstores.faiss import FAISS
 
574
 
575
  return embeddings_func
576
 
577
+ def embed_faiss_save_to_zip(docs_out, save_to="output", model_name="BAAI/bge-base-en-v1.5"):
 
578
  load_embeddings(model_name=model_name)
579
 
 
 
580
  print(f"> Total split documents: {len(docs_out)}")
581
 
582
  vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
583
 
584
+ save_to_path = Path(save_to)
585
+ save_to_path.mkdir(parents=True, exist_ok=True)
586
 
587
+ vectorstore.save_local(folder_path=str(save_to_path))
 
588
 
589
+ print("> FAISS index saved")
590
  print(f"> Saved to: {save_to}")
591
 
592
+ # Ensure files are written before archiving
593
+ index_faiss = save_to_path / "index.faiss"
594
+ index_pkl = save_to_path / "index.pkl"
595
 
596
+ if not index_faiss.exists() or not index_pkl.exists():
597
+ raise FileNotFoundError("Expected FAISS index files not found before zipping.")
598
 
599
+ # Flush file system writes by forcing a sync (works best on Unix)
600
+ try:
601
+ os.sync()
602
+ except AttributeError:
603
+ pass # os.sync() not available on Windows
604
 
605
+ # Create ZIP archive
606
+ final_zip_path = shutil.make_archive(str(save_to_path), 'zip', root_dir=str(save_to_path))
607
 
608
+ # Remove individual index files to avoid leaking large raw files
609
+ index_faiss.unlink(missing_ok=True)
610
+ index_pkl.unlink(missing_ok=True)
611
 
612
+ # Move ZIP inside the folder for easier reference
613
+ #final_zip_path = save_to_path.with_suffix('.zip')
614
 
615
+ print("> Archive complete")
616
+ print(f"> Final ZIP path: {final_zip_path}")
617
+
618
+ return "Document processing complete", vectorstore, final_zip_path
619
 
 
620
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
 
622
  def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
623
 
chatfuncs/ingest_borough_plan.py CHANGED
@@ -8,7 +8,7 @@ print(borough_plan_text)
8
  borough_plan_docs = ing.text_to_docs(borough_plan_text)
9
  print("Borough plan docs created")
10
 
11
- embedding_model = "BAAI/bge-base-en-v1.5"
12
 
13
  embeddings = ing.load_embeddings(model_name = embedding_model)
14
  ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
 
8
  borough_plan_docs = ing.text_to_docs(borough_plan_text)
9
  print("Borough plan docs created")
10
 
11
+ embedding_model = "BAAI/bge-base-en-v1.5" # "mixedbread-ai/mxbai-embed-xsmall-v1" #
12
 
13
  embeddings = ing.load_embeddings(model_name = embedding_model)
14
  ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)